001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.regionserver; 019 020import static org.junit.Assert.assertTrue; 021 022import java.io.IOException; 023import java.util.NavigableMap; 024import java.util.TreeMap; 025import java.util.concurrent.CountDownLatch; 026import java.util.concurrent.TimeUnit; 027import org.apache.hadoop.conf.Configuration; 028import org.apache.hadoop.fs.FileSystem; 029import org.apache.hadoop.fs.Path; 030import org.apache.hadoop.hbase.CellScanner; 031import org.apache.hadoop.hbase.HBaseClassTestRule; 032import org.apache.hadoop.hbase.HBaseTestingUtil; 033import org.apache.hadoop.hbase.HConstants; 034import org.apache.hadoop.hbase.TableName; 035import org.apache.hadoop.hbase.client.Durability; 036import org.apache.hadoop.hbase.client.Put; 037import org.apache.hadoop.hbase.regionserver.wal.FSHLog; 038import org.apache.hadoop.hbase.testclassification.MediumTests; 039import org.apache.hadoop.hbase.testclassification.RegionServerTests; 040import org.apache.hadoop.hbase.util.Bytes; 041import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; 042import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper; 043import org.apache.hadoop.hbase.util.Threads; 044import org.apache.hadoop.hbase.wal.WAL; 045import org.apache.hadoop.hbase.wal.WALEdit; 046import org.apache.hadoop.hbase.wal.WALKeyImpl; 047import org.apache.hadoop.hbase.wal.WALProvider.Writer; 048import org.junit.After; 049import org.junit.Assert; 050import org.junit.Before; 051import org.junit.ClassRule; 052import org.junit.Rule; 053import org.junit.Test; 054import org.junit.experimental.categories.Category; 055import org.junit.rules.TestName; 056import org.mockito.Mockito; 057import org.slf4j.Logger; 058import org.slf4j.LoggerFactory; 059 060import org.apache.hbase.thirdparty.com.google.common.io.Closeables; 061 062/** 063 * Testing for lock up of FSHLog. 064 */ 065@Category({ RegionServerTests.class, MediumTests.class }) 066public class TestWALLockup { 067 068 @ClassRule 069 public static final HBaseClassTestRule CLASS_RULE = 070 HBaseClassTestRule.forClass(TestWALLockup.class); 071 072 private static final Logger LOG = LoggerFactory.getLogger(TestWALLockup.class); 073 074 @Rule 075 public TestName name = new TestName(); 076 077 private static final String COLUMN_FAMILY = "MyCF"; 078 private static final byte[] COLUMN_FAMILY_BYTES = Bytes.toBytes(COLUMN_FAMILY); 079 080 private static HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil(); 081 private static Configuration CONF; 082 private String dir; 083 084 // Test names 085 protected TableName tableName; 086 087 @Before 088 public void setup() throws IOException { 089 CONF = TEST_UTIL.getConfiguration(); 090 // Disable block cache. 091 CONF.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0f); 092 dir = TEST_UTIL.getDataTestDir("TestHRegion").toString(); 093 tableName = TableName.valueOf(name.getMethodName()); 094 } 095 096 @After 097 public void tearDown() throws Exception { 098 EnvironmentEdgeManagerTestHelper.reset(); 099 LOG.info("Cleaning test directory: " + TEST_UTIL.getDataTestDir()); 100 TEST_UTIL.cleanupTestDir(); 101 } 102 103 private String getName() { 104 return name.getMethodName(); 105 } 106 107 // A WAL that we can have throw exceptions when a flag is set. 108 private static final class DodgyFSLog extends FSHLog { 109 // Set this when want the WAL to start throwing exceptions. 110 volatile boolean throwException = false; 111 112 // Latch to hold up processing until after another operation has had time to run. 113 CountDownLatch latch = new CountDownLatch(1); 114 115 public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf) 116 throws IOException { 117 super(fs, root, logDir, conf); 118 } 119 120 @Override 121 protected void afterCreatingZigZagLatch() { 122 // If throwException set, then append will throw an exception causing the WAL to be 123 // rolled. We'll come in here. Hold up processing until a sync can get in before 124 // the zigzag has time to complete its setup and get its own sync in. This is what causes 125 // the lock up we've seen in production. 126 if (throwException) { 127 try { 128 LOG.info("LATCHED"); 129 // So, timing can have it that the test can run and the bad flush below happens 130 // before we get here. In this case, we'll be stuck waiting on this latch but there 131 // is nothing in the WAL pipeline to get us to the below beforeWaitOnSafePoint... 132 // because all WALs have rolled. In this case, just give up on test. 133 if (!this.latch.await(5, TimeUnit.SECONDS)) { 134 LOG.warn("GIVE UP! Failed waiting on latch...Test is ABORTED!"); 135 } 136 } catch (InterruptedException e) { 137 } 138 } 139 } 140 141 @Override 142 protected void beforeWaitOnSafePoint() { 143 if (throwException) { 144 LOG.info("COUNTDOWN"); 145 // Don't countdown latch until someone waiting on it otherwise, the above 146 // afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll 147 // be stuck; test won't go down 148 while (this.latch.getCount() <= 0) 149 Threads.sleep(1); 150 this.latch.countDown(); 151 } 152 } 153 154 @Override 155 protected Writer createWriterInstance(Path path) throws IOException { 156 final Writer w = super.createWriterInstance(path); 157 return new Writer() { 158 @Override 159 public void close() throws IOException { 160 w.close(); 161 } 162 163 @Override 164 public void sync(boolean forceSync) throws IOException { 165 if (throwException) { 166 throw new IOException("FAKE! Failed to replace a bad datanode...SYNC"); 167 } 168 w.sync(forceSync); 169 } 170 171 @Override 172 public void append(Entry entry) throws IOException { 173 if (throwException) { 174 throw new IOException("FAKE! Failed to replace a bad datanode...APPEND"); 175 } 176 w.append(entry); 177 } 178 179 @Override 180 public long getLength() { 181 return w.getLength(); 182 } 183 184 @Override 185 public long getSyncedLength() { 186 return w.getSyncedLength(); 187 } 188 }; 189 } 190 } 191 192 /** 193 * Reproduce locking up that happens when we get an inopportune sync during setup for zigzaglatch 194 * wait. See HBASE-14317. If below is broken, we will see this test timeout because it is locked 195 * up. 196 * <p> 197 * First I need to set up some mocks for Server and RegionServerServices. I also need to set up a 198 * dodgy WAL that will throw an exception when we go to append to it. 199 */ 200 @Test 201 public void testLockupWhenSyncInMiddleOfZigZagSetup() throws IOException { 202 // Mocked up server and regionserver services. Needed below. 203 RegionServerServices services = Mockito.mock(RegionServerServices.class); 204 Mockito.when(services.getConfiguration()).thenReturn(CONF); 205 Mockito.when(services.isStopped()).thenReturn(false); 206 Mockito.when(services.isAborted()).thenReturn(false); 207 208 // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test. 209 FileSystem fs = FileSystem.get(CONF); 210 Path rootDir = new Path(dir + getName()); 211 DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF); 212 dodgyWAL.init(); 213 Path originalWAL = dodgyWAL.getCurrentFileName(); 214 // I need a log roller running. 215 LogRoller logRoller = new LogRoller(services); 216 logRoller.addWAL(dodgyWAL); 217 // There is no 'stop' once a logRoller is running.. it just dies. 218 logRoller.start(); 219 // Now get a region and start adding in edits. 220 final HRegion region = initHRegion(tableName, null, null, CONF, dodgyWAL); 221 byte[] bytes = Bytes.toBytes(getName()); 222 NavigableMap<byte[], Integer> scopes = new TreeMap<>(Bytes.BYTES_COMPARATOR); 223 scopes.put(COLUMN_FAMILY_BYTES, 0); 224 MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl(); 225 try { 226 // First get something into memstore. Make a Put and then pull the Cell out of it. Will 227 // manage append and sync carefully in below to manufacture hang. We keep adding same 228 // edit. WAL subsystem doesn't care. 229 Put put = new Put(bytes); 230 put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes); 231 WALKeyImpl key = new WALKeyImpl(region.getRegionInfo().getEncodedNameAsBytes(), 232 TableName.META_TABLE_NAME, EnvironmentEdgeManager.currentTime(), mvcc, scopes); 233 WALEdit edit = new WALEdit(); 234 CellScanner CellScanner = put.cellScanner(); 235 assertTrue(CellScanner.advance()); 236 edit.add(CellScanner.current()); 237 // Put something in memstore and out in the WAL. Do a big number of appends so we push 238 // out other side of the ringbuffer. If small numbers, stuff doesn't make it to WAL 239 for (int i = 0; i < 1000; i++) { 240 region.put(put); 241 } 242 // Set it so we start throwing exceptions. 243 LOG.info("SET throwing of exception on append"); 244 dodgyWAL.throwException = true; 245 // This append provokes a WAL roll request 246 dodgyWAL.appendData(region.getRegionInfo(), key, edit); 247 boolean exception = false; 248 try { 249 dodgyWAL.sync(false); 250 } catch (Exception e) { 251 exception = true; 252 } 253 assertTrue("Did not get sync exception", exception); 254 255 // Get a memstore flush going too so we have same hung profile as up in the issue over 256 // in HBASE-14317. Flush hangs trying to get sequenceid because the ringbuffer is held up 257 // by the zigzaglatch waiting on syncs to come home. 258 Thread t = new Thread("Flusher") { 259 @Override 260 public void run() { 261 try { 262 if (region.getMemStoreDataSize() <= 0) { 263 throw new IOException("memstore size=" + region.getMemStoreDataSize()); 264 } 265 region.flush(false); 266 } catch (IOException e) { 267 // Can fail trying to flush in middle of a roll. Not a failure. Will succeed later 268 // when roll completes. 269 LOG.info("In flush", e); 270 } 271 LOG.info("Exiting"); 272 } 273 }; 274 t.setDaemon(true); 275 t.start(); 276 // Wait until 277 while (dodgyWAL.latch.getCount() > 0) { 278 Threads.sleep(1); 279 } 280 // Now assert I got a new WAL file put in place even though loads of errors above. 281 assertTrue(originalWAL != dodgyWAL.getCurrentFileName()); 282 // Can I append to it? 283 dodgyWAL.throwException = false; 284 try { 285 region.put(put); 286 } catch (Exception e) { 287 LOG.info("In the put", e); 288 } 289 } finally { 290 // To stop logRoller, its server has to say it is stopped. 291 Mockito.when(services.isStopped()).thenReturn(true); 292 Closeables.close(logRoller, true); 293 try { 294 if (region != null) { 295 region.close(); 296 } 297 if (dodgyWAL != null) { 298 dodgyWAL.close(); 299 } 300 } catch (Exception e) { 301 LOG.info("On way out", e); 302 } 303 } 304 } 305 306 /** 307 * If below is broken, we will see this test timeout because RingBufferEventHandler was stuck in 308 * attainSafePoint. Everyone will wait for sync to finish forever. See HBASE-14317. 309 */ 310 @Test 311 public void testRingBufferEventHandlerStuckWhenSyncFailed() 312 throws IOException, InterruptedException { 313 314 // A WAL that we can have throw exceptions and slow FSHLog.replaceWriter down 315 class DodgyFSLog extends FSHLog { 316 317 private volatile boolean zigZagCreated = false; 318 319 public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf) 320 throws IOException { 321 super(fs, root, logDir, conf); 322 } 323 324 @Override 325 protected void afterCreatingZigZagLatch() { 326 zigZagCreated = true; 327 // Sleep a while to wait for RingBufferEventHandler to get stuck first. 328 try { 329 Thread.sleep(3000); 330 } catch (InterruptedException ignore) { 331 } 332 } 333 334 @Override 335 protected long getSequenceOnRingBuffer() { 336 return super.getSequenceOnRingBuffer(); 337 } 338 339 protected void publishSyncOnRingBufferAndBlock(long sequence) { 340 try { 341 super.blockOnSync(super.publishSyncOnRingBuffer(sequence, false)); 342 Assert.fail("Expect an IOException here."); 343 } catch (IOException ignore) { 344 // Here, we will get an IOException. 345 } 346 } 347 348 @Override 349 protected Writer createWriterInstance(Path path) throws IOException { 350 final Writer w = super.createWriterInstance(path); 351 return new Writer() { 352 @Override 353 public void close() throws IOException { 354 w.close(); 355 } 356 357 @Override 358 public void sync(boolean forceSync) throws IOException { 359 throw new IOException("FAKE! Failed to replace a bad datanode...SYNC"); 360 } 361 362 @Override 363 public void append(Entry entry) throws IOException { 364 w.append(entry); 365 } 366 367 @Override 368 public long getLength() { 369 return w.getLength(); 370 } 371 372 @Override 373 public long getSyncedLength() { 374 return w.getSyncedLength(); 375 } 376 }; 377 } 378 } 379 380 // Mocked up server and regionserver services. Needed below. 381 RegionServerServices services = Mockito.mock(RegionServerServices.class); 382 Mockito.when(services.getConfiguration()).thenReturn(CONF); 383 Mockito.when(services.isStopped()).thenReturn(false); 384 Mockito.when(services.isAborted()).thenReturn(false); 385 386 // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test. 387 FileSystem fs = FileSystem.get(CONF); 388 Path rootDir = new Path(dir + getName()); 389 final DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF); 390 // I need a log roller running. 391 LogRoller logRoller = new LogRoller(services); 392 logRoller.addWAL(dodgyWAL); 393 // There is no 'stop' once a logRoller is running.. it just dies. 394 logRoller.start(); 395 396 try { 397 final long seqForSync = dodgyWAL.getSequenceOnRingBuffer(); 398 399 // This call provokes a WAL roll, and we will get a new RingBufferEventHandler.ZigZagLatch 400 // in LogRoller. 401 // After creating ZigZagLatch, RingBufferEventHandler would get stuck due to sync event, 402 // as long as HBASE-14317 hasn't be fixed. 403 LOG.info("Trigger log roll for creating a ZigZagLatch."); 404 logRoller.requestRollAll(); 405 406 while (!dodgyWAL.zigZagCreated) { 407 Thread.sleep(10); 408 } 409 410 // Send a sync event for RingBufferEventHandler, 411 // and it gets blocked in RingBufferEventHandler.attainSafePoint 412 LOG.info("Send sync for RingBufferEventHandler"); 413 Thread syncThread = new Thread() { 414 @Override 415 public void run() { 416 dodgyWAL.publishSyncOnRingBufferAndBlock(seqForSync); 417 } 418 }; 419 // Sync in another thread to avoid reset SyncFuture again. 420 syncThread.start(); 421 syncThread.join(); 422 423 try { 424 LOG.info("Call sync for testing whether RingBufferEventHandler is hanging."); 425 dodgyWAL.sync(false); // Should not get a hang here, otherwise we will see timeout in this 426 // test. 427 Assert.fail("Expect an IOException here."); 428 } catch (IOException ignore) { 429 } 430 431 } finally { 432 // To stop logRoller, its server has to say it is stopped. 433 Mockito.when(services.isStopped()).thenReturn(true); 434 if (logRoller != null) { 435 logRoller.close(); 436 } 437 if (dodgyWAL != null) { 438 dodgyWAL.close(); 439 } 440 } 441 } 442 443 /** 444 * @return A region on which you must call {@link HBaseTestingUtil#closeRegionAndWAL(HRegion)} 445 * when done. 446 */ 447 private static HRegion initHRegion(TableName tableName, byte[] startKey, byte[] stopKey, 448 Configuration conf, WAL wal) throws IOException { 449 ChunkCreator.initialize(MemStoreLAB.CHUNK_SIZE_DEFAULT, false, 0, 0, 0, null, 450 MemStoreLAB.INDEX_CHUNK_SIZE_PERCENTAGE_DEFAULT); 451 return TEST_UTIL.createLocalHRegion(tableName, startKey, stopKey, conf, false, 452 Durability.SYNC_WAL, wal, COLUMN_FAMILY_BYTES); 453 } 454}