001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.regionserver; 019 020import static org.junit.Assert.assertTrue; 021 022import java.io.IOException; 023import java.util.NavigableMap; 024import java.util.TreeMap; 025import java.util.concurrent.CountDownLatch; 026import java.util.concurrent.TimeUnit; 027import org.apache.hadoop.conf.Configuration; 028import org.apache.hadoop.fs.FileSystem; 029import org.apache.hadoop.fs.Path; 030import org.apache.hadoop.hbase.CellScanner; 031import org.apache.hadoop.hbase.ChoreService; 032import org.apache.hadoop.hbase.CoordinatedStateManager; 033import org.apache.hadoop.hbase.HBaseClassTestRule; 034import org.apache.hadoop.hbase.HBaseTestingUtility; 035import org.apache.hadoop.hbase.HConstants; 036import org.apache.hadoop.hbase.Server; 037import org.apache.hadoop.hbase.ServerName; 038import org.apache.hadoop.hbase.TableName; 039import org.apache.hadoop.hbase.client.ClusterConnection; 040import org.apache.hadoop.hbase.client.Connection; 041import org.apache.hadoop.hbase.client.Durability; 042import org.apache.hadoop.hbase.client.Put; 043import org.apache.hadoop.hbase.regionserver.wal.DamagedWALException; 044import org.apache.hadoop.hbase.regionserver.wal.FSHLog; 045import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener; 046import org.apache.hadoop.hbase.testclassification.MediumTests; 047import org.apache.hadoop.hbase.testclassification.RegionServerTests; 048import org.apache.hadoop.hbase.util.Bytes; 049import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper; 050import org.apache.hadoop.hbase.util.Threads; 051import org.apache.hadoop.hbase.wal.WAL; 052import org.apache.hadoop.hbase.wal.WALEdit; 053import org.apache.hadoop.hbase.wal.WALKey; 054import org.apache.hadoop.hbase.wal.WALKeyImpl; 055import org.apache.hadoop.hbase.wal.WALProvider.Writer; 056import org.apache.hadoop.hbase.zookeeper.ZKWatcher; 057import org.junit.After; 058import org.junit.Assert; 059import org.junit.Before; 060import org.junit.ClassRule; 061import org.junit.Rule; 062import org.junit.Test; 063import org.junit.experimental.categories.Category; 064import org.junit.rules.TestName; 065import org.mockito.Mockito; 066import org.slf4j.Logger; 067import org.slf4j.LoggerFactory; 068 069import org.apache.hbase.thirdparty.com.google.common.io.Closeables; 070 071/** 072 * Testing for lock up of FSHLog. 073 */ 074@Category({ RegionServerTests.class, MediumTests.class }) 075public class TestWALLockup { 076 077 @ClassRule 078 public static final HBaseClassTestRule CLASS_RULE = 079 HBaseClassTestRule.forClass(TestWALLockup.class); 080 081 private static final Logger LOG = LoggerFactory.getLogger(TestWALLockup.class); 082 083 @Rule 084 public TestName name = new TestName(); 085 086 private static final String COLUMN_FAMILY = "MyCF"; 087 private static final byte [] COLUMN_FAMILY_BYTES = Bytes.toBytes(COLUMN_FAMILY); 088 089 HRegion region = null; 090 private static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); 091 private static Configuration CONF ; 092 private String dir; 093 094 // Test names 095 protected TableName tableName; 096 097 @Before 098 public void setup() throws IOException { 099 CONF = TEST_UTIL.getConfiguration(); 100 // Disable block cache. 101 CONF.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0f); 102 dir = TEST_UTIL.getDataTestDir("TestHRegion").toString(); 103 tableName = TableName.valueOf(name.getMethodName()); 104 } 105 106 @After 107 public void tearDown() throws Exception { 108 EnvironmentEdgeManagerTestHelper.reset(); 109 LOG.info("Cleaning test directory: " + TEST_UTIL.getDataTestDir()); 110 TEST_UTIL.cleanupTestDir(); 111 } 112 113 private String getName() { 114 return name.getMethodName(); 115 } 116 117 // A WAL that we can have throw exceptions when a flag is set. 118 private static final class DodgyFSLog extends FSHLog { 119 // Set this when want the WAL to start throwing exceptions. 120 volatile boolean throwException = false; 121 122 // Latch to hold up processing until after another operation has had time to run. 123 CountDownLatch latch = new CountDownLatch(1); 124 125 public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf) 126 throws IOException { 127 super(fs, root, logDir, conf); 128 } 129 130 @Override 131 protected void afterCreatingZigZagLatch() { 132 // If throwException set, then append will throw an exception causing the WAL to be 133 // rolled. We'll come in here. Hold up processing until a sync can get in before 134 // the zigzag has time to complete its setup and get its own sync in. This is what causes 135 // the lock up we've seen in production. 136 if (throwException) { 137 try { 138 LOG.info("LATCHED"); 139 // So, timing can have it that the test can run and the bad flush below happens 140 // before we get here. In this case, we'll be stuck waiting on this latch but there 141 // is nothing in the WAL pipeline to get us to the below beforeWaitOnSafePoint... 142 // because all WALs have rolled. In this case, just give up on test. 143 if (!this.latch.await(5, TimeUnit.SECONDS)) { 144 LOG.warn("GIVE UP! Failed waiting on latch...Test is ABORTED!"); 145 } 146 } catch (InterruptedException e) { 147 } 148 } 149 } 150 151 @Override 152 protected void beforeWaitOnSafePoint() { 153 if (throwException) { 154 LOG.info("COUNTDOWN"); 155 // Don't countdown latch until someone waiting on it otherwise, the above 156 // afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll 157 // be stuck; test won't go down 158 while (this.latch.getCount() <= 0) 159 Threads.sleep(1); 160 this.latch.countDown(); 161 } 162 } 163 164 @Override 165 protected Writer createWriterInstance(Path path) throws IOException { 166 final Writer w = super.createWriterInstance(path); 167 return new Writer() { 168 @Override 169 public void close() throws IOException { 170 w.close(); 171 } 172 173 @Override 174 public void sync(boolean forceSync) throws IOException { 175 if (throwException) { 176 throw new IOException("FAKE! Failed to replace a bad datanode...SYNC"); 177 } 178 w.sync(forceSync); 179 } 180 181 @Override 182 public void append(Entry entry) throws IOException { 183 if (throwException) { 184 throw new IOException("FAKE! Failed to replace a bad datanode...APPEND"); 185 } 186 w.append(entry); 187 } 188 189 @Override 190 public long getLength() { 191 return w.getLength(); 192 } 193 194 @Override 195 public long getSyncedLength() { 196 return w.getSyncedLength(); 197 } 198 }; 199 } 200 } 201 202 /** 203 * Reproduce locking up that happens when we get an inopportune sync during setup for 204 * zigzaglatch wait. See HBASE-14317. If below is broken, we will see this test timeout because 205 * it is locked up. 206 * <p>First I need to set up some mocks for Server and RegionServerServices. I also need to 207 * set up a dodgy WAL that will throw an exception when we go to append to it. 208 */ 209 @Test 210 public void testLockupWhenSyncInMiddleOfZigZagSetup() throws IOException { 211 // Mocked up server and regionserver services. Needed below. 212 RegionServerServices services = Mockito.mock(RegionServerServices.class); 213 Mockito.when(services.getConfiguration()).thenReturn(CONF); 214 Mockito.when(services.isStopped()).thenReturn(false); 215 Mockito.when(services.isAborted()).thenReturn(false); 216 217 // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test. 218 FileSystem fs = FileSystem.get(CONF); 219 Path rootDir = new Path(dir + getName()); 220 DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF); 221 dodgyWAL.init(); 222 Path originalWAL = dodgyWAL.getCurrentFileName(); 223 // I need a log roller running. 224 LogRoller logRoller = new LogRoller(services); 225 logRoller.addWAL(dodgyWAL); 226 // There is no 'stop' once a logRoller is running.. it just dies. 227 logRoller.start(); 228 // Now get a region and start adding in edits. 229 final HRegion region = initHRegion(tableName, null, null, CONF, dodgyWAL); 230 byte [] bytes = Bytes.toBytes(getName()); 231 NavigableMap<byte[], Integer> scopes = new TreeMap<>( 232 Bytes.BYTES_COMPARATOR); 233 scopes.put(COLUMN_FAMILY_BYTES, 0); 234 MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl(); 235 try { 236 // First get something into memstore. Make a Put and then pull the Cell out of it. Will 237 // manage append and sync carefully in below to manufacture hang. We keep adding same 238 // edit. WAL subsystem doesn't care. 239 Put put = new Put(bytes); 240 put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes); 241 WALKeyImpl key = new WALKeyImpl(region.getRegionInfo().getEncodedNameAsBytes(), 242 TableName.META_TABLE_NAME, System.currentTimeMillis(), mvcc, scopes); 243 WALEdit edit = new WALEdit(); 244 CellScanner CellScanner = put.cellScanner(); 245 assertTrue(CellScanner.advance()); 246 edit.add(CellScanner.current()); 247 // Put something in memstore and out in the WAL. Do a big number of appends so we push 248 // out other side of the ringbuffer. If small numbers, stuff doesn't make it to WAL 249 for (int i = 0; i < 1000; i++) { 250 region.put(put); 251 } 252 // Set it so we start throwing exceptions. 253 LOG.info("SET throwing of exception on append"); 254 dodgyWAL.throwException = true; 255 // This append provokes a WAL roll request 256 dodgyWAL.appendData(region.getRegionInfo(), key, edit); 257 boolean exception = false; 258 try { 259 dodgyWAL.sync(false); 260 } catch (Exception e) { 261 exception = true; 262 } 263 assertTrue("Did not get sync exception", exception); 264 265 // Get a memstore flush going too so we have same hung profile as up in the issue over 266 // in HBASE-14317. Flush hangs trying to get sequenceid because the ringbuffer is held up 267 // by the zigzaglatch waiting on syncs to come home. 268 Thread t = new Thread ("Flusher") { 269 @Override 270 public void run() { 271 try { 272 if (region.getMemStoreDataSize() <= 0) { 273 throw new IOException("memstore size=" + region.getMemStoreDataSize()); 274 } 275 region.flush(false); 276 } catch (IOException e) { 277 // Can fail trying to flush in middle of a roll. Not a failure. Will succeed later 278 // when roll completes. 279 LOG.info("In flush", e); 280 } 281 LOG.info("Exiting"); 282 }; 283 }; 284 t.setDaemon(true); 285 t.start(); 286 // Wait until 287 while (dodgyWAL.latch.getCount() > 0) { 288 Threads.sleep(1); 289 } 290 // Now assert I got a new WAL file put in place even though loads of errors above. 291 assertTrue(originalWAL != dodgyWAL.getCurrentFileName()); 292 // Can I append to it? 293 dodgyWAL.throwException = false; 294 try { 295 region.put(put); 296 } catch (Exception e) { 297 LOG.info("In the put", e); 298 } 299 } finally { 300 // To stop logRoller, its server has to say it is stopped. 301 Mockito.when(services.isStopped()).thenReturn(true); 302 Closeables.close(logRoller, true); 303 try { 304 if (region != null) { 305 region.close(); 306 } 307 if (dodgyWAL != null) { 308 dodgyWAL.close(); 309 } 310 } catch (Exception e) { 311 LOG.info("On way out", e); 312 } 313 } 314 } 315 316 /** 317 * 318 * If below is broken, we will see this test timeout because RingBufferEventHandler was stuck in 319 * attainSafePoint. Everyone will wait for sync to finish forever. See HBASE-14317. 320 */ 321 @Test 322 public void testRingBufferEventHandlerStuckWhenSyncFailed() 323 throws IOException, InterruptedException { 324 325 // A WAL that we can have throw exceptions and slow FSHLog.replaceWriter down 326 class DodgyFSLog extends FSHLog { 327 328 private volatile boolean zigZagCreated = false; 329 330 public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf) 331 throws IOException { 332 super(fs, root, logDir, conf); 333 } 334 335 @Override 336 protected void afterCreatingZigZagLatch() { 337 zigZagCreated = true; 338 // Sleep a while to wait for RingBufferEventHandler to get stuck first. 339 try { 340 Thread.sleep(3000); 341 } catch (InterruptedException ignore) { 342 } 343 } 344 345 @Override 346 protected long getSequenceOnRingBuffer() { 347 return super.getSequenceOnRingBuffer(); 348 } 349 350 protected void publishSyncOnRingBufferAndBlock(long sequence) { 351 try { 352 super.blockOnSync(super.publishSyncOnRingBuffer(sequence, false)); 353 Assert.fail("Expect an IOException here."); 354 } catch (IOException ignore) { 355 // Here, we will get an IOException. 356 } 357 } 358 359 @Override 360 protected Writer createWriterInstance(Path path) throws IOException { 361 final Writer w = super.createWriterInstance(path); 362 return new Writer() { 363 @Override 364 public void close() throws IOException { 365 w.close(); 366 } 367 368 @Override 369 public void sync(boolean forceSync) throws IOException { 370 throw new IOException("FAKE! Failed to replace a bad datanode...SYNC"); 371 } 372 373 @Override 374 public void append(Entry entry) throws IOException { 375 w.append(entry); 376 } 377 378 @Override 379 public long getLength() { 380 return w.getLength(); 381 } 382 383 @Override 384 public long getSyncedLength() { 385 return w.getSyncedLength(); 386 } 387 }; 388 } 389 } 390 391 // Mocked up server and regionserver services. Needed below. 392 RegionServerServices services = Mockito.mock(RegionServerServices.class); 393 Mockito.when(services.getConfiguration()).thenReturn(CONF); 394 Mockito.when(services.isStopped()).thenReturn(false); 395 Mockito.when(services.isAborted()).thenReturn(false); 396 397 // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test. 398 FileSystem fs = FileSystem.get(CONF); 399 Path rootDir = new Path(dir + getName()); 400 final DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF); 401 dodgyWAL.init(); 402 // I need a log roller running. 403 LogRoller logRoller = new LogRoller(services); 404 logRoller.addWAL(dodgyWAL); 405 // There is no 'stop' once a logRoller is running.. it just dies. 406 logRoller.start(); 407 408 try { 409 final long seqForSync = dodgyWAL.getSequenceOnRingBuffer(); 410 411 // This call provokes a WAL roll, and we will get a new RingBufferEventHandler.ZigZagLatch 412 // in LogRoller. 413 // After creating ZigZagLatch, RingBufferEventHandler would get stuck due to sync event, 414 // as long as HBASE-14317 hasn't be fixed. 415 LOG.info("Trigger log roll for creating a ZigZagLatch."); 416 logRoller.requestRollAll(); 417 418 while (!dodgyWAL.zigZagCreated) { 419 Thread.sleep(10); 420 } 421 422 // Send a sync event for RingBufferEventHandler, 423 // and it gets blocked in RingBufferEventHandler.attainSafePoint 424 LOG.info("Send sync for RingBufferEventHandler"); 425 Thread syncThread = new Thread() { 426 @Override 427 public void run() { 428 dodgyWAL.publishSyncOnRingBufferAndBlock(seqForSync); 429 } 430 }; 431 // Sync in another thread to avoid reset SyncFuture again. 432 syncThread.start(); 433 syncThread.join(); 434 435 try { 436 LOG.info("Call sync for testing whether RingBufferEventHandler is hanging."); 437 dodgyWAL.sync(false); // Should not get a hang here, otherwise we will see timeout in this test. 438 Assert.fail("Expect an IOException here."); 439 } catch (IOException ignore) { 440 } 441 442 } finally { 443 // To stop logRoller, its server has to say it is stopped. 444 Mockito.when(services.isStopped()).thenReturn(true); 445 if (logRoller != null) { 446 logRoller.close(); 447 } 448 if (dodgyWAL != null) { 449 dodgyWAL.close(); 450 } 451 } 452 } 453 454 455 static class DummyServer implements Server { 456 private Configuration conf; 457 private String serverName; 458 private boolean isAborted = false; 459 460 public DummyServer(Configuration conf, String serverName) { 461 this.conf = conf; 462 this.serverName = serverName; 463 } 464 465 @Override 466 public Configuration getConfiguration() { 467 return conf; 468 } 469 470 @Override 471 public ZKWatcher getZooKeeper() { 472 return null; 473 } 474 475 @Override 476 public CoordinatedStateManager getCoordinatedStateManager() { 477 return null; 478 } 479 480 @Override 481 public ClusterConnection getConnection() { 482 return null; 483 } 484 485 @Override 486 public ServerName getServerName() { 487 return ServerName.valueOf(this.serverName); 488 } 489 490 @Override 491 public void abort(String why, Throwable e) { 492 LOG.info("Aborting " + serverName); 493 this.isAborted = true; 494 } 495 496 @Override 497 public boolean isAborted() { 498 return this.isAborted; 499 } 500 501 @Override 502 public void stop(String why) { 503 this.isAborted = true; 504 } 505 506 @Override 507 public boolean isStopped() { 508 return this.isAborted; 509 } 510 511 @Override 512 public ChoreService getChoreService() { 513 return null; 514 } 515 516 @Override 517 public ClusterConnection getClusterConnection() { 518 return null; 519 } 520 521 @Override 522 public FileSystem getFileSystem() { 523 return null; 524 } 525 526 @Override 527 public boolean isStopping() { 528 return false; 529 } 530 531 @Override 532 public Connection createConnection(Configuration conf) throws IOException { 533 return null; 534 } 535 } 536 537 static class DummyWALActionsListener implements WALActionsListener { 538 539 @Override 540 public void visitLogEntryBeforeWrite(WALKey logKey, WALEdit logEdit) 541 throws IOException { 542 if (logKey.getTableName().getNameAsString().equalsIgnoreCase("sleep")) { 543 try { 544 Thread.sleep(1000); 545 } catch (InterruptedException e) { 546 e.printStackTrace(); 547 } 548 } 549 if (logKey.getTableName().getNameAsString() 550 .equalsIgnoreCase("DamagedWALException")) { 551 throw new DamagedWALException("Failed appending"); 552 } 553 } 554 555 } 556 557 /** 558 * @return A region on which you must call {@link HBaseTestingUtility#closeRegionAndWAL(HRegion)} 559 * when done. 560 */ 561 private static HRegion initHRegion(TableName tableName, byte[] startKey, byte[] stopKey, 562 Configuration conf, WAL wal) throws IOException { 563 ChunkCreator.initialize(MemStoreLAB.CHUNK_SIZE_DEFAULT, false, 0, 0, 564 0, null, MemStoreLAB.INDEX_CHUNK_SIZE_PERCENTAGE_DEFAULT); 565 return TEST_UTIL.createLocalHRegion(tableName, startKey, stopKey, conf, false, 566 Durability.SYNC_WAL, wal, COLUMN_FAMILY_BYTES); 567 } 568}