001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.regionserver; 019 020import static org.junit.Assert.assertTrue; 021 022import java.io.IOException; 023import java.util.NavigableMap; 024import java.util.TreeMap; 025import java.util.concurrent.CountDownLatch; 026import java.util.concurrent.TimeUnit; 027import org.apache.hadoop.conf.Configuration; 028import org.apache.hadoop.fs.FileSystem; 029import org.apache.hadoop.fs.Path; 030import org.apache.hadoop.hbase.CellScanner; 031import org.apache.hadoop.hbase.ChoreService; 032import org.apache.hadoop.hbase.CoordinatedStateManager; 033import org.apache.hadoop.hbase.HBaseClassTestRule; 034import org.apache.hadoop.hbase.HBaseTestingUtility; 035import org.apache.hadoop.hbase.HConstants; 036import org.apache.hadoop.hbase.Server; 037import org.apache.hadoop.hbase.ServerName; 038import org.apache.hadoop.hbase.TableName; 039import org.apache.hadoop.hbase.client.ClusterConnection; 040import org.apache.hadoop.hbase.client.Connection; 041import org.apache.hadoop.hbase.client.Durability; 042import org.apache.hadoop.hbase.client.Put; 043import org.apache.hadoop.hbase.regionserver.wal.FSHLog; 044import org.apache.hadoop.hbase.testclassification.MediumTests; 045import org.apache.hadoop.hbase.testclassification.RegionServerTests; 046import org.apache.hadoop.hbase.util.Bytes; 047import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; 048import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper; 049import org.apache.hadoop.hbase.util.Threads; 050import org.apache.hadoop.hbase.wal.WAL; 051import org.apache.hadoop.hbase.wal.WALEdit; 052import org.apache.hadoop.hbase.wal.WALKeyImpl; 053import org.apache.hadoop.hbase.wal.WALProvider.Writer; 054import org.apache.hadoop.hbase.zookeeper.ZKWatcher; 055import org.junit.After; 056import org.junit.Assert; 057import org.junit.Before; 058import org.junit.ClassRule; 059import org.junit.Rule; 060import org.junit.Test; 061import org.junit.experimental.categories.Category; 062import org.junit.rules.TestName; 063import org.mockito.Mockito; 064import org.slf4j.Logger; 065import org.slf4j.LoggerFactory; 066 067import org.apache.hbase.thirdparty.com.google.common.io.Closeables; 068 069/** 070 * Testing for lock up of FSHLog. 071 */ 072@Category({ RegionServerTests.class, MediumTests.class }) 073public class TestWALLockup { 074 075 @ClassRule 076 public static final HBaseClassTestRule CLASS_RULE = 077 HBaseClassTestRule.forClass(TestWALLockup.class); 078 079 private static final Logger LOG = LoggerFactory.getLogger(TestWALLockup.class); 080 081 @Rule 082 public TestName name = new TestName(); 083 084 private static final String COLUMN_FAMILY = "MyCF"; 085 private static final byte[] COLUMN_FAMILY_BYTES = Bytes.toBytes(COLUMN_FAMILY); 086 087 HRegion region = null; 088 private static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); 089 private static Configuration CONF; 090 private String dir; 091 092 // Test names 093 protected TableName tableName; 094 095 @Before 096 public void setup() throws IOException { 097 CONF = TEST_UTIL.getConfiguration(); 098 // Disable block cache. 099 CONF.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0f); 100 dir = TEST_UTIL.getDataTestDir("TestHRegion").toString(); 101 tableName = TableName.valueOf(name.getMethodName()); 102 } 103 104 @After 105 public void tearDown() throws Exception { 106 EnvironmentEdgeManagerTestHelper.reset(); 107 LOG.info("Cleaning test directory: " + TEST_UTIL.getDataTestDir()); 108 TEST_UTIL.cleanupTestDir(); 109 } 110 111 private String getName() { 112 return name.getMethodName(); 113 } 114 115 // A WAL that we can have throw exceptions when a flag is set. 116 private static final class DodgyFSLog extends FSHLog { 117 // Set this when want the WAL to start throwing exceptions. 118 volatile boolean throwException = false; 119 120 // Latch to hold up processing until after another operation has had time to run. 121 CountDownLatch latch = new CountDownLatch(1); 122 123 public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf) 124 throws IOException { 125 super(fs, root, logDir, conf); 126 } 127 128 @Override 129 protected void afterCreatingZigZagLatch() { 130 // If throwException set, then append will throw an exception causing the WAL to be 131 // rolled. We'll come in here. Hold up processing until a sync can get in before 132 // the zigzag has time to complete its setup and get its own sync in. This is what causes 133 // the lock up we've seen in production. 134 if (throwException) { 135 try { 136 LOG.info("LATCHED"); 137 // So, timing can have it that the test can run and the bad flush below happens 138 // before we get here. In this case, we'll be stuck waiting on this latch but there 139 // is nothing in the WAL pipeline to get us to the below beforeWaitOnSafePoint... 140 // because all WALs have rolled. In this case, just give up on test. 141 if (!this.latch.await(5, TimeUnit.SECONDS)) { 142 LOG.warn("GIVE UP! Failed waiting on latch...Test is ABORTED!"); 143 } 144 } catch (InterruptedException e) { 145 } 146 } 147 } 148 149 @Override 150 protected void beforeWaitOnSafePoint() { 151 if (throwException) { 152 LOG.info("COUNTDOWN"); 153 // Don't countdown latch until someone waiting on it otherwise, the above 154 // afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll 155 // be stuck; test won't go down 156 while (this.latch.getCount() <= 0) 157 Threads.sleep(1); 158 this.latch.countDown(); 159 } 160 } 161 162 @Override 163 protected Writer createWriterInstance(Path path) throws IOException { 164 final Writer w = super.createWriterInstance(path); 165 return new Writer() { 166 @Override 167 public void close() throws IOException { 168 w.close(); 169 } 170 171 @Override 172 public void sync(boolean forceSync) throws IOException { 173 if (throwException) { 174 throw new IOException("FAKE! Failed to replace a bad datanode...SYNC"); 175 } 176 w.sync(forceSync); 177 } 178 179 @Override 180 public void append(Entry entry) throws IOException { 181 if (throwException) { 182 throw new IOException("FAKE! Failed to replace a bad datanode...APPEND"); 183 } 184 w.append(entry); 185 } 186 187 @Override 188 public long getLength() { 189 return w.getLength(); 190 } 191 192 @Override 193 public long getSyncedLength() { 194 return w.getSyncedLength(); 195 } 196 }; 197 } 198 } 199 200 /** 201 * Reproduce locking up that happens when we get an inopportune sync during setup for zigzaglatch 202 * wait. See HBASE-14317. If below is broken, we will see this test timeout because it is locked 203 * up. 204 * <p> 205 * First I need to set up some mocks for Server and RegionServerServices. I also need to set up a 206 * dodgy WAL that will throw an exception when we go to append to it. 207 */ 208 @Test 209 public void testLockupWhenSyncInMiddleOfZigZagSetup() throws IOException { 210 // Mocked up server and regionserver services. Needed below. 211 RegionServerServices services = Mockito.mock(RegionServerServices.class); 212 Mockito.when(services.getConfiguration()).thenReturn(CONF); 213 Mockito.when(services.isStopped()).thenReturn(false); 214 Mockito.when(services.isAborted()).thenReturn(false); 215 216 // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test. 217 FileSystem fs = FileSystem.get(CONF); 218 Path rootDir = new Path(dir + getName()); 219 DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF); 220 dodgyWAL.init(); 221 Path originalWAL = dodgyWAL.getCurrentFileName(); 222 // I need a log roller running. 223 LogRoller logRoller = new LogRoller(services); 224 logRoller.addWAL(dodgyWAL); 225 // There is no 'stop' once a logRoller is running.. it just dies. 226 logRoller.start(); 227 // Now get a region and start adding in edits. 228 final HRegion region = initHRegion(tableName, null, null, CONF, dodgyWAL); 229 byte[] bytes = Bytes.toBytes(getName()); 230 NavigableMap<byte[], Integer> scopes = new TreeMap<>(Bytes.BYTES_COMPARATOR); 231 scopes.put(COLUMN_FAMILY_BYTES, 0); 232 MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl(); 233 try { 234 // First get something into memstore. Make a Put and then pull the Cell out of it. Will 235 // manage append and sync carefully in below to manufacture hang. We keep adding same 236 // edit. WAL subsystem doesn't care. 237 Put put = new Put(bytes); 238 put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes); 239 WALKeyImpl key = new WALKeyImpl(region.getRegionInfo().getEncodedNameAsBytes(), 240 TableName.META_TABLE_NAME, EnvironmentEdgeManager.currentTime(), mvcc, scopes); 241 WALEdit edit = new WALEdit(); 242 CellScanner CellScanner = put.cellScanner(); 243 assertTrue(CellScanner.advance()); 244 edit.add(CellScanner.current()); 245 // Put something in memstore and out in the WAL. Do a big number of appends so we push 246 // out other side of the ringbuffer. If small numbers, stuff doesn't make it to WAL 247 for (int i = 0; i < 1000; i++) { 248 region.put(put); 249 } 250 // Set it so we start throwing exceptions. 251 LOG.info("SET throwing of exception on append"); 252 dodgyWAL.throwException = true; 253 // This append provokes a WAL roll request 254 dodgyWAL.appendData(region.getRegionInfo(), key, edit); 255 boolean exception = false; 256 try { 257 dodgyWAL.sync(false); 258 } catch (Exception e) { 259 exception = true; 260 } 261 assertTrue("Did not get sync exception", exception); 262 263 // Get a memstore flush going too so we have same hung profile as up in the issue over 264 // in HBASE-14317. Flush hangs trying to get sequenceid because the ringbuffer is held up 265 // by the zigzaglatch waiting on syncs to come home. 266 Thread t = new Thread("Flusher") { 267 @Override 268 public void run() { 269 try { 270 if (region.getMemStoreDataSize() <= 0) { 271 throw new IOException("memstore size=" + region.getMemStoreDataSize()); 272 } 273 region.flush(false); 274 } catch (IOException e) { 275 // Can fail trying to flush in middle of a roll. Not a failure. Will succeed later 276 // when roll completes. 277 LOG.info("In flush", e); 278 } 279 LOG.info("Exiting"); 280 }; 281 }; 282 t.setDaemon(true); 283 t.start(); 284 // Wait until 285 while (dodgyWAL.latch.getCount() > 0) { 286 Threads.sleep(1); 287 } 288 // Now assert I got a new WAL file put in place even though loads of errors above. 289 assertTrue(originalWAL != dodgyWAL.getCurrentFileName()); 290 // Can I append to it? 291 dodgyWAL.throwException = false; 292 try { 293 region.put(put); 294 } catch (Exception e) { 295 LOG.info("In the put", e); 296 } 297 } finally { 298 // To stop logRoller, its server has to say it is stopped. 299 Mockito.when(services.isStopped()).thenReturn(true); 300 Closeables.close(logRoller, true); 301 try { 302 if (region != null) { 303 region.close(); 304 } 305 if (dodgyWAL != null) { 306 dodgyWAL.close(); 307 } 308 } catch (Exception e) { 309 LOG.info("On way out", e); 310 } 311 } 312 } 313 314 /** 315 * If below is broken, we will see this test timeout because RingBufferEventHandler was stuck in 316 * attainSafePoint. Everyone will wait for sync to finish forever. See HBASE-14317. 317 */ 318 @Test 319 public void testRingBufferEventHandlerStuckWhenSyncFailed() 320 throws IOException, InterruptedException { 321 322 // A WAL that we can have throw exceptions and slow FSHLog.replaceWriter down 323 class DodgyFSLog extends FSHLog { 324 325 private volatile boolean zigZagCreated = false; 326 327 public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf) 328 throws IOException { 329 super(fs, root, logDir, conf); 330 } 331 332 @Override 333 protected void afterCreatingZigZagLatch() { 334 zigZagCreated = true; 335 // Sleep a while to wait for RingBufferEventHandler to get stuck first. 336 try { 337 Thread.sleep(3000); 338 } catch (InterruptedException ignore) { 339 } 340 } 341 342 @Override 343 protected long getSequenceOnRingBuffer() { 344 return super.getSequenceOnRingBuffer(); 345 } 346 347 protected void publishSyncOnRingBufferAndBlock(long sequence) { 348 try { 349 super.blockOnSync(super.publishSyncOnRingBuffer(sequence, false)); 350 Assert.fail("Expect an IOException here."); 351 } catch (IOException ignore) { 352 // Here, we will get an IOException. 353 } 354 } 355 356 @Override 357 protected Writer createWriterInstance(Path path) throws IOException { 358 final Writer w = super.createWriterInstance(path); 359 return new Writer() { 360 @Override 361 public void close() throws IOException { 362 w.close(); 363 } 364 365 @Override 366 public void sync(boolean forceSync) throws IOException { 367 throw new IOException("FAKE! Failed to replace a bad datanode...SYNC"); 368 } 369 370 @Override 371 public void append(Entry entry) throws IOException { 372 w.append(entry); 373 } 374 375 @Override 376 public long getLength() { 377 return w.getLength(); 378 } 379 380 @Override 381 public long getSyncedLength() { 382 return w.getSyncedLength(); 383 } 384 }; 385 } 386 } 387 388 // Mocked up server and regionserver services. Needed below. 389 RegionServerServices services = Mockito.mock(RegionServerServices.class); 390 Mockito.when(services.getConfiguration()).thenReturn(CONF); 391 Mockito.when(services.isStopped()).thenReturn(false); 392 Mockito.when(services.isAborted()).thenReturn(false); 393 394 // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test. 395 FileSystem fs = FileSystem.get(CONF); 396 Path rootDir = new Path(dir + getName()); 397 final DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF); 398 dodgyWAL.init(); 399 // I need a log roller running. 400 LogRoller logRoller = new LogRoller(services); 401 logRoller.addWAL(dodgyWAL); 402 // There is no 'stop' once a logRoller is running.. it just dies. 403 logRoller.start(); 404 405 try { 406 final long seqForSync = dodgyWAL.getSequenceOnRingBuffer(); 407 408 // This call provokes a WAL roll, and we will get a new RingBufferEventHandler.ZigZagLatch 409 // in LogRoller. 410 // After creating ZigZagLatch, RingBufferEventHandler would get stuck due to sync event, 411 // as long as HBASE-14317 hasn't be fixed. 412 LOG.info("Trigger log roll for creating a ZigZagLatch."); 413 logRoller.requestRollAll(); 414 415 while (!dodgyWAL.zigZagCreated) { 416 Thread.sleep(10); 417 } 418 419 // Send a sync event for RingBufferEventHandler, 420 // and it gets blocked in RingBufferEventHandler.attainSafePoint 421 LOG.info("Send sync for RingBufferEventHandler"); 422 Thread syncThread = new Thread() { 423 @Override 424 public void run() { 425 dodgyWAL.publishSyncOnRingBufferAndBlock(seqForSync); 426 } 427 }; 428 // Sync in another thread to avoid reset SyncFuture again. 429 syncThread.start(); 430 syncThread.join(); 431 432 try { 433 LOG.info("Call sync for testing whether RingBufferEventHandler is hanging."); 434 dodgyWAL.sync(false); // Should not get a hang here, otherwise we will see timeout in this 435 // test. 436 Assert.fail("Expect an IOException here."); 437 } catch (IOException ignore) { 438 } 439 440 } finally { 441 // To stop logRoller, its server has to say it is stopped. 442 Mockito.when(services.isStopped()).thenReturn(true); 443 if (logRoller != null) { 444 logRoller.close(); 445 } 446 if (dodgyWAL != null) { 447 dodgyWAL.close(); 448 } 449 } 450 } 451 452 static class DummyServer implements Server { 453 private Configuration conf; 454 private String serverName; 455 private boolean isAborted = false; 456 457 public DummyServer(Configuration conf, String serverName) { 458 this.conf = conf; 459 this.serverName = serverName; 460 } 461 462 @Override 463 public Configuration getConfiguration() { 464 return conf; 465 } 466 467 @Override 468 public ZKWatcher getZooKeeper() { 469 return null; 470 } 471 472 @Override 473 public CoordinatedStateManager getCoordinatedStateManager() { 474 return null; 475 } 476 477 @Override 478 public ClusterConnection getConnection() { 479 return null; 480 } 481 482 @Override 483 public ServerName getServerName() { 484 return ServerName.valueOf(this.serverName); 485 } 486 487 @Override 488 public void abort(String why, Throwable e) { 489 LOG.info("Aborting " + serverName); 490 this.isAborted = true; 491 } 492 493 @Override 494 public boolean isAborted() { 495 return this.isAborted; 496 } 497 498 @Override 499 public void stop(String why) { 500 this.isAborted = true; 501 } 502 503 @Override 504 public boolean isStopped() { 505 return this.isAborted; 506 } 507 508 @Override 509 public ChoreService getChoreService() { 510 return null; 511 } 512 513 @Override 514 public ClusterConnection getClusterConnection() { 515 return null; 516 } 517 518 @Override 519 public FileSystem getFileSystem() { 520 return null; 521 } 522 523 @Override 524 public boolean isStopping() { 525 return false; 526 } 527 528 @Override 529 public Connection createConnection(Configuration conf) throws IOException { 530 return null; 531 } 532 } 533 534 /** 535 * @return A region on which you must call {@link HBaseTestingUtility#closeRegionAndWAL(HRegion)} 536 * when done. 537 */ 538 private static HRegion initHRegion(TableName tableName, byte[] startKey, byte[] stopKey, 539 Configuration conf, WAL wal) throws IOException { 540 ChunkCreator.initialize(MemStoreLAB.CHUNK_SIZE_DEFAULT, false, 0, 0, 0, null, 541 MemStoreLAB.INDEX_CHUNK_SIZE_PERCENTAGE_DEFAULT); 542 return TEST_UTIL.createLocalHRegion(tableName, startKey, stopKey, conf, false, 543 Durability.SYNC_WAL, wal, COLUMN_FAMILY_BYTES); 544 } 545}