001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.regionserver;
019
020import static org.junit.Assert.assertTrue;
021
022import java.io.IOException;
023import java.util.NavigableMap;
024import java.util.TreeMap;
025import java.util.concurrent.CountDownLatch;
026import java.util.concurrent.TimeUnit;
027import org.apache.hadoop.conf.Configuration;
028import org.apache.hadoop.fs.FileSystem;
029import org.apache.hadoop.fs.Path;
030import org.apache.hadoop.hbase.CellScanner;
031import org.apache.hadoop.hbase.ChoreService;
032import org.apache.hadoop.hbase.CoordinatedStateManager;
033import org.apache.hadoop.hbase.HBaseClassTestRule;
034import org.apache.hadoop.hbase.HBaseTestingUtility;
035import org.apache.hadoop.hbase.HConstants;
036import org.apache.hadoop.hbase.Server;
037import org.apache.hadoop.hbase.ServerName;
038import org.apache.hadoop.hbase.TableName;
039import org.apache.hadoop.hbase.client.ClusterConnection;
040import org.apache.hadoop.hbase.client.Connection;
041import org.apache.hadoop.hbase.client.Durability;
042import org.apache.hadoop.hbase.client.Put;
043import org.apache.hadoop.hbase.regionserver.wal.DamagedWALException;
044import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
045import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener;
046import org.apache.hadoop.hbase.testclassification.MediumTests;
047import org.apache.hadoop.hbase.testclassification.RegionServerTests;
048import org.apache.hadoop.hbase.util.Bytes;
049import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper;
050import org.apache.hadoop.hbase.util.Threads;
051import org.apache.hadoop.hbase.wal.WAL;
052import org.apache.hadoop.hbase.wal.WALEdit;
053import org.apache.hadoop.hbase.wal.WALKey;
054import org.apache.hadoop.hbase.wal.WALKeyImpl;
055import org.apache.hadoop.hbase.wal.WALProvider.Writer;
056import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
057import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
058import org.junit.After;
059import org.junit.Assert;
060import org.junit.Before;
061import org.junit.ClassRule;
062import org.junit.Rule;
063import org.junit.Test;
064import org.junit.experimental.categories.Category;
065import org.junit.rules.TestName;
066import org.mockito.Mockito;
067import org.slf4j.Logger;
068import org.slf4j.LoggerFactory;
069
070import org.apache.hbase.thirdparty.com.google.common.io.Closeables;
071
072/**
073 * Testing for lock up of FSHLog.
074 */
075@Category({ RegionServerTests.class, MediumTests.class })
076public class TestWALLockup {
077
078  @ClassRule
079  public static final HBaseClassTestRule CLASS_RULE =
080      HBaseClassTestRule.forClass(TestWALLockup.class);
081
082  private static final Logger LOG = LoggerFactory.getLogger(TestWALLockup.class);
083
084  @Rule
085  public TestName name = new TestName();
086
087  private static final String COLUMN_FAMILY = "MyCF";
088  private static final byte [] COLUMN_FAMILY_BYTES = Bytes.toBytes(COLUMN_FAMILY);
089
090  HRegion region = null;
091  private static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
092  private static Configuration CONF ;
093  private String dir;
094
095  // Test names
096  protected TableName tableName;
097
098  @Before
099  public void setup() throws IOException {
100    CONF = TEST_UTIL.getConfiguration();
101    // Disable block cache.
102    CONF.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0f);
103    dir = TEST_UTIL.getDataTestDir("TestHRegion").toString();
104    tableName = TableName.valueOf(name.getMethodName());
105  }
106
107  @After
108  public void tearDown() throws Exception {
109    EnvironmentEdgeManagerTestHelper.reset();
110    LOG.info("Cleaning test directory: " + TEST_UTIL.getDataTestDir());
111    TEST_UTIL.cleanupTestDir();
112  }
113
114  private String getName() {
115    return name.getMethodName();
116  }
117
118  // A WAL that we can have throw exceptions when a flag is set.
119  private static final class DodgyFSLog extends FSHLog {
120    // Set this when want the WAL to start throwing exceptions.
121    volatile boolean throwException = false;
122
123    // Latch to hold up processing until after another operation has had time to run.
124    CountDownLatch latch = new CountDownLatch(1);
125
126    public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
127        throws IOException {
128      super(fs, root, logDir, conf);
129    }
130
131    @Override
132    protected void afterCreatingZigZagLatch() {
133      // If throwException set, then append will throw an exception causing the WAL to be
134      // rolled. We'll come in here. Hold up processing until a sync can get in before
135      // the zigzag has time to complete its setup and get its own sync in. This is what causes
136      // the lock up we've seen in production.
137      if (throwException) {
138        try {
139          LOG.info("LATCHED");
140          // So, timing can have it that the test can run and the bad flush below happens
141          // before we get here. In this case, we'll be stuck waiting on this latch but there
142          // is nothing in the WAL pipeline to get us to the below beforeWaitOnSafePoint...
143          // because all WALs have rolled. In this case, just give up on test.
144          if (!this.latch.await(5, TimeUnit.SECONDS)) {
145            LOG.warn("GIVE UP! Failed waiting on latch...Test is ABORTED!");
146          }
147        } catch (InterruptedException e) {
148        }
149      }
150    }
151
152    @Override
153    protected void beforeWaitOnSafePoint() {
154      if (throwException) {
155        LOG.info("COUNTDOWN");
156        // Don't countdown latch until someone waiting on it otherwise, the above
157        // afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll
158        // be stuck; test won't go down
159        while (this.latch.getCount() <= 0)
160          Threads.sleep(1);
161        this.latch.countDown();
162      }
163    }
164
165    @Override
166    protected Writer createWriterInstance(Path path) throws IOException {
167      final Writer w = super.createWriterInstance(path);
168      return new Writer() {
169        @Override
170        public void close() throws IOException {
171          w.close();
172        }
173
174        @Override
175        public void sync() throws IOException {
176          if (throwException) {
177            throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
178          }
179          w.sync();
180        }
181
182        @Override
183        public void append(Entry entry) throws IOException {
184          if (throwException) {
185            throw new IOException("FAKE! Failed to replace a bad datanode...APPEND");
186          }
187          w.append(entry);
188        }
189
190        @Override
191        public long getLength() {
192          return w.getLength();
193        }
194      };
195    }
196  }
197
198  /**
199   * Reproduce locking up that happens when we get an inopportune sync during setup for
200   * zigzaglatch wait. See HBASE-14317. If below is broken, we will see this test timeout because
201   * it is locked up.
202   * <p>First I need to set up some mocks for Server and RegionServerServices. I also need to
203   * set up a dodgy WAL that will throw an exception when we go to append to it.
204   */
205  @Test
206  public void testLockupWhenSyncInMiddleOfZigZagSetup() throws IOException {
207    // Mocked up server and regionserver services. Needed below.
208    Server server = Mockito.mock(Server.class);
209    Mockito.when(server.getConfiguration()).thenReturn(CONF);
210    Mockito.when(server.isStopped()).thenReturn(false);
211    Mockito.when(server.isAborted()).thenReturn(false);
212    RegionServerServices services = Mockito.mock(RegionServerServices.class);
213
214    // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
215    FileSystem fs = FileSystem.get(CONF);
216    Path rootDir = new Path(dir + getName());
217    DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
218    Path originalWAL = dodgyWAL.getCurrentFileName();
219    // I need a log roller running.
220    LogRoller logRoller = new LogRoller(server, services);
221    logRoller.addWAL(dodgyWAL);
222    // There is no 'stop' once a logRoller is running.. it just dies.
223    logRoller.start();
224    // Now get a region and start adding in edits.
225    final HRegion region = initHRegion(tableName, null, null, dodgyWAL);
226    byte [] bytes = Bytes.toBytes(getName());
227    NavigableMap<byte[], Integer> scopes = new TreeMap<>(
228        Bytes.BYTES_COMPARATOR);
229    scopes.put(COLUMN_FAMILY_BYTES, 0);
230    MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl();
231    try {
232      // First get something into memstore. Make a Put and then pull the Cell out of it. Will
233      // manage append and sync carefully in below to manufacture hang. We keep adding same
234      // edit. WAL subsystem doesn't care.
235      Put put = new Put(bytes);
236      put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes);
237      WALKeyImpl key = new WALKeyImpl(region.getRegionInfo().getEncodedNameAsBytes(),
238        TableName.META_TABLE_NAME, System.currentTimeMillis(), mvcc, scopes);
239      WALEdit edit = new WALEdit();
240      CellScanner CellScanner = put.cellScanner();
241      assertTrue(CellScanner.advance());
242      edit.add(CellScanner.current());
243      // Put something in memstore and out in the WAL. Do a big number of appends so we push
244      // out other side of the ringbuffer. If small numbers, stuff doesn't make it to WAL
245      for (int i = 0; i < 1000; i++) {
246        region.put(put);
247      }
248      // Set it so we start throwing exceptions.
249      LOG.info("SET throwing of exception on append");
250      dodgyWAL.throwException = true;
251      // This append provokes a WAL roll request
252      dodgyWAL.append(region.getRegionInfo(), key, edit, true);
253      boolean exception = false;
254      try {
255        dodgyWAL.sync();
256      } catch (Exception e) {
257        exception = true;
258      }
259      assertTrue("Did not get sync exception", exception);
260
261      // Get a memstore flush going too so we have same hung profile as up in the issue over
262      // in HBASE-14317. Flush hangs trying to get sequenceid because the ringbuffer is held up
263      // by the zigzaglatch waiting on syncs to come home.
264      Thread t = new Thread ("Flusher") {
265        @Override
266        public void run() {
267          try {
268            if (region.getMemStoreDataSize() <= 0) {
269              throw new IOException("memstore size=" + region.getMemStoreDataSize());
270            }
271            region.flush(false);
272          } catch (IOException e) {
273            // Can fail trying to flush in middle of a roll. Not a failure. Will succeed later
274            // when roll completes.
275            LOG.info("In flush", e);
276          }
277          LOG.info("Exiting");
278        };
279      };
280      t.setDaemon(true);
281      t.start();
282      // Wait until
283      while (dodgyWAL.latch.getCount() > 0) {
284        Threads.sleep(1);
285      }
286      // Now assert I got a new WAL file put in place even though loads of errors above.
287      assertTrue(originalWAL != dodgyWAL.getCurrentFileName());
288      // Can I append to it?
289      dodgyWAL.throwException = false;
290      try {
291        region.put(put);
292      } catch (Exception e) {
293        LOG.info("In the put", e);
294      }
295    } finally {
296      // To stop logRoller, its server has to say it is stopped.
297      Mockito.when(server.isStopped()).thenReturn(true);
298      Closeables.close(logRoller, true);
299      try {
300        if (region != null) {
301          region.close();
302        }
303        if (dodgyWAL != null) {
304          dodgyWAL.close();
305        }
306      } catch (Exception e) {
307        LOG.info("On way out", e);
308      }
309    }
310  }
311
312  /**
313   *
314   * If below is broken, we will see this test timeout because RingBufferEventHandler was stuck in
315   * attainSafePoint. Everyone will wait for sync to finish forever. See HBASE-14317.
316   */
317  @Test (timeout=30000)
318  public void testRingBufferEventHandlerStuckWhenSyncFailed()
319    throws IOException, InterruptedException {
320
321    // A WAL that we can have throw exceptions and slow FSHLog.replaceWriter down
322    class DodgyFSLog extends FSHLog {
323
324      private volatile boolean zigZagCreated = false;
325
326      public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
327        throws IOException {
328        super(fs, root, logDir, conf);
329      }
330
331      @Override
332      protected void afterCreatingZigZagLatch() {
333        zigZagCreated = true;
334        // Sleep a while to wait for RingBufferEventHandler to get stuck first.
335        try {
336          Thread.sleep(3000);
337        } catch (InterruptedException ignore) {
338        }
339      }
340
341      @Override
342      protected long getSequenceOnRingBuffer() {
343        return super.getSequenceOnRingBuffer();
344      }
345
346      protected void publishSyncOnRingBufferAndBlock(long sequence) {
347        try {
348          super.blockOnSync(super.publishSyncOnRingBuffer(sequence));
349          Assert.fail("Expect an IOException here.");
350        } catch (IOException ignore) {
351          // Here, we will get an IOException.
352        }
353      }
354
355      @Override
356      protected Writer createWriterInstance(Path path) throws IOException {
357        final Writer w = super.createWriterInstance(path);
358        return new Writer() {
359          @Override
360          public void close() throws IOException {
361            w.close();
362          }
363
364          @Override
365          public void sync() throws IOException {
366            throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
367          }
368
369          @Override
370          public void append(Entry entry) throws IOException {
371            w.append(entry);
372          }
373
374          @Override
375          public long getLength() {
376            return w.getLength();
377          }
378        };
379      }
380    }
381
382    // Mocked up server and regionserver services. Needed below.
383    final Server server = Mockito.mock(Server.class);
384    Mockito.when(server.getConfiguration()).thenReturn(CONF);
385    Mockito.when(server.isStopped()).thenReturn(false);
386    Mockito.when(server.isAborted()).thenReturn(false);
387    RegionServerServices services = Mockito.mock(RegionServerServices.class);
388
389    // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
390    FileSystem fs = FileSystem.get(CONF);
391    Path rootDir = new Path(dir + getName());
392    final DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
393    // I need a log roller running.
394    LogRoller logRoller = new LogRoller(server, services);
395    logRoller.addWAL(dodgyWAL);
396    // There is no 'stop' once a logRoller is running.. it just dies.
397    logRoller.start();
398
399    try {
400      final long seqForSync = dodgyWAL.getSequenceOnRingBuffer();
401
402      // This call provokes a WAL roll, and we will get a new RingBufferEventHandler.ZigZagLatch
403      // in LogRoller.
404      // After creating ZigZagLatch, RingBufferEventHandler would get stuck due to sync event,
405      // as long as HBASE-14317 hasn't be fixed.
406      LOG.info("Trigger log roll for creating a ZigZagLatch.");
407      logRoller.requestRollAll();
408
409      while (!dodgyWAL.zigZagCreated) {
410        Thread.sleep(10);
411      }
412
413      // Send a sync event for RingBufferEventHandler,
414      // and it gets blocked in RingBufferEventHandler.attainSafePoint
415      LOG.info("Send sync for RingBufferEventHandler");
416      Thread syncThread = new Thread() {
417        @Override
418        public void run() {
419          dodgyWAL.publishSyncOnRingBufferAndBlock(seqForSync);
420        }
421      };
422      // Sync in another thread to avoid reset SyncFuture again.
423      syncThread.start();
424      syncThread.join();
425
426      try {
427        LOG.info("Call sync for testing whether RingBufferEventHandler is hanging.");
428        dodgyWAL.sync(); // Should not get a hang here, otherwise we will see timeout in this test.
429        Assert.fail("Expect an IOException here.");
430      } catch (IOException ignore) {
431      }
432
433    } finally {
434      // To stop logRoller, its server has to say it is stopped.
435      Mockito.when(server.isStopped()).thenReturn(true);
436      if (logRoller != null) {
437        logRoller.close();
438      }
439      if (dodgyWAL != null) {
440        dodgyWAL.close();
441      }
442    }
443  }
444
445
446  static class DummyServer implements Server {
447    private Configuration conf;
448    private String serverName;
449    private boolean isAborted = false;
450
451    public DummyServer(Configuration conf, String serverName) {
452      this.conf = conf;
453      this.serverName = serverName;
454    }
455
456    @Override
457    public Configuration getConfiguration() {
458      return conf;
459    }
460
461    @Override
462    public ZKWatcher getZooKeeper() {
463      return null;
464    }
465
466    @Override
467    public CoordinatedStateManager getCoordinatedStateManager() {
468      return null;
469    }
470
471    @Override
472    public ClusterConnection getConnection() {
473      return null;
474    }
475
476    @Override
477    public MetaTableLocator getMetaTableLocator() {
478      return null;
479    }
480
481    @Override
482    public ServerName getServerName() {
483      return ServerName.valueOf(this.serverName);
484    }
485
486    @Override
487    public void abort(String why, Throwable e) {
488      LOG.info("Aborting " + serverName);
489      this.isAborted = true;
490    }
491
492    @Override
493    public boolean isAborted() {
494      return this.isAborted;
495    }
496
497    @Override
498    public void stop(String why) {
499      this.isAborted = true;
500    }
501
502    @Override
503    public boolean isStopped() {
504      return this.isAborted;
505    }
506
507    @Override
508    public ChoreService getChoreService() {
509      return null;
510    }
511
512    @Override
513    public ClusterConnection getClusterConnection() {
514      return null;
515    }
516
517    @Override
518    public FileSystem getFileSystem() {
519      return null;
520    }
521
522    @Override
523    public boolean isStopping() {
524      return false;
525    }
526
527    @Override
528    public Connection createConnection(Configuration conf) throws IOException {
529      return null;
530    }
531  }
532
533  static class DummyWALActionsListener implements WALActionsListener {
534
535    @Override
536    public void visitLogEntryBeforeWrite(WALKey logKey, WALEdit logEdit)
537        throws IOException {
538      if (logKey.getTableName().getNameAsString().equalsIgnoreCase("sleep")) {
539        try {
540          Thread.sleep(1000);
541        } catch (InterruptedException e) {
542          e.printStackTrace();
543        }
544      }
545      if (logKey.getTableName().getNameAsString()
546          .equalsIgnoreCase("DamagedWALException")) {
547        throw new DamagedWALException("Failed appending");
548      }
549    }
550
551  }
552
553  /**
554   * @return A region on which you must call {@link HBaseTestingUtility#closeRegionAndWAL(HRegion)}
555   *         when done.
556   */
557  private static HRegion initHRegion(TableName tableName, byte[] startKey, byte[] stopKey, WAL wal)
558      throws IOException {
559    ChunkCreator.initialize(MemStoreLABImpl.CHUNK_SIZE_DEFAULT, false, 0, 0, 0, null);
560    return TEST_UTIL.createLocalHRegion(tableName, startKey, stopKey, false, Durability.SYNC_WAL,
561      wal, COLUMN_FAMILY_BYTES);
562  }
563}