001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.regionserver;
019
020import static org.junit.Assert.assertTrue;
021
022import java.io.IOException;
023import java.util.NavigableMap;
024import java.util.TreeMap;
025import java.util.concurrent.CountDownLatch;
026import java.util.concurrent.TimeUnit;
027import org.apache.hadoop.conf.Configuration;
028import org.apache.hadoop.fs.FileSystem;
029import org.apache.hadoop.fs.Path;
030import org.apache.hadoop.hbase.CellScanner;
031import org.apache.hadoop.hbase.HBaseClassTestRule;
032import org.apache.hadoop.hbase.HBaseTestingUtil;
033import org.apache.hadoop.hbase.HConstants;
034import org.apache.hadoop.hbase.TableName;
035import org.apache.hadoop.hbase.client.Durability;
036import org.apache.hadoop.hbase.client.Put;
037import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
038import org.apache.hadoop.hbase.testclassification.MediumTests;
039import org.apache.hadoop.hbase.testclassification.RegionServerTests;
040import org.apache.hadoop.hbase.util.Bytes;
041import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
042import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper;
043import org.apache.hadoop.hbase.util.Threads;
044import org.apache.hadoop.hbase.wal.WAL;
045import org.apache.hadoop.hbase.wal.WALEdit;
046import org.apache.hadoop.hbase.wal.WALKeyImpl;
047import org.apache.hadoop.hbase.wal.WALProvider.Writer;
048import org.junit.After;
049import org.junit.Assert;
050import org.junit.Before;
051import org.junit.ClassRule;
052import org.junit.Rule;
053import org.junit.Test;
054import org.junit.experimental.categories.Category;
055import org.junit.rules.TestName;
056import org.mockito.Mockito;
057import org.slf4j.Logger;
058import org.slf4j.LoggerFactory;
059
060import org.apache.hbase.thirdparty.com.google.common.io.Closeables;
061
062/**
063 * Testing for lock up of FSHLog.
064 */
065@Category({ RegionServerTests.class, MediumTests.class })
066public class TestWALLockup {
067
068  @ClassRule
069  public static final HBaseClassTestRule CLASS_RULE =
070    HBaseClassTestRule.forClass(TestWALLockup.class);
071
072  private static final Logger LOG = LoggerFactory.getLogger(TestWALLockup.class);
073
074  @Rule
075  public TestName name = new TestName();
076
077  private static final String COLUMN_FAMILY = "MyCF";
078  private static final byte[] COLUMN_FAMILY_BYTES = Bytes.toBytes(COLUMN_FAMILY);
079
080  private static HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
081  private static Configuration CONF;
082  private String dir;
083
084  // Test names
085  protected TableName tableName;
086
087  @Before
088  public void setup() throws IOException {
089    CONF = TEST_UTIL.getConfiguration();
090    // Disable block cache.
091    CONF.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0f);
092    dir = TEST_UTIL.getDataTestDir("TestHRegion").toString();
093    tableName = TableName.valueOf(name.getMethodName());
094  }
095
096  @After
097  public void tearDown() throws Exception {
098    EnvironmentEdgeManagerTestHelper.reset();
099    LOG.info("Cleaning test directory: " + TEST_UTIL.getDataTestDir());
100    TEST_UTIL.cleanupTestDir();
101  }
102
103  private String getName() {
104    return name.getMethodName();
105  }
106
107  // A WAL that we can have throw exceptions when a flag is set.
108  private static final class DodgyFSLog extends FSHLog {
109    // Set this when want the WAL to start throwing exceptions.
110    volatile boolean throwException = false;
111
112    // Latch to hold up processing until after another operation has had time to run.
113    CountDownLatch latch = new CountDownLatch(1);
114
115    public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
116      throws IOException {
117      super(fs, root, logDir, conf);
118    }
119
120    @Override
121    protected void afterCreatingZigZagLatch() {
122      // If throwException set, then append will throw an exception causing the WAL to be
123      // rolled. We'll come in here. Hold up processing until a sync can get in before
124      // the zigzag has time to complete its setup and get its own sync in. This is what causes
125      // the lock up we've seen in production.
126      if (throwException) {
127        try {
128          LOG.info("LATCHED");
129          // So, timing can have it that the test can run and the bad flush below happens
130          // before we get here. In this case, we'll be stuck waiting on this latch but there
131          // is nothing in the WAL pipeline to get us to the below beforeWaitOnSafePoint...
132          // because all WALs have rolled. In this case, just give up on test.
133          if (!this.latch.await(5, TimeUnit.SECONDS)) {
134            LOG.warn("GIVE UP! Failed waiting on latch...Test is ABORTED!");
135          }
136        } catch (InterruptedException e) {
137        }
138      }
139    }
140
141    @Override
142    protected void beforeWaitOnSafePoint() {
143      if (throwException) {
144        LOG.info("COUNTDOWN");
145        // Don't countdown latch until someone waiting on it otherwise, the above
146        // afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll
147        // be stuck; test won't go down
148        while (this.latch.getCount() <= 0)
149          Threads.sleep(1);
150        this.latch.countDown();
151      }
152    }
153
154    @Override
155    protected Writer createWriterInstance(Path path) throws IOException {
156      final Writer w = super.createWriterInstance(path);
157      return new Writer() {
158        @Override
159        public void close() throws IOException {
160          w.close();
161        }
162
163        @Override
164        public void sync(boolean forceSync) throws IOException {
165          if (throwException) {
166            throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
167          }
168          w.sync(forceSync);
169        }
170
171        @Override
172        public void append(Entry entry) throws IOException {
173          if (throwException) {
174            throw new IOException("FAKE! Failed to replace a bad datanode...APPEND");
175          }
176          w.append(entry);
177        }
178
179        @Override
180        public long getLength() {
181          return w.getLength();
182        }
183
184        @Override
185        public long getSyncedLength() {
186          return w.getSyncedLength();
187        }
188      };
189    }
190  }
191
192  /**
193   * Reproduce locking up that happens when we get an inopportune sync during setup for zigzaglatch
194   * wait. See HBASE-14317. If below is broken, we will see this test timeout because it is locked
195   * up.
196   * <p>
197   * First I need to set up some mocks for Server and RegionServerServices. I also need to set up a
198   * dodgy WAL that will throw an exception when we go to append to it.
199   */
200  @Test
201  public void testLockupWhenSyncInMiddleOfZigZagSetup() throws IOException {
202    // Mocked up server and regionserver services. Needed below.
203    RegionServerServices services = Mockito.mock(RegionServerServices.class);
204    Mockito.when(services.getConfiguration()).thenReturn(CONF);
205    Mockito.when(services.isStopped()).thenReturn(false);
206    Mockito.when(services.isAborted()).thenReturn(false);
207
208    // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
209    FileSystem fs = FileSystem.get(CONF);
210    Path rootDir = new Path(dir + getName());
211    DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
212    dodgyWAL.init();
213    Path originalWAL = dodgyWAL.getCurrentFileName();
214    // I need a log roller running.
215    LogRoller logRoller = new LogRoller(services);
216    logRoller.addWAL(dodgyWAL);
217    // There is no 'stop' once a logRoller is running.. it just dies.
218    logRoller.start();
219    // Now get a region and start adding in edits.
220    final HRegion region = initHRegion(tableName, null, null, CONF, dodgyWAL);
221    byte[] bytes = Bytes.toBytes(getName());
222    NavigableMap<byte[], Integer> scopes = new TreeMap<>(Bytes.BYTES_COMPARATOR);
223    scopes.put(COLUMN_FAMILY_BYTES, 0);
224    MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl();
225    try {
226      // First get something into memstore. Make a Put and then pull the Cell out of it. Will
227      // manage append and sync carefully in below to manufacture hang. We keep adding same
228      // edit. WAL subsystem doesn't care.
229      Put put = new Put(bytes);
230      put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes);
231      WALKeyImpl key = new WALKeyImpl(region.getRegionInfo().getEncodedNameAsBytes(),
232        TableName.META_TABLE_NAME, EnvironmentEdgeManager.currentTime(), mvcc, scopes);
233      WALEdit edit = new WALEdit();
234      CellScanner CellScanner = put.cellScanner();
235      assertTrue(CellScanner.advance());
236      edit.add(CellScanner.current());
237      // Put something in memstore and out in the WAL. Do a big number of appends so we push
238      // out other side of the ringbuffer. If small numbers, stuff doesn't make it to WAL
239      for (int i = 0; i < 1000; i++) {
240        region.put(put);
241      }
242      // Set it so we start throwing exceptions.
243      LOG.info("SET throwing of exception on append");
244      dodgyWAL.throwException = true;
245      // This append provokes a WAL roll request
246      dodgyWAL.appendData(region.getRegionInfo(), key, edit);
247      boolean exception = false;
248      try {
249        dodgyWAL.sync(false);
250      } catch (Exception e) {
251        exception = true;
252      }
253      assertTrue("Did not get sync exception", exception);
254
255      // Get a memstore flush going too so we have same hung profile as up in the issue over
256      // in HBASE-14317. Flush hangs trying to get sequenceid because the ringbuffer is held up
257      // by the zigzaglatch waiting on syncs to come home.
258      Thread t = new Thread("Flusher") {
259        @Override
260        public void run() {
261          try {
262            if (region.getMemStoreDataSize() <= 0) {
263              throw new IOException("memstore size=" + region.getMemStoreDataSize());
264            }
265            region.flush(false);
266          } catch (IOException e) {
267            // Can fail trying to flush in middle of a roll. Not a failure. Will succeed later
268            // when roll completes.
269            LOG.info("In flush", e);
270          }
271          LOG.info("Exiting");
272        }
273      };
274      t.setDaemon(true);
275      t.start();
276      // Wait until
277      while (dodgyWAL.latch.getCount() > 0) {
278        Threads.sleep(1);
279      }
280      // Now assert I got a new WAL file put in place even though loads of errors above.
281      assertTrue(originalWAL != dodgyWAL.getCurrentFileName());
282      // Can I append to it?
283      dodgyWAL.throwException = false;
284      try {
285        region.put(put);
286      } catch (Exception e) {
287        LOG.info("In the put", e);
288      }
289    } finally {
290      // To stop logRoller, its server has to say it is stopped.
291      Mockito.when(services.isStopped()).thenReturn(true);
292      Closeables.close(logRoller, true);
293      try {
294        if (region != null) {
295          region.close();
296        }
297        if (dodgyWAL != null) {
298          dodgyWAL.close();
299        }
300      } catch (Exception e) {
301        LOG.info("On way out", e);
302      }
303    }
304  }
305
306  /**
307   * If below is broken, we will see this test timeout because RingBufferEventHandler was stuck in
308   * attainSafePoint. Everyone will wait for sync to finish forever. See HBASE-14317.
309   */
310  @Test
311  public void testRingBufferEventHandlerStuckWhenSyncFailed()
312    throws IOException, InterruptedException {
313
314    // A WAL that we can have throw exceptions and slow FSHLog.replaceWriter down
315    class DodgyFSLog extends FSHLog {
316
317      private volatile boolean zigZagCreated = false;
318
319      public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
320        throws IOException {
321        super(fs, root, logDir, conf);
322      }
323
324      @Override
325      protected void afterCreatingZigZagLatch() {
326        zigZagCreated = true;
327        // Sleep a while to wait for RingBufferEventHandler to get stuck first.
328        try {
329          Thread.sleep(3000);
330        } catch (InterruptedException ignore) {
331        }
332      }
333
334      @Override
335      protected long getSequenceOnRingBuffer() {
336        return super.getSequenceOnRingBuffer();
337      }
338
339      protected void publishSyncOnRingBufferAndBlock(long sequence) {
340        try {
341          super.blockOnSync(super.publishSyncOnRingBuffer(sequence, false));
342          Assert.fail("Expect an IOException here.");
343        } catch (IOException ignore) {
344          // Here, we will get an IOException.
345        }
346      }
347
348      @Override
349      protected Writer createWriterInstance(Path path) throws IOException {
350        final Writer w = super.createWriterInstance(path);
351        return new Writer() {
352          @Override
353          public void close() throws IOException {
354            w.close();
355          }
356
357          @Override
358          public void sync(boolean forceSync) throws IOException {
359            throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
360          }
361
362          @Override
363          public void append(Entry entry) throws IOException {
364            w.append(entry);
365          }
366
367          @Override
368          public long getLength() {
369            return w.getLength();
370          }
371
372          @Override
373          public long getSyncedLength() {
374            return w.getSyncedLength();
375          }
376        };
377      }
378    }
379
380    // Mocked up server and regionserver services. Needed below.
381    RegionServerServices services = Mockito.mock(RegionServerServices.class);
382    Mockito.when(services.getConfiguration()).thenReturn(CONF);
383    Mockito.when(services.isStopped()).thenReturn(false);
384    Mockito.when(services.isAborted()).thenReturn(false);
385
386    // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
387    FileSystem fs = FileSystem.get(CONF);
388    Path rootDir = new Path(dir + getName());
389    final DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
390    // I need a log roller running.
391    LogRoller logRoller = new LogRoller(services);
392    logRoller.addWAL(dodgyWAL);
393    // There is no 'stop' once a logRoller is running.. it just dies.
394    logRoller.start();
395
396    try {
397      final long seqForSync = dodgyWAL.getSequenceOnRingBuffer();
398
399      // This call provokes a WAL roll, and we will get a new RingBufferEventHandler.ZigZagLatch
400      // in LogRoller.
401      // After creating ZigZagLatch, RingBufferEventHandler would get stuck due to sync event,
402      // as long as HBASE-14317 hasn't be fixed.
403      LOG.info("Trigger log roll for creating a ZigZagLatch.");
404      logRoller.requestRollAll();
405
406      while (!dodgyWAL.zigZagCreated) {
407        Thread.sleep(10);
408      }
409
410      // Send a sync event for RingBufferEventHandler,
411      // and it gets blocked in RingBufferEventHandler.attainSafePoint
412      LOG.info("Send sync for RingBufferEventHandler");
413      Thread syncThread = new Thread() {
414        @Override
415        public void run() {
416          dodgyWAL.publishSyncOnRingBufferAndBlock(seqForSync);
417        }
418      };
419      // Sync in another thread to avoid reset SyncFuture again.
420      syncThread.start();
421      syncThread.join();
422
423      try {
424        LOG.info("Call sync for testing whether RingBufferEventHandler is hanging.");
425        dodgyWAL.sync(false); // Should not get a hang here, otherwise we will see timeout in this
426                              // test.
427        Assert.fail("Expect an IOException here.");
428      } catch (IOException ignore) {
429      }
430
431    } finally {
432      // To stop logRoller, its server has to say it is stopped.
433      Mockito.when(services.isStopped()).thenReturn(true);
434      if (logRoller != null) {
435        logRoller.close();
436      }
437      if (dodgyWAL != null) {
438        dodgyWAL.close();
439      }
440    }
441  }
442
443  /**
444   * @return A region on which you must call {@link HBaseTestingUtil#closeRegionAndWAL(HRegion)}
445   *         when done.
446   */
447  private static HRegion initHRegion(TableName tableName, byte[] startKey, byte[] stopKey,
448    Configuration conf, WAL wal) throws IOException {
449    ChunkCreator.initialize(MemStoreLAB.CHUNK_SIZE_DEFAULT, false, 0, 0, 0, null,
450      MemStoreLAB.INDEX_CHUNK_SIZE_PERCENTAGE_DEFAULT);
451    return TEST_UTIL.createLocalHRegion(tableName, startKey, stopKey, conf, false,
452      Durability.SYNC_WAL, wal, COLUMN_FAMILY_BYTES);
453  }
454}