001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.regionserver;
019
020import static org.junit.Assert.assertTrue;
021
022import java.io.IOException;
023import java.util.NavigableMap;
024import java.util.TreeMap;
025import java.util.concurrent.CountDownLatch;
026import java.util.concurrent.TimeUnit;
027
028import org.apache.hadoop.conf.Configuration;
029import org.apache.hadoop.fs.FileSystem;
030import org.apache.hadoop.fs.Path;
031import org.apache.hadoop.hbase.CellScanner;
032import org.apache.hadoop.hbase.HBaseClassTestRule;
033import org.apache.hadoop.hbase.HBaseTestingUtil;
034import org.apache.hadoop.hbase.HConstants;
035import org.apache.hadoop.hbase.TableName;
036import org.apache.hadoop.hbase.client.Durability;
037import org.apache.hadoop.hbase.client.Put;
038import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
039import org.apache.hadoop.hbase.testclassification.MediumTests;
040import org.apache.hadoop.hbase.testclassification.RegionServerTests;
041import org.apache.hadoop.hbase.util.Bytes;
042import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
043import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper;
044import org.apache.hadoop.hbase.util.Threads;
045import org.apache.hadoop.hbase.wal.WAL;
046import org.apache.hadoop.hbase.wal.WALEdit;
047import org.apache.hadoop.hbase.wal.WALKeyImpl;
048import org.apache.hadoop.hbase.wal.WALProvider.Writer;
049import org.junit.After;
050import org.junit.Assert;
051import org.junit.Before;
052import org.junit.ClassRule;
053import org.junit.Rule;
054import org.junit.Test;
055import org.junit.experimental.categories.Category;
056import org.junit.rules.TestName;
057import org.mockito.Mockito;
058import org.slf4j.Logger;
059import org.slf4j.LoggerFactory;
060
061import org.apache.hbase.thirdparty.com.google.common.io.Closeables;
062
063/**
064 * Testing for lock up of FSHLog.
065 */
066@Category({ RegionServerTests.class, MediumTests.class })
067public class TestWALLockup {
068
069  @ClassRule
070  public static final HBaseClassTestRule CLASS_RULE =
071      HBaseClassTestRule.forClass(TestWALLockup.class);
072
073  private static final Logger LOG = LoggerFactory.getLogger(TestWALLockup.class);
074
075  @Rule
076  public TestName name = new TestName();
077
078  private static final String COLUMN_FAMILY = "MyCF";
079  private static final byte [] COLUMN_FAMILY_BYTES = Bytes.toBytes(COLUMN_FAMILY);
080
081  private static HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
082  private static Configuration CONF ;
083  private String dir;
084
085  // Test names
086  protected TableName tableName;
087
088  @Before
089  public void setup() throws IOException {
090    CONF = TEST_UTIL.getConfiguration();
091    // Disable block cache.
092    CONF.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0f);
093    dir = TEST_UTIL.getDataTestDir("TestHRegion").toString();
094    tableName = TableName.valueOf(name.getMethodName());
095  }
096
097  @After
098  public void tearDown() throws Exception {
099    EnvironmentEdgeManagerTestHelper.reset();
100    LOG.info("Cleaning test directory: " + TEST_UTIL.getDataTestDir());
101    TEST_UTIL.cleanupTestDir();
102  }
103
104  private String getName() {
105    return name.getMethodName();
106  }
107
108  // A WAL that we can have throw exceptions when a flag is set.
109  private static final class DodgyFSLog extends FSHLog {
110    // Set this when want the WAL to start throwing exceptions.
111    volatile boolean throwException = false;
112
113    // Latch to hold up processing until after another operation has had time to run.
114    CountDownLatch latch = new CountDownLatch(1);
115
116    public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
117        throws IOException {
118      super(fs, root, logDir, conf);
119    }
120
121    @Override
122    protected void afterCreatingZigZagLatch() {
123      // If throwException set, then append will throw an exception causing the WAL to be
124      // rolled. We'll come in here. Hold up processing until a sync can get in before
125      // the zigzag has time to complete its setup and get its own sync in. This is what causes
126      // the lock up we've seen in production.
127      if (throwException) {
128        try {
129          LOG.info("LATCHED");
130          // So, timing can have it that the test can run and the bad flush below happens
131          // before we get here. In this case, we'll be stuck waiting on this latch but there
132          // is nothing in the WAL pipeline to get us to the below beforeWaitOnSafePoint...
133          // because all WALs have rolled. In this case, just give up on test.
134          if (!this.latch.await(5, TimeUnit.SECONDS)) {
135            LOG.warn("GIVE UP! Failed waiting on latch...Test is ABORTED!");
136          }
137        } catch (InterruptedException e) {
138        }
139      }
140    }
141
142    @Override
143    protected void beforeWaitOnSafePoint() {
144      if (throwException) {
145        LOG.info("COUNTDOWN");
146        // Don't countdown latch until someone waiting on it otherwise, the above
147        // afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll
148        // be stuck; test won't go down
149        while (this.latch.getCount() <= 0)
150          Threads.sleep(1);
151        this.latch.countDown();
152      }
153    }
154
155    @Override
156    protected Writer createWriterInstance(Path path) throws IOException {
157      final Writer w = super.createWriterInstance(path);
158      return new Writer() {
159        @Override
160        public void close() throws IOException {
161          w.close();
162        }
163
164        @Override
165        public void sync(boolean forceSync) throws IOException {
166          if (throwException) {
167            throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
168          }
169          w.sync(forceSync);
170        }
171
172        @Override
173        public void append(Entry entry) throws IOException {
174          if (throwException) {
175            throw new IOException("FAKE! Failed to replace a bad datanode...APPEND");
176          }
177          w.append(entry);
178        }
179
180        @Override
181        public long getLength() {
182          return w.getLength();
183        }
184
185        @Override
186        public long getSyncedLength() {
187          return w.getSyncedLength();
188        }
189      };
190    }
191  }
192
193  /**
194   * Reproduce locking up that happens when we get an inopportune sync during setup for
195   * zigzaglatch wait. See HBASE-14317. If below is broken, we will see this test timeout because
196   * it is locked up.
197   * <p>First I need to set up some mocks for Server and RegionServerServices. I also need to
198   * set up a dodgy WAL that will throw an exception when we go to append to it.
199   */
200  @Test
201  public void testLockupWhenSyncInMiddleOfZigZagSetup() throws IOException {
202    // Mocked up server and regionserver services. Needed below.
203    RegionServerServices services = Mockito.mock(RegionServerServices.class);
204    Mockito.when(services.getConfiguration()).thenReturn(CONF);
205    Mockito.when(services.isStopped()).thenReturn(false);
206    Mockito.when(services.isAborted()).thenReturn(false);
207
208    // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
209    FileSystem fs = FileSystem.get(CONF);
210    Path rootDir = new Path(dir + getName());
211    DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
212    dodgyWAL.init();
213    Path originalWAL = dodgyWAL.getCurrentFileName();
214    // I need a log roller running.
215    LogRoller logRoller = new LogRoller(services);
216    logRoller.addWAL(dodgyWAL);
217    // There is no 'stop' once a logRoller is running.. it just dies.
218    logRoller.start();
219    // Now get a region and start adding in edits.
220    final HRegion region = initHRegion(tableName, null, null, CONF, dodgyWAL);
221    byte [] bytes = Bytes.toBytes(getName());
222    NavigableMap<byte[], Integer> scopes = new TreeMap<>(
223        Bytes.BYTES_COMPARATOR);
224    scopes.put(COLUMN_FAMILY_BYTES, 0);
225    MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl();
226    try {
227      // First get something into memstore. Make a Put and then pull the Cell out of it. Will
228      // manage append and sync carefully in below to manufacture hang. We keep adding same
229      // edit. WAL subsystem doesn't care.
230      Put put = new Put(bytes);
231      put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes);
232      WALKeyImpl key = new WALKeyImpl(region.getRegionInfo().getEncodedNameAsBytes(),
233        TableName.META_TABLE_NAME, EnvironmentEdgeManager.currentTime(), mvcc, scopes);
234      WALEdit edit = new WALEdit();
235      CellScanner CellScanner = put.cellScanner();
236      assertTrue(CellScanner.advance());
237      edit.add(CellScanner.current());
238      // Put something in memstore and out in the WAL. Do a big number of appends so we push
239      // out other side of the ringbuffer. If small numbers, stuff doesn't make it to WAL
240      for (int i = 0; i < 1000; i++) {
241        region.put(put);
242      }
243      // Set it so we start throwing exceptions.
244      LOG.info("SET throwing of exception on append");
245      dodgyWAL.throwException = true;
246      // This append provokes a WAL roll request
247      dodgyWAL.appendData(region.getRegionInfo(), key, edit);
248      boolean exception = false;
249      try {
250        dodgyWAL.sync(false);
251      } catch (Exception e) {
252        exception = true;
253      }
254      assertTrue("Did not get sync exception", exception);
255
256      // Get a memstore flush going too so we have same hung profile as up in the issue over
257      // in HBASE-14317. Flush hangs trying to get sequenceid because the ringbuffer is held up
258      // by the zigzaglatch waiting on syncs to come home.
259      Thread t = new Thread ("Flusher") {
260        @Override
261        public void run() {
262          try {
263            if (region.getMemStoreDataSize() <= 0) {
264              throw new IOException("memstore size=" + region.getMemStoreDataSize());
265            }
266            region.flush(false);
267          } catch (IOException e) {
268            // Can fail trying to flush in middle of a roll. Not a failure. Will succeed later
269            // when roll completes.
270            LOG.info("In flush", e);
271          }
272          LOG.info("Exiting");
273        }
274      };
275      t.setDaemon(true);
276      t.start();
277      // Wait until
278      while (dodgyWAL.latch.getCount() > 0) {
279        Threads.sleep(1);
280      }
281      // Now assert I got a new WAL file put in place even though loads of errors above.
282      assertTrue(originalWAL != dodgyWAL.getCurrentFileName());
283      // Can I append to it?
284      dodgyWAL.throwException = false;
285      try {
286        region.put(put);
287      } catch (Exception e) {
288        LOG.info("In the put", e);
289      }
290    } finally {
291      // To stop logRoller, its server has to say it is stopped.
292      Mockito.when(services.isStopped()).thenReturn(true);
293      Closeables.close(logRoller, true);
294      try {
295        if (region != null) {
296          region.close();
297        }
298        if (dodgyWAL != null) {
299          dodgyWAL.close();
300        }
301      } catch (Exception e) {
302        LOG.info("On way out", e);
303      }
304    }
305  }
306
307  /**
308   *
309   * If below is broken, we will see this test timeout because RingBufferEventHandler was stuck in
310   * attainSafePoint. Everyone will wait for sync to finish forever. See HBASE-14317.
311   */
312  @Test
313  public void testRingBufferEventHandlerStuckWhenSyncFailed()
314    throws IOException, InterruptedException {
315
316    // A WAL that we can have throw exceptions and slow FSHLog.replaceWriter down
317    class DodgyFSLog extends FSHLog {
318
319      private volatile boolean zigZagCreated = false;
320
321      public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
322        throws IOException {
323        super(fs, root, logDir, conf);
324      }
325
326      @Override
327      protected void afterCreatingZigZagLatch() {
328        zigZagCreated = true;
329        // Sleep a while to wait for RingBufferEventHandler to get stuck first.
330        try {
331          Thread.sleep(3000);
332        } catch (InterruptedException ignore) {
333        }
334      }
335
336      @Override
337      protected long getSequenceOnRingBuffer() {
338        return super.getSequenceOnRingBuffer();
339      }
340
341      protected void publishSyncOnRingBufferAndBlock(long sequence) {
342        try {
343          super.blockOnSync(super.publishSyncOnRingBuffer(sequence, false));
344          Assert.fail("Expect an IOException here.");
345        } catch (IOException ignore) {
346          // Here, we will get an IOException.
347        }
348      }
349
350      @Override
351      protected Writer createWriterInstance(Path path) throws IOException {
352        final Writer w = super.createWriterInstance(path);
353        return new Writer() {
354          @Override
355          public void close() throws IOException {
356            w.close();
357          }
358
359          @Override
360          public void sync(boolean forceSync) throws IOException {
361            throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
362          }
363
364          @Override
365          public void append(Entry entry) throws IOException {
366            w.append(entry);
367          }
368
369          @Override
370          public long getLength() {
371            return w.getLength();
372          }
373
374          @Override
375          public long getSyncedLength() {
376            return w.getSyncedLength();
377          }
378        };
379      }
380    }
381
382    // Mocked up server and regionserver services. Needed below.
383    RegionServerServices services = Mockito.mock(RegionServerServices.class);
384    Mockito.when(services.getConfiguration()).thenReturn(CONF);
385    Mockito.when(services.isStopped()).thenReturn(false);
386    Mockito.when(services.isAborted()).thenReturn(false);
387
388    // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
389    FileSystem fs = FileSystem.get(CONF);
390    Path rootDir = new Path(dir + getName());
391    final DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
392    // I need a log roller running.
393    LogRoller logRoller = new LogRoller(services);
394    logRoller.addWAL(dodgyWAL);
395    // There is no 'stop' once a logRoller is running.. it just dies.
396    logRoller.start();
397
398    try {
399      final long seqForSync = dodgyWAL.getSequenceOnRingBuffer();
400
401      // This call provokes a WAL roll, and we will get a new RingBufferEventHandler.ZigZagLatch
402      // in LogRoller.
403      // After creating ZigZagLatch, RingBufferEventHandler would get stuck due to sync event,
404      // as long as HBASE-14317 hasn't be fixed.
405      LOG.info("Trigger log roll for creating a ZigZagLatch.");
406      logRoller.requestRollAll();
407
408      while (!dodgyWAL.zigZagCreated) {
409        Thread.sleep(10);
410      }
411
412      // Send a sync event for RingBufferEventHandler,
413      // and it gets blocked in RingBufferEventHandler.attainSafePoint
414      LOG.info("Send sync for RingBufferEventHandler");
415      Thread syncThread = new Thread() {
416        @Override
417        public void run() {
418          dodgyWAL.publishSyncOnRingBufferAndBlock(seqForSync);
419        }
420      };
421      // Sync in another thread to avoid reset SyncFuture again.
422      syncThread.start();
423      syncThread.join();
424
425      try {
426        LOG.info("Call sync for testing whether RingBufferEventHandler is hanging.");
427        dodgyWAL.sync(false); // Should not get a hang here, otherwise we will see timeout in this test.
428        Assert.fail("Expect an IOException here.");
429      } catch (IOException ignore) {
430      }
431
432    } finally {
433      // To stop logRoller, its server has to say it is stopped.
434      Mockito.when(services.isStopped()).thenReturn(true);
435      if (logRoller != null) {
436        logRoller.close();
437      }
438      if (dodgyWAL != null) {
439        dodgyWAL.close();
440      }
441    }
442  }
443
444  /**
445   * @return A region on which you must call {@link HBaseTestingUtil#closeRegionAndWAL(HRegion)}
446   *         when done.
447   */
448  private static HRegion initHRegion(TableName tableName, byte[] startKey, byte[] stopKey,
449      Configuration conf, WAL wal) throws IOException {
450    ChunkCreator.initialize(MemStoreLAB.CHUNK_SIZE_DEFAULT, false, 0, 0,
451      0, null, MemStoreLAB.INDEX_CHUNK_SIZE_PERCENTAGE_DEFAULT);
452    return TEST_UTIL.createLocalHRegion(tableName, startKey, stopKey, conf, false,
453      Durability.SYNC_WAL, wal, COLUMN_FAMILY_BYTES);
454  }
455}