001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.regionserver;
019
020import static org.junit.Assert.assertTrue;
021
022import java.io.IOException;
023import java.util.NavigableMap;
024import java.util.TreeMap;
025import java.util.concurrent.CountDownLatch;
026import java.util.concurrent.TimeUnit;
027import org.apache.hadoop.conf.Configuration;
028import org.apache.hadoop.fs.FileSystem;
029import org.apache.hadoop.fs.Path;
030import org.apache.hadoop.hbase.CellScanner;
031import org.apache.hadoop.hbase.ChoreService;
032import org.apache.hadoop.hbase.CoordinatedStateManager;
033import org.apache.hadoop.hbase.HBaseClassTestRule;
034import org.apache.hadoop.hbase.HBaseTestingUtility;
035import org.apache.hadoop.hbase.HConstants;
036import org.apache.hadoop.hbase.Server;
037import org.apache.hadoop.hbase.ServerName;
038import org.apache.hadoop.hbase.TableName;
039import org.apache.hadoop.hbase.client.ClusterConnection;
040import org.apache.hadoop.hbase.client.Connection;
041import org.apache.hadoop.hbase.client.Durability;
042import org.apache.hadoop.hbase.client.Put;
043import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
044import org.apache.hadoop.hbase.testclassification.MediumTests;
045import org.apache.hadoop.hbase.testclassification.RegionServerTests;
046import org.apache.hadoop.hbase.util.Bytes;
047import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
048import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper;
049import org.apache.hadoop.hbase.util.Threads;
050import org.apache.hadoop.hbase.wal.WAL;
051import org.apache.hadoop.hbase.wal.WALEdit;
052import org.apache.hadoop.hbase.wal.WALKeyImpl;
053import org.apache.hadoop.hbase.wal.WALProvider.Writer;
054import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
055import org.junit.After;
056import org.junit.Assert;
057import org.junit.Before;
058import org.junit.ClassRule;
059import org.junit.Rule;
060import org.junit.Test;
061import org.junit.experimental.categories.Category;
062import org.junit.rules.TestName;
063import org.mockito.Mockito;
064import org.slf4j.Logger;
065import org.slf4j.LoggerFactory;
066
067import org.apache.hbase.thirdparty.com.google.common.io.Closeables;
068
069/**
070 * Testing for lock up of FSHLog.
071 */
072@Category({ RegionServerTests.class, MediumTests.class })
073public class TestWALLockup {
074
075  @ClassRule
076  public static final HBaseClassTestRule CLASS_RULE =
077    HBaseClassTestRule.forClass(TestWALLockup.class);
078
079  private static final Logger LOG = LoggerFactory.getLogger(TestWALLockup.class);
080
081  @Rule
082  public TestName name = new TestName();
083
084  private static final String COLUMN_FAMILY = "MyCF";
085  private static final byte[] COLUMN_FAMILY_BYTES = Bytes.toBytes(COLUMN_FAMILY);
086
087  HRegion region = null;
088  private static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
089  private static Configuration CONF;
090  private String dir;
091
092  // Test names
093  protected TableName tableName;
094
095  @Before
096  public void setup() throws IOException {
097    CONF = TEST_UTIL.getConfiguration();
098    // Disable block cache.
099    CONF.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0f);
100    dir = TEST_UTIL.getDataTestDir("TestHRegion").toString();
101    tableName = TableName.valueOf(name.getMethodName());
102  }
103
104  @After
105  public void tearDown() throws Exception {
106    EnvironmentEdgeManagerTestHelper.reset();
107    LOG.info("Cleaning test directory: " + TEST_UTIL.getDataTestDir());
108    TEST_UTIL.cleanupTestDir();
109  }
110
111  private String getName() {
112    return name.getMethodName();
113  }
114
115  // A WAL that we can have throw exceptions when a flag is set.
116  private static final class DodgyFSLog extends FSHLog {
117    // Set this when want the WAL to start throwing exceptions.
118    volatile boolean throwException = false;
119
120    // Latch to hold up processing until after another operation has had time to run.
121    CountDownLatch latch = new CountDownLatch(1);
122
123    public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
124      throws IOException {
125      super(fs, root, logDir, conf);
126    }
127
128    @Override
129    protected void afterCreatingZigZagLatch() {
130      // If throwException set, then append will throw an exception causing the WAL to be
131      // rolled. We'll come in here. Hold up processing until a sync can get in before
132      // the zigzag has time to complete its setup and get its own sync in. This is what causes
133      // the lock up we've seen in production.
134      if (throwException) {
135        try {
136          LOG.info("LATCHED");
137          // So, timing can have it that the test can run and the bad flush below happens
138          // before we get here. In this case, we'll be stuck waiting on this latch but there
139          // is nothing in the WAL pipeline to get us to the below beforeWaitOnSafePoint...
140          // because all WALs have rolled. In this case, just give up on test.
141          if (!this.latch.await(5, TimeUnit.SECONDS)) {
142            LOG.warn("GIVE UP! Failed waiting on latch...Test is ABORTED!");
143          }
144        } catch (InterruptedException e) {
145        }
146      }
147    }
148
149    @Override
150    protected void beforeWaitOnSafePoint() {
151      if (throwException) {
152        LOG.info("COUNTDOWN");
153        // Don't countdown latch until someone waiting on it otherwise, the above
154        // afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll
155        // be stuck; test won't go down
156        while (this.latch.getCount() <= 0)
157          Threads.sleep(1);
158        this.latch.countDown();
159      }
160    }
161
162    @Override
163    protected Writer createWriterInstance(Path path) throws IOException {
164      final Writer w = super.createWriterInstance(path);
165      return new Writer() {
166        @Override
167        public void close() throws IOException {
168          w.close();
169        }
170
171        @Override
172        public void sync(boolean forceSync) throws IOException {
173          if (throwException) {
174            throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
175          }
176          w.sync(forceSync);
177        }
178
179        @Override
180        public void append(Entry entry) throws IOException {
181          if (throwException) {
182            throw new IOException("FAKE! Failed to replace a bad datanode...APPEND");
183          }
184          w.append(entry);
185        }
186
187        @Override
188        public long getLength() {
189          return w.getLength();
190        }
191
192        @Override
193        public long getSyncedLength() {
194          return w.getSyncedLength();
195        }
196      };
197    }
198  }
199
200  /**
201   * Reproduce locking up that happens when we get an inopportune sync during setup for zigzaglatch
202   * wait. See HBASE-14317. If below is broken, we will see this test timeout because it is locked
203   * up.
204   * <p>
205   * First I need to set up some mocks for Server and RegionServerServices. I also need to set up a
206   * dodgy WAL that will throw an exception when we go to append to it.
207   */
208  @Test
209  public void testLockupWhenSyncInMiddleOfZigZagSetup() throws IOException {
210    // Mocked up server and regionserver services. Needed below.
211    RegionServerServices services = Mockito.mock(RegionServerServices.class);
212    Mockito.when(services.getConfiguration()).thenReturn(CONF);
213    Mockito.when(services.isStopped()).thenReturn(false);
214    Mockito.when(services.isAborted()).thenReturn(false);
215
216    // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
217    FileSystem fs = FileSystem.get(CONF);
218    Path rootDir = new Path(dir + getName());
219    DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
220    dodgyWAL.init();
221    Path originalWAL = dodgyWAL.getCurrentFileName();
222    // I need a log roller running.
223    LogRoller logRoller = new LogRoller(services);
224    logRoller.addWAL(dodgyWAL);
225    // There is no 'stop' once a logRoller is running.. it just dies.
226    logRoller.start();
227    // Now get a region and start adding in edits.
228    final HRegion region = initHRegion(tableName, null, null, CONF, dodgyWAL);
229    byte[] bytes = Bytes.toBytes(getName());
230    NavigableMap<byte[], Integer> scopes = new TreeMap<>(Bytes.BYTES_COMPARATOR);
231    scopes.put(COLUMN_FAMILY_BYTES, 0);
232    MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl();
233    try {
234      // First get something into memstore. Make a Put and then pull the Cell out of it. Will
235      // manage append and sync carefully in below to manufacture hang. We keep adding same
236      // edit. WAL subsystem doesn't care.
237      Put put = new Put(bytes);
238      put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes);
239      WALKeyImpl key = new WALKeyImpl(region.getRegionInfo().getEncodedNameAsBytes(),
240        TableName.META_TABLE_NAME, EnvironmentEdgeManager.currentTime(), mvcc, scopes);
241      WALEdit edit = new WALEdit();
242      CellScanner CellScanner = put.cellScanner();
243      assertTrue(CellScanner.advance());
244      edit.add(CellScanner.current());
245      // Put something in memstore and out in the WAL. Do a big number of appends so we push
246      // out other side of the ringbuffer. If small numbers, stuff doesn't make it to WAL
247      for (int i = 0; i < 1000; i++) {
248        region.put(put);
249      }
250      // Set it so we start throwing exceptions.
251      LOG.info("SET throwing of exception on append");
252      dodgyWAL.throwException = true;
253      // This append provokes a WAL roll request
254      dodgyWAL.appendData(region.getRegionInfo(), key, edit);
255      boolean exception = false;
256      try {
257        dodgyWAL.sync(false);
258      } catch (Exception e) {
259        exception = true;
260      }
261      assertTrue("Did not get sync exception", exception);
262
263      // Get a memstore flush going too so we have same hung profile as up in the issue over
264      // in HBASE-14317. Flush hangs trying to get sequenceid because the ringbuffer is held up
265      // by the zigzaglatch waiting on syncs to come home.
266      Thread t = new Thread("Flusher") {
267        @Override
268        public void run() {
269          try {
270            if (region.getMemStoreDataSize() <= 0) {
271              throw new IOException("memstore size=" + region.getMemStoreDataSize());
272            }
273            region.flush(false);
274          } catch (IOException e) {
275            // Can fail trying to flush in middle of a roll. Not a failure. Will succeed later
276            // when roll completes.
277            LOG.info("In flush", e);
278          }
279          LOG.info("Exiting");
280        };
281      };
282      t.setDaemon(true);
283      t.start();
284      // Wait until
285      while (dodgyWAL.latch.getCount() > 0) {
286        Threads.sleep(1);
287      }
288      // Now assert I got a new WAL file put in place even though loads of errors above.
289      assertTrue(originalWAL != dodgyWAL.getCurrentFileName());
290      // Can I append to it?
291      dodgyWAL.throwException = false;
292      try {
293        region.put(put);
294      } catch (Exception e) {
295        LOG.info("In the put", e);
296      }
297    } finally {
298      // To stop logRoller, its server has to say it is stopped.
299      Mockito.when(services.isStopped()).thenReturn(true);
300      Closeables.close(logRoller, true);
301      try {
302        if (region != null) {
303          region.close();
304        }
305        if (dodgyWAL != null) {
306          dodgyWAL.close();
307        }
308      } catch (Exception e) {
309        LOG.info("On way out", e);
310      }
311    }
312  }
313
314  /**
315   * If below is broken, we will see this test timeout because RingBufferEventHandler was stuck in
316   * attainSafePoint. Everyone will wait for sync to finish forever. See HBASE-14317.
317   */
318  @Test
319  public void testRingBufferEventHandlerStuckWhenSyncFailed()
320    throws IOException, InterruptedException {
321
322    // A WAL that we can have throw exceptions and slow FSHLog.replaceWriter down
323    class DodgyFSLog extends FSHLog {
324
325      private volatile boolean zigZagCreated = false;
326
327      public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
328        throws IOException {
329        super(fs, root, logDir, conf);
330      }
331
332      @Override
333      protected void afterCreatingZigZagLatch() {
334        zigZagCreated = true;
335        // Sleep a while to wait for RingBufferEventHandler to get stuck first.
336        try {
337          Thread.sleep(3000);
338        } catch (InterruptedException ignore) {
339        }
340      }
341
342      @Override
343      protected long getSequenceOnRingBuffer() {
344        return super.getSequenceOnRingBuffer();
345      }
346
347      protected void publishSyncOnRingBufferAndBlock(long sequence) {
348        try {
349          super.blockOnSync(super.publishSyncOnRingBuffer(sequence, false));
350          Assert.fail("Expect an IOException here.");
351        } catch (IOException ignore) {
352          // Here, we will get an IOException.
353        }
354      }
355
356      @Override
357      protected Writer createWriterInstance(Path path) throws IOException {
358        final Writer w = super.createWriterInstance(path);
359        return new Writer() {
360          @Override
361          public void close() throws IOException {
362            w.close();
363          }
364
365          @Override
366          public void sync(boolean forceSync) throws IOException {
367            throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
368          }
369
370          @Override
371          public void append(Entry entry) throws IOException {
372            w.append(entry);
373          }
374
375          @Override
376          public long getLength() {
377            return w.getLength();
378          }
379
380          @Override
381          public long getSyncedLength() {
382            return w.getSyncedLength();
383          }
384        };
385      }
386    }
387
388    // Mocked up server and regionserver services. Needed below.
389    RegionServerServices services = Mockito.mock(RegionServerServices.class);
390    Mockito.when(services.getConfiguration()).thenReturn(CONF);
391    Mockito.when(services.isStopped()).thenReturn(false);
392    Mockito.when(services.isAborted()).thenReturn(false);
393
394    // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
395    FileSystem fs = FileSystem.get(CONF);
396    Path rootDir = new Path(dir + getName());
397    final DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
398    dodgyWAL.init();
399    // I need a log roller running.
400    LogRoller logRoller = new LogRoller(services);
401    logRoller.addWAL(dodgyWAL);
402    // There is no 'stop' once a logRoller is running.. it just dies.
403    logRoller.start();
404
405    try {
406      final long seqForSync = dodgyWAL.getSequenceOnRingBuffer();
407
408      // This call provokes a WAL roll, and we will get a new RingBufferEventHandler.ZigZagLatch
409      // in LogRoller.
410      // After creating ZigZagLatch, RingBufferEventHandler would get stuck due to sync event,
411      // as long as HBASE-14317 hasn't be fixed.
412      LOG.info("Trigger log roll for creating a ZigZagLatch.");
413      logRoller.requestRollAll();
414
415      while (!dodgyWAL.zigZagCreated) {
416        Thread.sleep(10);
417      }
418
419      // Send a sync event for RingBufferEventHandler,
420      // and it gets blocked in RingBufferEventHandler.attainSafePoint
421      LOG.info("Send sync for RingBufferEventHandler");
422      Thread syncThread = new Thread() {
423        @Override
424        public void run() {
425          dodgyWAL.publishSyncOnRingBufferAndBlock(seqForSync);
426        }
427      };
428      // Sync in another thread to avoid reset SyncFuture again.
429      syncThread.start();
430      syncThread.join();
431
432      try {
433        LOG.info("Call sync for testing whether RingBufferEventHandler is hanging.");
434        dodgyWAL.sync(false); // Should not get a hang here, otherwise we will see timeout in this
435                              // test.
436        Assert.fail("Expect an IOException here.");
437      } catch (IOException ignore) {
438      }
439
440    } finally {
441      // To stop logRoller, its server has to say it is stopped.
442      Mockito.when(services.isStopped()).thenReturn(true);
443      if (logRoller != null) {
444        logRoller.close();
445      }
446      if (dodgyWAL != null) {
447        dodgyWAL.close();
448      }
449    }
450  }
451
452  static class DummyServer implements Server {
453    private Configuration conf;
454    private String serverName;
455    private boolean isAborted = false;
456
457    public DummyServer(Configuration conf, String serverName) {
458      this.conf = conf;
459      this.serverName = serverName;
460    }
461
462    @Override
463    public Configuration getConfiguration() {
464      return conf;
465    }
466
467    @Override
468    public ZKWatcher getZooKeeper() {
469      return null;
470    }
471
472    @Override
473    public CoordinatedStateManager getCoordinatedStateManager() {
474      return null;
475    }
476
477    @Override
478    public ClusterConnection getConnection() {
479      return null;
480    }
481
482    @Override
483    public ServerName getServerName() {
484      return ServerName.valueOf(this.serverName);
485    }
486
487    @Override
488    public void abort(String why, Throwable e) {
489      LOG.info("Aborting " + serverName);
490      this.isAborted = true;
491    }
492
493    @Override
494    public boolean isAborted() {
495      return this.isAborted;
496    }
497
498    @Override
499    public void stop(String why) {
500      this.isAborted = true;
501    }
502
503    @Override
504    public boolean isStopped() {
505      return this.isAborted;
506    }
507
508    @Override
509    public ChoreService getChoreService() {
510      return null;
511    }
512
513    @Override
514    public ClusterConnection getClusterConnection() {
515      return null;
516    }
517
518    @Override
519    public FileSystem getFileSystem() {
520      return null;
521    }
522
523    @Override
524    public boolean isStopping() {
525      return false;
526    }
527
528    @Override
529    public Connection createConnection(Configuration conf) throws IOException {
530      return null;
531    }
532  }
533
534  /**
535   * @return A region on which you must call {@link HBaseTestingUtility#closeRegionAndWAL(HRegion)}
536   *         when done.
537   */
538  private static HRegion initHRegion(TableName tableName, byte[] startKey, byte[] stopKey,
539    Configuration conf, WAL wal) throws IOException {
540    ChunkCreator.initialize(MemStoreLAB.CHUNK_SIZE_DEFAULT, false, 0, 0, 0, null,
541      MemStoreLAB.INDEX_CHUNK_SIZE_PERCENTAGE_DEFAULT);
542    return TEST_UTIL.createLocalHRegion(tableName, startKey, stopKey, conf, false,
543      Durability.SYNC_WAL, wal, COLUMN_FAMILY_BYTES);
544  }
545}