001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.regionserver;
019
020import static org.junit.Assert.assertTrue;
021
022import java.io.IOException;
023import java.util.NavigableMap;
024import java.util.TreeMap;
025import java.util.concurrent.CountDownLatch;
026import java.util.concurrent.TimeUnit;
027import org.apache.hadoop.conf.Configuration;
028import org.apache.hadoop.fs.FileSystem;
029import org.apache.hadoop.fs.Path;
030import org.apache.hadoop.hbase.CellScanner;
031import org.apache.hadoop.hbase.ChoreService;
032import org.apache.hadoop.hbase.CoordinatedStateManager;
033import org.apache.hadoop.hbase.HBaseClassTestRule;
034import org.apache.hadoop.hbase.HBaseTestingUtility;
035import org.apache.hadoop.hbase.HConstants;
036import org.apache.hadoop.hbase.Server;
037import org.apache.hadoop.hbase.ServerName;
038import org.apache.hadoop.hbase.TableName;
039import org.apache.hadoop.hbase.client.ClusterConnection;
040import org.apache.hadoop.hbase.client.Connection;
041import org.apache.hadoop.hbase.client.Durability;
042import org.apache.hadoop.hbase.client.Put;
043import org.apache.hadoop.hbase.regionserver.wal.DamagedWALException;
044import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
045import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener;
046import org.apache.hadoop.hbase.testclassification.MediumTests;
047import org.apache.hadoop.hbase.testclassification.RegionServerTests;
048import org.apache.hadoop.hbase.util.Bytes;
049import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper;
050import org.apache.hadoop.hbase.util.Threads;
051import org.apache.hadoop.hbase.wal.WAL;
052import org.apache.hadoop.hbase.wal.WALEdit;
053import org.apache.hadoop.hbase.wal.WALKey;
054import org.apache.hadoop.hbase.wal.WALKeyImpl;
055import org.apache.hadoop.hbase.wal.WALProvider.Writer;
056import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
057import org.junit.After;
058import org.junit.Assert;
059import org.junit.Before;
060import org.junit.ClassRule;
061import org.junit.Rule;
062import org.junit.Test;
063import org.junit.experimental.categories.Category;
064import org.junit.rules.TestName;
065import org.mockito.Mockito;
066import org.slf4j.Logger;
067import org.slf4j.LoggerFactory;
068
069import org.apache.hbase.thirdparty.com.google.common.io.Closeables;
070
071/**
072 * Testing for lock up of FSHLog.
073 */
074@Category({ RegionServerTests.class, MediumTests.class })
075public class TestWALLockup {
076
077  @ClassRule
078  public static final HBaseClassTestRule CLASS_RULE =
079      HBaseClassTestRule.forClass(TestWALLockup.class);
080
081  private static final Logger LOG = LoggerFactory.getLogger(TestWALLockup.class);
082
083  @Rule
084  public TestName name = new TestName();
085
086  private static final String COLUMN_FAMILY = "MyCF";
087  private static final byte [] COLUMN_FAMILY_BYTES = Bytes.toBytes(COLUMN_FAMILY);
088
089  HRegion region = null;
090  private static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
091  private static Configuration CONF ;
092  private String dir;
093
094  // Test names
095  protected TableName tableName;
096
097  @Before
098  public void setup() throws IOException {
099    CONF = TEST_UTIL.getConfiguration();
100    // Disable block cache.
101    CONF.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0f);
102    dir = TEST_UTIL.getDataTestDir("TestHRegion").toString();
103    tableName = TableName.valueOf(name.getMethodName());
104  }
105
106  @After
107  public void tearDown() throws Exception {
108    EnvironmentEdgeManagerTestHelper.reset();
109    LOG.info("Cleaning test directory: " + TEST_UTIL.getDataTestDir());
110    TEST_UTIL.cleanupTestDir();
111  }
112
113  private String getName() {
114    return name.getMethodName();
115  }
116
117  // A WAL that we can have throw exceptions when a flag is set.
118  private static final class DodgyFSLog extends FSHLog {
119    // Set this when want the WAL to start throwing exceptions.
120    volatile boolean throwException = false;
121
122    // Latch to hold up processing until after another operation has had time to run.
123    CountDownLatch latch = new CountDownLatch(1);
124
125    public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
126        throws IOException {
127      super(fs, root, logDir, conf);
128    }
129
130    @Override
131    protected void afterCreatingZigZagLatch() {
132      // If throwException set, then append will throw an exception causing the WAL to be
133      // rolled. We'll come in here. Hold up processing until a sync can get in before
134      // the zigzag has time to complete its setup and get its own sync in. This is what causes
135      // the lock up we've seen in production.
136      if (throwException) {
137        try {
138          LOG.info("LATCHED");
139          // So, timing can have it that the test can run and the bad flush below happens
140          // before we get here. In this case, we'll be stuck waiting on this latch but there
141          // is nothing in the WAL pipeline to get us to the below beforeWaitOnSafePoint...
142          // because all WALs have rolled. In this case, just give up on test.
143          if (!this.latch.await(5, TimeUnit.SECONDS)) {
144            LOG.warn("GIVE UP! Failed waiting on latch...Test is ABORTED!");
145          }
146        } catch (InterruptedException e) {
147        }
148      }
149    }
150
151    @Override
152    protected void beforeWaitOnSafePoint() {
153      if (throwException) {
154        LOG.info("COUNTDOWN");
155        // Don't countdown latch until someone waiting on it otherwise, the above
156        // afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll
157        // be stuck; test won't go down
158        while (this.latch.getCount() <= 0)
159          Threads.sleep(1);
160        this.latch.countDown();
161      }
162    }
163
164    @Override
165    protected Writer createWriterInstance(Path path) throws IOException {
166      final Writer w = super.createWriterInstance(path);
167      return new Writer() {
168        @Override
169        public void close() throws IOException {
170          w.close();
171        }
172
173        @Override
174        public void sync(boolean forceSync) throws IOException {
175          if (throwException) {
176            throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
177          }
178          w.sync(forceSync);
179        }
180
181        @Override
182        public void append(Entry entry) throws IOException {
183          if (throwException) {
184            throw new IOException("FAKE! Failed to replace a bad datanode...APPEND");
185          }
186          w.append(entry);
187        }
188
189        @Override
190        public long getLength() {
191          return w.getLength();
192        }
193
194        @Override
195        public long getSyncedLength() {
196          return w.getSyncedLength();
197        }
198      };
199    }
200  }
201
202  /**
203   * Reproduce locking up that happens when we get an inopportune sync during setup for
204   * zigzaglatch wait. See HBASE-14317. If below is broken, we will see this test timeout because
205   * it is locked up.
206   * <p>First I need to set up some mocks for Server and RegionServerServices. I also need to
207   * set up a dodgy WAL that will throw an exception when we go to append to it.
208   */
209  @Test
210  public void testLockupWhenSyncInMiddleOfZigZagSetup() throws IOException {
211    // Mocked up server and regionserver services. Needed below.
212    RegionServerServices services = Mockito.mock(RegionServerServices.class);
213    Mockito.when(services.getConfiguration()).thenReturn(CONF);
214    Mockito.when(services.isStopped()).thenReturn(false);
215    Mockito.when(services.isAborted()).thenReturn(false);
216
217    // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
218    FileSystem fs = FileSystem.get(CONF);
219    Path rootDir = new Path(dir + getName());
220    DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
221    dodgyWAL.init();
222    Path originalWAL = dodgyWAL.getCurrentFileName();
223    // I need a log roller running.
224    LogRoller logRoller = new LogRoller(services);
225    logRoller.addWAL(dodgyWAL);
226    // There is no 'stop' once a logRoller is running.. it just dies.
227    logRoller.start();
228    // Now get a region and start adding in edits.
229    final HRegion region = initHRegion(tableName, null, null, CONF, dodgyWAL);
230    byte [] bytes = Bytes.toBytes(getName());
231    NavigableMap<byte[], Integer> scopes = new TreeMap<>(
232        Bytes.BYTES_COMPARATOR);
233    scopes.put(COLUMN_FAMILY_BYTES, 0);
234    MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl();
235    try {
236      // First get something into memstore. Make a Put and then pull the Cell out of it. Will
237      // manage append and sync carefully in below to manufacture hang. We keep adding same
238      // edit. WAL subsystem doesn't care.
239      Put put = new Put(bytes);
240      put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes);
241      WALKeyImpl key = new WALKeyImpl(region.getRegionInfo().getEncodedNameAsBytes(),
242        TableName.META_TABLE_NAME, System.currentTimeMillis(), mvcc, scopes);
243      WALEdit edit = new WALEdit();
244      CellScanner CellScanner = put.cellScanner();
245      assertTrue(CellScanner.advance());
246      edit.add(CellScanner.current());
247      // Put something in memstore and out in the WAL. Do a big number of appends so we push
248      // out other side of the ringbuffer. If small numbers, stuff doesn't make it to WAL
249      for (int i = 0; i < 1000; i++) {
250        region.put(put);
251      }
252      // Set it so we start throwing exceptions.
253      LOG.info("SET throwing of exception on append");
254      dodgyWAL.throwException = true;
255      // This append provokes a WAL roll request
256      dodgyWAL.appendData(region.getRegionInfo(), key, edit);
257      boolean exception = false;
258      try {
259        dodgyWAL.sync(false);
260      } catch (Exception e) {
261        exception = true;
262      }
263      assertTrue("Did not get sync exception", exception);
264
265      // Get a memstore flush going too so we have same hung profile as up in the issue over
266      // in HBASE-14317. Flush hangs trying to get sequenceid because the ringbuffer is held up
267      // by the zigzaglatch waiting on syncs to come home.
268      Thread t = new Thread ("Flusher") {
269        @Override
270        public void run() {
271          try {
272            if (region.getMemStoreDataSize() <= 0) {
273              throw new IOException("memstore size=" + region.getMemStoreDataSize());
274            }
275            region.flush(false);
276          } catch (IOException e) {
277            // Can fail trying to flush in middle of a roll. Not a failure. Will succeed later
278            // when roll completes.
279            LOG.info("In flush", e);
280          }
281          LOG.info("Exiting");
282        };
283      };
284      t.setDaemon(true);
285      t.start();
286      // Wait until
287      while (dodgyWAL.latch.getCount() > 0) {
288        Threads.sleep(1);
289      }
290      // Now assert I got a new WAL file put in place even though loads of errors above.
291      assertTrue(originalWAL != dodgyWAL.getCurrentFileName());
292      // Can I append to it?
293      dodgyWAL.throwException = false;
294      try {
295        region.put(put);
296      } catch (Exception e) {
297        LOG.info("In the put", e);
298      }
299    } finally {
300      // To stop logRoller, its server has to say it is stopped.
301      Mockito.when(services.isStopped()).thenReturn(true);
302      Closeables.close(logRoller, true);
303      try {
304        if (region != null) {
305          region.close();
306        }
307        if (dodgyWAL != null) {
308          dodgyWAL.close();
309        }
310      } catch (Exception e) {
311        LOG.info("On way out", e);
312      }
313    }
314  }
315
316  /**
317   *
318   * If below is broken, we will see this test timeout because RingBufferEventHandler was stuck in
319   * attainSafePoint. Everyone will wait for sync to finish forever. See HBASE-14317.
320   */
321  @Test
322  public void testRingBufferEventHandlerStuckWhenSyncFailed()
323    throws IOException, InterruptedException {
324
325    // A WAL that we can have throw exceptions and slow FSHLog.replaceWriter down
326    class DodgyFSLog extends FSHLog {
327
328      private volatile boolean zigZagCreated = false;
329
330      public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
331        throws IOException {
332        super(fs, root, logDir, conf);
333      }
334
335      @Override
336      protected void afterCreatingZigZagLatch() {
337        zigZagCreated = true;
338        // Sleep a while to wait for RingBufferEventHandler to get stuck first.
339        try {
340          Thread.sleep(3000);
341        } catch (InterruptedException ignore) {
342        }
343      }
344
345      @Override
346      protected long getSequenceOnRingBuffer() {
347        return super.getSequenceOnRingBuffer();
348      }
349
350      protected void publishSyncOnRingBufferAndBlock(long sequence) {
351        try {
352          super.blockOnSync(super.publishSyncOnRingBuffer(sequence, false));
353          Assert.fail("Expect an IOException here.");
354        } catch (IOException ignore) {
355          // Here, we will get an IOException.
356        }
357      }
358
359      @Override
360      protected Writer createWriterInstance(Path path) throws IOException {
361        final Writer w = super.createWriterInstance(path);
362        return new Writer() {
363          @Override
364          public void close() throws IOException {
365            w.close();
366          }
367
368          @Override
369          public void sync(boolean forceSync) throws IOException {
370            throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
371          }
372
373          @Override
374          public void append(Entry entry) throws IOException {
375            w.append(entry);
376          }
377
378          @Override
379          public long getLength() {
380            return w.getLength();
381          }
382
383          @Override
384          public long getSyncedLength() {
385            return w.getSyncedLength();
386          }
387        };
388      }
389    }
390
391    // Mocked up server and regionserver services. Needed below.
392    RegionServerServices services = Mockito.mock(RegionServerServices.class);
393    Mockito.when(services.getConfiguration()).thenReturn(CONF);
394    Mockito.when(services.isStopped()).thenReturn(false);
395    Mockito.when(services.isAborted()).thenReturn(false);
396
397    // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
398    FileSystem fs = FileSystem.get(CONF);
399    Path rootDir = new Path(dir + getName());
400    final DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
401    dodgyWAL.init();
402    // I need a log roller running.
403    LogRoller logRoller = new LogRoller(services);
404    logRoller.addWAL(dodgyWAL);
405    // There is no 'stop' once a logRoller is running.. it just dies.
406    logRoller.start();
407
408    try {
409      final long seqForSync = dodgyWAL.getSequenceOnRingBuffer();
410
411      // This call provokes a WAL roll, and we will get a new RingBufferEventHandler.ZigZagLatch
412      // in LogRoller.
413      // After creating ZigZagLatch, RingBufferEventHandler would get stuck due to sync event,
414      // as long as HBASE-14317 hasn't be fixed.
415      LOG.info("Trigger log roll for creating a ZigZagLatch.");
416      logRoller.requestRollAll();
417
418      while (!dodgyWAL.zigZagCreated) {
419        Thread.sleep(10);
420      }
421
422      // Send a sync event for RingBufferEventHandler,
423      // and it gets blocked in RingBufferEventHandler.attainSafePoint
424      LOG.info("Send sync for RingBufferEventHandler");
425      Thread syncThread = new Thread() {
426        @Override
427        public void run() {
428          dodgyWAL.publishSyncOnRingBufferAndBlock(seqForSync);
429        }
430      };
431      // Sync in another thread to avoid reset SyncFuture again.
432      syncThread.start();
433      syncThread.join();
434
435      try {
436        LOG.info("Call sync for testing whether RingBufferEventHandler is hanging.");
437        dodgyWAL.sync(false); // Should not get a hang here, otherwise we will see timeout in this test.
438        Assert.fail("Expect an IOException here.");
439      } catch (IOException ignore) {
440      }
441
442    } finally {
443      // To stop logRoller, its server has to say it is stopped.
444      Mockito.when(services.isStopped()).thenReturn(true);
445      if (logRoller != null) {
446        logRoller.close();
447      }
448      if (dodgyWAL != null) {
449        dodgyWAL.close();
450      }
451    }
452  }
453
454
455  static class DummyServer implements Server {
456    private Configuration conf;
457    private String serverName;
458    private boolean isAborted = false;
459
460    public DummyServer(Configuration conf, String serverName) {
461      this.conf = conf;
462      this.serverName = serverName;
463    }
464
465    @Override
466    public Configuration getConfiguration() {
467      return conf;
468    }
469
470    @Override
471    public ZKWatcher getZooKeeper() {
472      return null;
473    }
474
475    @Override
476    public CoordinatedStateManager getCoordinatedStateManager() {
477      return null;
478    }
479
480    @Override
481    public ClusterConnection getConnection() {
482      return null;
483    }
484
485    @Override
486    public ServerName getServerName() {
487      return ServerName.valueOf(this.serverName);
488    }
489
490    @Override
491    public void abort(String why, Throwable e) {
492      LOG.info("Aborting " + serverName);
493      this.isAborted = true;
494    }
495
496    @Override
497    public boolean isAborted() {
498      return this.isAborted;
499    }
500
501    @Override
502    public void stop(String why) {
503      this.isAborted = true;
504    }
505
506    @Override
507    public boolean isStopped() {
508      return this.isAborted;
509    }
510
511    @Override
512    public ChoreService getChoreService() {
513      return null;
514    }
515
516    @Override
517    public ClusterConnection getClusterConnection() {
518      return null;
519    }
520
521    @Override
522    public FileSystem getFileSystem() {
523      return null;
524    }
525
526    @Override
527    public boolean isStopping() {
528      return false;
529    }
530
531    @Override
532    public Connection createConnection(Configuration conf) throws IOException {
533      return null;
534    }
535  }
536
537  static class DummyWALActionsListener implements WALActionsListener {
538
539    @Override
540    public void visitLogEntryBeforeWrite(WALKey logKey, WALEdit logEdit)
541        throws IOException {
542      if (logKey.getTableName().getNameAsString().equalsIgnoreCase("sleep")) {
543        try {
544          Thread.sleep(1000);
545        } catch (InterruptedException e) {
546          e.printStackTrace();
547        }
548      }
549      if (logKey.getTableName().getNameAsString()
550          .equalsIgnoreCase("DamagedWALException")) {
551        throw new DamagedWALException("Failed appending");
552      }
553    }
554
555  }
556
557  /**
558   * @return A region on which you must call {@link HBaseTestingUtility#closeRegionAndWAL(HRegion)}
559   *         when done.
560   */
561  private static HRegion initHRegion(TableName tableName, byte[] startKey, byte[] stopKey,
562      Configuration conf, WAL wal) throws IOException {
563    ChunkCreator.initialize(MemStoreLAB.CHUNK_SIZE_DEFAULT, false, 0, 0,
564      0, null, MemStoreLAB.INDEX_CHUNK_SIZE_PERCENTAGE_DEFAULT);
565    return TEST_UTIL.createLocalHRegion(tableName, startKey, stopKey, conf, false,
566      Durability.SYNC_WAL, wal, COLUMN_FAMILY_BYTES);
567  }
568}