/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.regionserver.wal;

import static org.junit.Assert.assertThrows;
import static org.junit.Assert.assertTrue;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.NavigableMap;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtil;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.SingleProcessHBaseCluster;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.log.HBaseMarkers;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.testclassification.RegionServerTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
import org.apache.hadoop.hbase.wal.WAL;
import org.apache.hadoop.hbase.wal.WALEdit;
import org.apache.hadoop.hbase.wal.WALFactory;
import org.apache.hadoop.hbase.wal.WALKeyImpl;
import org.apache.hadoop.hbase.wal.WALProvider;
import org.apache.hadoop.hbase.wal.WALSplitter;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.ipc.RemoteException;
import org.junit.After;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameter;
import org.junit.runners.Parameterized.Parameters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.base.Throwables;

/**
 * Tests for conditions that should trigger RegionServer aborts when rolling the current WAL fails.
 */
@RunWith(Parameterized.class)
@Category({ RegionServerTests.class, MediumTests.class })
public class TestLogRollAbort {

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestLogRollAbort.class);
  private static final Logger LOG = LoggerFactory.getLogger(TestLogRollAbort.class);
  private static MiniDFSCluster dfsCluster;
  private static Admin admin;
  private static SingleProcessHBaseCluster cluster;
  protected final static HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();

  /* For the split-then-roll test */
  private static final Path HBASEDIR = new Path("/hbase");
  private static final Path HBASELOGDIR = new Path("/hbaselog");
  private static final Path OLDLOGDIR = new Path(HBASELOGDIR, HConstants.HREGION_OLDLOGDIR_NAME);
  // Need to override this setup so we can edit the config before it gets sent
  // to the HDFS & HBase cluster startup.
  @BeforeClass
  public static void setUpBeforeClass() throws Exception {
    // Tweak default timeout values down for faster recovery
    TEST_UTIL.getConfiguration().setInt("hbase.regionserver.logroll.errors.tolerated", 2);
    TEST_UTIL.getConfiguration().setInt("hbase.rpc.timeout", 10 * 1000);

    // Increase the amount of time between client retries
    TEST_UTIL.getConfiguration().setLong("hbase.client.pause", 5 * 1000);

    // lower the namenode & datanode heartbeat so the namenode
    // quickly detects datanode failures
    TEST_UTIL.getConfiguration().setInt("dfs.namenode.heartbeat.recheck-interval", 5000);
    TEST_UTIL.getConfiguration().setInt("dfs.heartbeat.interval", 1);
    // the namenode might still try to choose the recently-dead datanode
    // for a pipeline, so retry building a new pipeline multiple times
    TEST_UTIL.getConfiguration().setInt("dfs.client.block.write.retries", 10);
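    // Default WAL provider; the parameterized setUp() below overrides this for each run.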
    TEST_UTIL.getConfiguration().set(WALFactory.WAL_PROVIDER, "asyncfs");
  }

  @Parameters(name = "{index}: walProvider={0}")
  public static List<Object[]> params() {
    return Arrays.asList(new Object[] { "filesystem" }, new Object[] { "asyncfs" });
  }

  private Configuration conf;
  private FileSystem fs;

  @Parameter
  public String walProvider;

  @Before
  public void setUp() throws Exception {
    TEST_UTIL.getConfiguration().set(WALFactory.WAL_PROVIDER, walProvider);
    TEST_UTIL.startMiniCluster(2);

    cluster = TEST_UTIL.getHBaseCluster();
    dfsCluster = TEST_UTIL.getDFSCluster();
    admin = TEST_UTIL.getAdmin();
    conf = TEST_UTIL.getConfiguration();
    fs = TEST_UTIL.getDFSCluster().getFileSystem();

    // disable region rebalancing (interferes with log watching)
    cluster.getMaster().balanceSwitch(false);
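    // Point the root and WAL root dirs at the paths used by the split-then-roll test below.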
    CommonFSUtils.setRootDir(conf, HBASEDIR);
    CommonFSUtils.setWALRootDir(conf, HBASELOGDIR);
  }

  @After
  public void tearDown() throws Exception {
    TEST_UTIL.shutdownMiniCluster();
  }

  /**
   * Tests that RegionServer aborts if we hit an error closing the WAL when there are unsynced WAL
   * edits. See HBASE-4282.
   */
  @Test
  public void testRSAbortWithUnflushedEdits() throws Exception {
    LOG.info("Starting testRSAbortWithUnflushedEdits()");

    // When the hbase:meta table can be opened, the region servers are running
    TEST_UTIL.getConnection().getTable(TableName.META_TABLE_NAME).close();

    // Create the test table and open it
    TableName tableName = TableName.valueOf(this.getClass().getSimpleName());
    TableDescriptor desc = TableDescriptorBuilder.newBuilder(tableName)
      .setColumnFamily(ColumnFamilyDescriptorBuilder.of(HConstants.CATALOG_FAMILY)).build();

    admin.createTable(desc);
    Table table = TEST_UTIL.getConnection().getTable(tableName);
    try {
      HRegionServer server = TEST_UTIL.getRSForFirstRegionInTable(tableName);
      WAL log = server.getWAL(null);

      Put p = new Put(Bytes.toBytes("row2001"));
      p.addColumn(HConstants.CATALOG_FAMILY, Bytes.toBytes("col"), Bytes.toBytes(2001));
      table.put(p);

      log.sync();

      p = new Put(Bytes.toBytes("row2002"));
      p.addColumn(HConstants.CATALOG_FAMILY, Bytes.toBytes("col"), Bytes.toBytes(2002));
      table.put(p);

      dfsCluster.restartDataNodes();
      LOG.info("Restarted datanodes");

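      // With the datanodes restarted, the old WAL output stream is broken, so rolling the
      // writer is expected to fail when it tries to close the current file.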
      try {
        log.rollWriter(true);
      } catch (FailedLogCloseException flce) {
        // Expected exception. We used to expect that there would be unsynced appends, but that is
        // not reliable now that sync plays a role in WAL rolling; the puts above also call sync.
      } catch (Throwable t) {
        LOG.error(HBaseMarkers.FATAL, "FAILED TEST: Got wrong exception", t);
      }
    } finally {
      table.close();
    }
  }

  /**
   * Tests the case where a RegionServer enters a GC pause and comes back online after the master
   * has declared it dead and started splitting its WALs. Log rolling after the master split has
   * started should fail. See HBASE-2312.
   */
  @Test
  public void testLogRollAfterSplitStart() throws IOException {
    LOG.info("Verify that WAL rolling fails after a split has started.");
    String logName =
      ServerName.valueOf("testLogRollAfterSplitStart", 16010, EnvironmentEdgeManager.currentTime())
        .toString();
    Path thisTestsDir = new Path(HBASELOGDIR, AbstractFSWALProvider.getWALDirectoryName(logName));
    final WALFactory wals = new WALFactory(conf, logName);

    try {
      // put some entries in a WAL
      TableName tableName = TableName.valueOf(this.getClass().getName());
      RegionInfo regionInfo = RegionInfoBuilder.newBuilder(tableName).build();
      WAL log = wals.getWAL(regionInfo);
      MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl(1);

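      // Each appended edit below carries this MVCC on its WALKeyImpl so the WAL can assign
      // sequence ids to the entries.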
      int total = 20;
      for (int i = 0; i < total; i++) {
        WALEdit kvs = new WALEdit();
        kvs.add(new KeyValue(Bytes.toBytes(i), tableName.getName(), tableName.getName()));
        NavigableMap<byte[], Integer> scopes = new TreeMap<>(Bytes.BYTES_COMPARATOR);
        scopes.put(Bytes.toBytes("column"), 0);
        log.appendData(regionInfo, new WALKeyImpl(regionInfo.getEncodedNameAsBytes(), tableName,
          EnvironmentEdgeManager.currentTime(), mvcc, scopes), kvs);
      }
      // Send the data to HDFS datanodes and close the HDFS writer
      log.sync();
      closeWriter((AbstractFSWAL<?>) log);

      // Code taken from MasterFileSystem.getLogDirs(), which is called from
      // MasterFileSystem.splitLog() when it handles RS shutdowns (as observed by the splitting
      // process): rename the directory so a rogue RS doesn't create more WALs.
      Path rsSplitDir = thisTestsDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
      if (!fs.rename(thisTestsDir, rsSplitDir)) {
        throw new IOException("Failed fs.rename for log split: " + thisTestsDir);
      }
      LOG.debug("Renamed WAL directory: " + rsSplitDir);

      LOG.debug("Processing the old log files.");
      WALSplitter.split(HBASELOGDIR, rsSplitDir, OLDLOGDIR, fs, conf, wals);

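      // The split worked on the renamed directory and the original WAL directory is gone, so
      // rolling the WAL should fail because the new writer cannot be created under it.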
      LOG.debug("Trying to roll the WAL.");
      IOException error = assertThrows(IOException.class, () -> log.rollWriter());
      if (error instanceof RemoteException) {
        error = ((RemoteException) error).unwrapRemoteException();
      }
      assertTrue("unexpected error: " + Throwables.getStackTraceAsString(error),
        error instanceof FileNotFoundException
          || error.getCause() instanceof FileNotFoundException);
    } finally {
      wals.close();
      if (fs.exists(thisTestsDir)) {
        fs.delete(thisTestsDir, true);
      }
    }
  }

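  /**
   * Closes the WAL's current writer without opening a replacement: wait for a safe point, close
   * the underlying writer, record the roll in the WAL props, and clear the roll-requested flag.
   * This leaves the WAL with no active writer before the split/rename above.
   */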
  private <W extends WALProvider.WriterBase> void closeWriter(AbstractFSWAL<W> wal) {
    wal.waitForSafePoint();
    long oldFileLen = wal.writer.getLength();
    wal.closeWriter(wal.writer, wal.getOldPath());
    wal.logRollAndSetupWalProps(wal.getOldPath(), null, oldFileLen);
    wal.writer = null;
    wal.onWriterReplaced(null);
    wal.rollRequested.set(false);
  }
}