001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.regionserver.wal;
019
020import static org.junit.jupiter.api.Assertions.assertThrows;
021import static org.junit.jupiter.api.Assertions.assertTrue;
022
023import java.io.FileNotFoundException;
024import java.io.IOException;
025import java.util.NavigableMap;
026import java.util.TreeMap;
027import java.util.stream.Stream;
028import org.apache.hadoop.conf.Configuration;
029import org.apache.hadoop.fs.FileSystem;
030import org.apache.hadoop.fs.Path;
031import org.apache.hadoop.hbase.HBaseParameterizedTestTemplate;
032import org.apache.hadoop.hbase.HBaseTestingUtil;
033import org.apache.hadoop.hbase.HConstants;
034import org.apache.hadoop.hbase.KeyValue;
035import org.apache.hadoop.hbase.ServerName;
036import org.apache.hadoop.hbase.SingleProcessHBaseCluster;
037import org.apache.hadoop.hbase.TableName;
038import org.apache.hadoop.hbase.client.Admin;
039import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
040import org.apache.hadoop.hbase.client.Put;
041import org.apache.hadoop.hbase.client.RegionInfo;
042import org.apache.hadoop.hbase.client.RegionInfoBuilder;
043import org.apache.hadoop.hbase.client.Table;
044import org.apache.hadoop.hbase.client.TableDescriptor;
045import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
046import org.apache.hadoop.hbase.log.HBaseMarkers;
047import org.apache.hadoop.hbase.regionserver.HRegionServer;
048import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl;
049import org.apache.hadoop.hbase.testclassification.MediumTests;
050import org.apache.hadoop.hbase.testclassification.RegionServerTests;
051import org.apache.hadoop.hbase.util.Bytes;
052import org.apache.hadoop.hbase.util.CommonFSUtils;
053import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
054import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
055import org.apache.hadoop.hbase.wal.WAL;
056import org.apache.hadoop.hbase.wal.WALEdit;
057import org.apache.hadoop.hbase.wal.WALEditInternalHelper;
058import org.apache.hadoop.hbase.wal.WALFactory;
059import org.apache.hadoop.hbase.wal.WALKeyImpl;
060import org.apache.hadoop.hbase.wal.WALProvider;
061import org.apache.hadoop.hbase.wal.WALSplitter;
062import org.apache.hadoop.hdfs.MiniDFSCluster;
063import org.apache.hadoop.ipc.RemoteException;
064import org.junit.jupiter.api.AfterEach;
065import org.junit.jupiter.api.BeforeAll;
066import org.junit.jupiter.api.BeforeEach;
067import org.junit.jupiter.api.Tag;
068import org.junit.jupiter.api.TestTemplate;
069import org.junit.jupiter.params.provider.Arguments;
070import org.slf4j.Logger;
071import org.slf4j.LoggerFactory;
072
073import org.apache.hbase.thirdparty.com.google.common.base.Throwables;
074
075/**
076 * Tests for conditions that should trigger RegionServer aborts when rolling the current WAL fails.
077 */
078@Tag(RegionServerTests.TAG)
079@Tag(MediumTests.TAG)
080@HBaseParameterizedTestTemplate(name = "{index}: walProvider={0}")
081public class TestLogRollAbort {
082
083  private static final Logger LOG = LoggerFactory.getLogger(TestLogRollAbort.class);
084  private static MiniDFSCluster dfsCluster;
085  private static Admin admin;
086  private static SingleProcessHBaseCluster cluster;
087  protected final static HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
088
089  /* For the split-then-roll test */
090  private static final Path HBASEDIR = new Path("/hbase");
091  private static final Path HBASELOGDIR = new Path("/hbaselog");
092  private static final Path OLDLOGDIR = new Path(HBASELOGDIR, HConstants.HREGION_OLDLOGDIR_NAME);
093
094  // Need to override this setup so we can edit the config before it gets sent
095  // to the HDFS & HBase cluster startup.
096  @BeforeAll
097  public static void setUpBeforeClass() throws Exception {
098    // Tweak default timeout values down for faster recovery
099    TEST_UTIL.getConfiguration().setInt("hbase.regionserver.logroll.errors.tolerated", 2);
100    TEST_UTIL.getConfiguration().setInt("hbase.rpc.timeout", 10 * 1000);
101
102    // Increase the amount of time between client retries
103    TEST_UTIL.getConfiguration().setLong("hbase.client.pause", 5 * 1000);
104
105    // lower the namenode & datanode heartbeat so the namenode
106    // quickly detects datanode failures
107    TEST_UTIL.getConfiguration().setInt("dfs.namenode.heartbeat.recheck-interval", 5000);
108    TEST_UTIL.getConfiguration().setInt("dfs.heartbeat.interval", 1);
109    // the namenode might still try to choose the recently-dead datanode
110    // for a pipeline, so try to a new pipeline multiple times
111    TEST_UTIL.getConfiguration().setInt("dfs.client.block.write.retries", 10);
112    TEST_UTIL.getConfiguration().set(WALFactory.WAL_PROVIDER, "asyncfs");
113  }
114
115  public static Stream<Arguments> parameters() {
116    return Stream.of(Arguments.of("filesystem"), Arguments.of("asyncfs"));
117  }
118
119  private Configuration conf;
120  private FileSystem fs;
121
122  private final String walProvider;
123
124  public TestLogRollAbort(String walProvider) {
125    this.walProvider = walProvider;
126  }
127
128  @BeforeEach
129  public void setUp() throws Exception {
130    TEST_UTIL.getConfiguration().set(WALFactory.WAL_PROVIDER, walProvider);
131    TEST_UTIL.startMiniCluster(2);
132
133    cluster = TEST_UTIL.getHBaseCluster();
134    dfsCluster = TEST_UTIL.getDFSCluster();
135    admin = TEST_UTIL.getAdmin();
136    conf = TEST_UTIL.getConfiguration();
137    fs = TEST_UTIL.getDFSCluster().getFileSystem();
138
139    // disable region rebalancing (interferes with log watching)
140    cluster.getMaster().balanceSwitch(false);
141    CommonFSUtils.setRootDir(conf, HBASEDIR);
142    CommonFSUtils.setWALRootDir(conf, HBASELOGDIR);
143  }
144
145  @AfterEach
146  public void tearDown() throws Exception {
147    TEST_UTIL.shutdownMiniCluster();
148  }
149
150  /**
151   * Tests that RegionServer aborts if we hit an error closing the WAL when there are unsynced WAL
152   * edits. See HBASE-4282.
153   */
154  @TestTemplate
155  public void testRSAbortWithUnflushedEdits() throws Exception {
156    LOG.info("Starting testRSAbortWithUnflushedEdits()");
157
158    // When the hbase:meta table can be opened, the region servers are running
159    TEST_UTIL.getConnection().getTable(TableName.META_TABLE_NAME).close();
160
161    // Create the test table and open it
162    TableName tableName = TableName.valueOf(this.getClass().getSimpleName());
163    TableDescriptor desc = TableDescriptorBuilder.newBuilder(tableName)
164      .setColumnFamily(ColumnFamilyDescriptorBuilder.of(HConstants.CATALOG_FAMILY)).build();
165
166    admin.createTable(desc);
167    Table table = TEST_UTIL.getConnection().getTable(tableName);
168    try {
169      HRegionServer server = TEST_UTIL.getRSForFirstRegionInTable(tableName);
170      WAL log = server.getWAL(null);
171
172      Put p = new Put(Bytes.toBytes("row2001"));
173      p.addColumn(HConstants.CATALOG_FAMILY, Bytes.toBytes("col"), Bytes.toBytes(2001));
174      table.put(p);
175
176      log.sync();
177
178      p = new Put(Bytes.toBytes("row2002"));
179      p.addColumn(HConstants.CATALOG_FAMILY, Bytes.toBytes("col"), Bytes.toBytes(2002));
180      table.put(p);
181
182      dfsCluster.restartDataNodes();
183      LOG.info("Restarted datanodes");
184
185      try {
186        log.rollWriter(true);
187      } catch (FailedLogCloseException flce) {
188        // Expected exception. We used to expect that there would be unsynced appends but this
189        // not reliable now that sync plays a roll in wall rolling. The above puts also now call
190        // sync.
191      } catch (Throwable t) {
192        LOG.error(HBaseMarkers.FATAL, "FAILED TEST: Got wrong exception", t);
193      }
194    } finally {
195      table.close();
196    }
197  }
198
199  /**
200   * Tests the case where a RegionServer enters a GC pause, comes back online after the master
201   * declared it dead and started to split. Want log rolling after a master split to fail. See
202   * HBASE-2312.
203   */
204  @TestTemplate
205  public void testLogRollAfterSplitStart() throws IOException {
206    LOG.info("Verify wal roll after split starts will fail.");
207    String logName =
208      ServerName.valueOf("testLogRollAfterSplitStart", 16010, EnvironmentEdgeManager.currentTime())
209        .toString();
210    Path thisTestsDir = new Path(HBASELOGDIR, AbstractFSWALProvider.getWALDirectoryName(logName));
211    final WALFactory wals = new WALFactory(conf, logName);
212
213    try {
214      // put some entries in an WAL
215      TableName tableName = TableName.valueOf(this.getClass().getName());
216      RegionInfo regionInfo = RegionInfoBuilder.newBuilder(tableName).build();
217      WAL log = wals.getWAL(regionInfo);
218      MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl(1);
219
220      int total = 20;
221      for (int i = 0; i < total; i++) {
222        WALEdit kvs = new WALEdit();
223        WALEditInternalHelper.addExtendedCell(kvs,
224          new KeyValue(Bytes.toBytes(i), tableName.getName(), tableName.getName()));
225        NavigableMap<byte[], Integer> scopes = new TreeMap<>(Bytes.BYTES_COMPARATOR);
226        scopes.put(Bytes.toBytes("column"), 0);
227        log.appendData(regionInfo, new WALKeyImpl(regionInfo.getEncodedNameAsBytes(), tableName,
228          EnvironmentEdgeManager.currentTime(), mvcc, scopes), kvs);
229      }
230      // Send the data to HDFS datanodes and close the HDFS writer
231      log.sync();
232      closeWriter((AbstractFSWAL<?>) log);
233
234      // code taken from MasterFileSystem.getLogDirs(), which is called from
235      // MasterFileSystem.splitLog() handles RS shutdowns (as observed by the splitting process)
236      // rename the directory so a rogue RS doesn't create more WALs
237      Path rsSplitDir = thisTestsDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
238      if (!fs.rename(thisTestsDir, rsSplitDir)) {
239        throw new IOException("Failed fs.rename for log split: " + thisTestsDir);
240      }
241      LOG.debug("Renamed region directory: " + rsSplitDir);
242
243      LOG.debug("Processing the old log files.");
244      WALSplitter.split(HBASELOGDIR, rsSplitDir, OLDLOGDIR, fs, conf, wals);
245
246      LOG.debug("Trying to roll the WAL.");
247      IOException error = assertThrows(IOException.class, () -> log.rollWriter());
248      if (error instanceof RemoteException) {
249        error = ((RemoteException) error).unwrapRemoteException();
250      }
251      assertTrue(
252        error instanceof FileNotFoundException || error.getCause() instanceof FileNotFoundException,
253        "unexpected error: " + Throwables.getStackTraceAsString(error));
254    } finally {
255      wals.close();
256      if (fs.exists(thisTestsDir)) {
257        fs.delete(thisTestsDir, true);
258      }
259    }
260  }
261
262  private <W extends WALProvider.WriterBase> void closeWriter(AbstractFSWAL<W> wal) {
263    wal.waitForSafePoint();
264    long oldFileLen = wal.writer.getLength();
265    wal.closeWriter(wal.writer, wal.getOldPath());
266    wal.logRollAndSetupWalProps(wal.getOldPath(), null, oldFileLen);
267    wal.writer = null;
268    wal.onWriterReplaced(null);
269    wal.rollRequested.set(false);
270  }
271}