001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.regionserver.wal;
019
020import java.io.FileNotFoundException;
021import java.io.IOException;
022import java.util.NavigableMap;
023import java.util.TreeMap;
024import org.apache.hadoop.conf.Configuration;
025import org.apache.hadoop.fs.FileSystem;
026import org.apache.hadoop.fs.Path;
027import org.apache.hadoop.hbase.HBaseClassTestRule;
028import org.apache.hadoop.hbase.HBaseTestingUtility;
029import org.apache.hadoop.hbase.HConstants;
030import org.apache.hadoop.hbase.KeyValue;
031import org.apache.hadoop.hbase.MiniHBaseCluster;
032import org.apache.hadoop.hbase.ServerName;
033import org.apache.hadoop.hbase.TableName;
034import org.apache.hadoop.hbase.client.Admin;
035import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
036import org.apache.hadoop.hbase.client.Put;
037import org.apache.hadoop.hbase.client.RegionInfo;
038import org.apache.hadoop.hbase.client.RegionInfoBuilder;
039import org.apache.hadoop.hbase.client.Table;
040import org.apache.hadoop.hbase.client.TableDescriptor;
041import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
042import org.apache.hadoop.hbase.log.HBaseMarkers;
043import org.apache.hadoop.hbase.regionserver.HRegionServer;
044import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl;
045import org.apache.hadoop.hbase.testclassification.MediumTests;
046import org.apache.hadoop.hbase.testclassification.RegionServerTests;
047import org.apache.hadoop.hbase.util.Bytes;
048import org.apache.hadoop.hbase.util.CommonFSUtils;
049import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
050import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
051import org.apache.hadoop.hbase.wal.WAL;
052import org.apache.hadoop.hbase.wal.WALEdit;
053import org.apache.hadoop.hbase.wal.WALFactory;
054import org.apache.hadoop.hbase.wal.WALKeyImpl;
055import org.apache.hadoop.hbase.wal.WALSplitter;
056import org.apache.hadoop.hdfs.MiniDFSCluster;
057import org.junit.After;
058import org.junit.Assert;
059import org.junit.Before;
060import org.junit.BeforeClass;
061import org.junit.ClassRule;
062import org.junit.Test;
063import org.junit.experimental.categories.Category;
064import org.slf4j.Logger;
065import org.slf4j.LoggerFactory;
066
067/**
068 * Tests for conditions that should trigger RegionServer aborts when rolling the current WAL fails.
069 */
070@Category({ RegionServerTests.class, MediumTests.class })
071public class TestLogRollAbort {
072
073  @ClassRule
074  public static final HBaseClassTestRule CLASS_RULE =
075    HBaseClassTestRule.forClass(TestLogRollAbort.class);
076
077  private static final Logger LOG = LoggerFactory.getLogger(AbstractTestLogRolling.class);
078  private static MiniDFSCluster dfsCluster;
079  private static Admin admin;
080  private static MiniHBaseCluster cluster;
081  protected final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
082
083  /* For the split-then-roll test */
084  private static final Path HBASEDIR = new Path("/hbase");
085  private static final Path HBASELOGDIR = new Path("/hbaselog");
086  private static final Path OLDLOGDIR = new Path(HBASELOGDIR, HConstants.HREGION_OLDLOGDIR_NAME);
087
088  // Need to override this setup so we can edit the config before it gets sent
089  // to the HDFS & HBase cluster startup.
090  @BeforeClass
091  public static void setUpBeforeClass() throws Exception {
092    // Tweak default timeout values down for faster recovery
093    TEST_UTIL.getConfiguration().setInt("hbase.regionserver.logroll.errors.tolerated", 2);
094    TEST_UTIL.getConfiguration().setInt("hbase.rpc.timeout", 10 * 1000);
095
096    // Increase the amount of time between client retries
097    TEST_UTIL.getConfiguration().setLong("hbase.client.pause", 5 * 1000);
098
099    // lower the namenode & datanode heartbeat so the namenode
100    // quickly detects datanode failures
101    TEST_UTIL.getConfiguration().setInt("dfs.namenode.heartbeat.recheck-interval", 5000);
102    TEST_UTIL.getConfiguration().setInt("dfs.heartbeat.interval", 1);
103    // the namenode might still try to choose the recently-dead datanode
104    // for a pipeline, so try to a new pipeline multiple times
105    TEST_UTIL.getConfiguration().setInt("dfs.client.block.write.retries", 10);
106    TEST_UTIL.getConfiguration().set(WALFactory.WAL_PROVIDER, "filesystem");
107  }
108
109  private Configuration conf;
110  private FileSystem fs;
111
112  @Before
113  public void setUp() throws Exception {
114    TEST_UTIL.startMiniCluster(2);
115
116    cluster = TEST_UTIL.getHBaseCluster();
117    dfsCluster = TEST_UTIL.getDFSCluster();
118    admin = TEST_UTIL.getAdmin();
119    conf = TEST_UTIL.getConfiguration();
120    fs = TEST_UTIL.getDFSCluster().getFileSystem();
121
122    // disable region rebalancing (interferes with log watching)
123    cluster.getMaster().balanceSwitch(false);
124    CommonFSUtils.setRootDir(conf, HBASEDIR);
125    CommonFSUtils.setWALRootDir(conf, HBASELOGDIR);
126  }
127
128  @After
129  public void tearDown() throws Exception {
130    TEST_UTIL.shutdownMiniCluster();
131  }
132
133  /**
134   * Tests that RegionServer aborts if we hit an error closing the WAL when there are unsynced WAL
135   * edits. See HBASE-4282.
136   */
137  @Test
138  public void testRSAbortWithUnflushedEdits() throws Exception {
139    LOG.info("Starting testRSAbortWithUnflushedEdits()");
140
141    // When the hbase:meta table can be opened, the region servers are running
142    TEST_UTIL.getConnection().getTable(TableName.META_TABLE_NAME).close();
143
144    // Create the test table and open it
145    TableName tableName = TableName.valueOf(this.getClass().getSimpleName());
146    TableDescriptor desc = TableDescriptorBuilder.newBuilder(tableName)
147      .setColumnFamily(ColumnFamilyDescriptorBuilder.of(HConstants.CATALOG_FAMILY)).build();
148
149    admin.createTable(desc);
150    Table table = TEST_UTIL.getConnection().getTable(tableName);
151    try {
152      HRegionServer server = TEST_UTIL.getRSForFirstRegionInTable(tableName);
153      WAL log = server.getWAL(null);
154
155      Put p = new Put(Bytes.toBytes("row2001"));
156      p.addColumn(HConstants.CATALOG_FAMILY, Bytes.toBytes("col"), Bytes.toBytes(2001));
157      table.put(p);
158
159      log.sync();
160
161      p = new Put(Bytes.toBytes("row2002"));
162      p.addColumn(HConstants.CATALOG_FAMILY, Bytes.toBytes("col"), Bytes.toBytes(2002));
163      table.put(p);
164
165      dfsCluster.restartDataNodes();
166      LOG.info("Restarted datanodes");
167
168      try {
169        log.rollWriter(true);
170      } catch (FailedLogCloseException flce) {
171        // Expected exception. We used to expect that there would be unsynced appends but this
172        // not reliable now that sync plays a roll in wall rolling. The above puts also now call
173        // sync.
174      } catch (Throwable t) {
175        LOG.error(HBaseMarkers.FATAL, "FAILED TEST: Got wrong exception", t);
176      }
177    } finally {
178      table.close();
179    }
180  }
181
182  /**
183   * Tests the case where a RegionServer enters a GC pause, comes back online after the master
184   * declared it dead and started to split. Want log rolling after a master split to fail. See
185   * HBASE-2312.
186   */
187  @Test
188  public void testLogRollAfterSplitStart() throws IOException {
189    LOG.info("Verify wal roll after split starts will fail.");
190    String logName =
191      ServerName.valueOf("testLogRollAfterSplitStart", 16010, EnvironmentEdgeManager.currentTime())
192        .toString();
193    Path thisTestsDir = new Path(HBASELOGDIR, AbstractFSWALProvider.getWALDirectoryName(logName));
194    final WALFactory wals = new WALFactory(conf, logName);
195
196    try {
197      // put some entries in an WAL
198      TableName tableName = TableName.valueOf(this.getClass().getName());
199      RegionInfo regionInfo = RegionInfoBuilder.newBuilder(tableName).build();
200      WAL log = wals.getWAL(regionInfo);
201      MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl(1);
202
203      int total = 20;
204      for (int i = 0; i < total; i++) {
205        WALEdit kvs = new WALEdit();
206        kvs.add(new KeyValue(Bytes.toBytes(i), tableName.getName(), tableName.getName()));
207        NavigableMap<byte[], Integer> scopes = new TreeMap<>(Bytes.BYTES_COMPARATOR);
208        scopes.put(Bytes.toBytes("column"), 0);
209        log.appendData(regionInfo, new WALKeyImpl(regionInfo.getEncodedNameAsBytes(), tableName,
210          EnvironmentEdgeManager.currentTime(), mvcc, scopes), kvs);
211      }
212      // Send the data to HDFS datanodes and close the HDFS writer
213      log.sync();
214      ((AbstractFSWAL<?>) log).replaceWriter(((FSHLog) log).getOldPath(), null, null);
215
216      // code taken from MasterFileSystem.getLogDirs(), which is called from
217      // MasterFileSystem.splitLog() handles RS shutdowns (as observed by the splitting process)
218      // rename the directory so a rogue RS doesn't create more WALs
219      Path rsSplitDir = thisTestsDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
220      if (!fs.rename(thisTestsDir, rsSplitDir)) {
221        throw new IOException("Failed fs.rename for log split: " + thisTestsDir);
222      }
223      LOG.debug("Renamed region directory: " + rsSplitDir);
224
225      LOG.debug("Processing the old log files.");
226      WALSplitter.split(HBASELOGDIR, rsSplitDir, OLDLOGDIR, fs, conf, wals);
227
228      LOG.debug("Trying to roll the WAL.");
229      try {
230        log.rollWriter();
231        Assert.fail("rollWriter() did not throw any exception.");
232      } catch (IOException ioe) {
233        if (ioe.getCause() instanceof FileNotFoundException) {
234          LOG.info("Got the expected exception: ", ioe.getCause());
235        } else {
236          Assert.fail("Unexpected exception: " + ioe);
237        }
238      }
239    } finally {
240      wals.close();
241      if (fs.exists(thisTestsDir)) {
242        fs.delete(thisTestsDir, true);
243      }
244    }
245  }
246}