001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.regionserver.wal;
019
020import java.io.FileNotFoundException;
021import java.io.IOException;
022import java.util.NavigableMap;
023import java.util.TreeMap;
024import org.apache.hadoop.conf.Configuration;
025import org.apache.hadoop.fs.FileSystem;
026import org.apache.hadoop.fs.Path;
027import org.apache.hadoop.hbase.HBaseClassTestRule;
028import org.apache.hadoop.hbase.HBaseTestingUtility;
029import org.apache.hadoop.hbase.HConstants;
030import org.apache.hadoop.hbase.KeyValue;
031import org.apache.hadoop.hbase.MiniHBaseCluster;
032import org.apache.hadoop.hbase.ServerName;
033import org.apache.hadoop.hbase.TableName;
034import org.apache.hadoop.hbase.client.Admin;
035import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
036import org.apache.hadoop.hbase.client.Put;
037import org.apache.hadoop.hbase.client.RegionInfo;
038import org.apache.hadoop.hbase.client.RegionInfoBuilder;
039import org.apache.hadoop.hbase.client.Table;
040import org.apache.hadoop.hbase.client.TableDescriptor;
041import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
042import org.apache.hadoop.hbase.log.HBaseMarkers;
043import org.apache.hadoop.hbase.regionserver.HRegionServer;
044import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl;
045import org.apache.hadoop.hbase.testclassification.MediumTests;
046import org.apache.hadoop.hbase.testclassification.RegionServerTests;
047import org.apache.hadoop.hbase.util.Bytes;
048import org.apache.hadoop.hbase.util.CommonFSUtils;
049import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
050import org.apache.hadoop.hbase.wal.WAL;
051import org.apache.hadoop.hbase.wal.WALEdit;
052import org.apache.hadoop.hbase.wal.WALFactory;
053import org.apache.hadoop.hbase.wal.WALKeyImpl;
054import org.apache.hadoop.hbase.wal.WALSplitter;
055import org.apache.hadoop.hdfs.MiniDFSCluster;
056import org.junit.After;
057import org.junit.Assert;
058import org.junit.Before;
059import org.junit.BeforeClass;
060import org.junit.ClassRule;
061import org.junit.Test;
062import org.junit.experimental.categories.Category;
063import org.slf4j.Logger;
064import org.slf4j.LoggerFactory;
065
066/**
067 * Tests for conditions that should trigger RegionServer aborts when
068 * rolling the current WAL fails.
069 */
070@Category({RegionServerTests.class, MediumTests.class})
071public class TestLogRollAbort {
072
073  @ClassRule
074  public static final HBaseClassTestRule CLASS_RULE =
075      HBaseClassTestRule.forClass(TestLogRollAbort.class);
076
077  private static final Logger LOG = LoggerFactory.getLogger(AbstractTestLogRolling.class);
078  private static MiniDFSCluster dfsCluster;
079  private static Admin admin;
080  private static MiniHBaseCluster cluster;
081  protected final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
082
083  /* For the split-then-roll test */
084  private static final Path HBASEDIR = new Path("/hbase");
085  private static final Path HBASELOGDIR = new Path("/hbaselog");
086  private static final Path OLDLOGDIR = new Path(HBASELOGDIR, HConstants.HREGION_OLDLOGDIR_NAME);
087
088  // Need to override this setup so we can edit the config before it gets sent
089  // to the HDFS & HBase cluster startup.
090  @BeforeClass
091  public static void setUpBeforeClass() throws Exception {
092    // Tweak default timeout values down for faster recovery
093    TEST_UTIL.getConfiguration().setInt(
094        "hbase.regionserver.logroll.errors.tolerated", 2);
095    TEST_UTIL.getConfiguration().setInt("hbase.rpc.timeout", 10 * 1000);
096
097    // Increase the amount of time between client retries
098    TEST_UTIL.getConfiguration().setLong("hbase.client.pause", 5 * 1000);
099
100    // lower the namenode & datanode heartbeat so the namenode
101    // quickly detects datanode failures
102    TEST_UTIL.getConfiguration().setInt("dfs.namenode.heartbeat.recheck-interval", 5000);
103    TEST_UTIL.getConfiguration().setInt("dfs.heartbeat.interval", 1);
104    // the namenode might still try to choose the recently-dead datanode
105    // for a pipeline, so try to a new pipeline multiple times
106    TEST_UTIL.getConfiguration().setInt("dfs.client.block.write.retries", 10);
107    TEST_UTIL.getConfiguration().set(WALFactory.WAL_PROVIDER, "filesystem");
108  }
109
110  private Configuration conf;
111  private FileSystem fs;
112
113  @Before
114  public void setUp() throws Exception {
115    TEST_UTIL.startMiniCluster(2);
116
117    cluster = TEST_UTIL.getHBaseCluster();
118    dfsCluster = TEST_UTIL.getDFSCluster();
119    admin = TEST_UTIL.getAdmin();
120    conf = TEST_UTIL.getConfiguration();
121    fs = TEST_UTIL.getDFSCluster().getFileSystem();
122
123    // disable region rebalancing (interferes with log watching)
124    cluster.getMaster().balanceSwitch(false);
125    CommonFSUtils.setRootDir(conf, HBASEDIR);
126    CommonFSUtils.setWALRootDir(conf, HBASELOGDIR);
127  }
128
129  @After
130  public void tearDown() throws Exception {
131    TEST_UTIL.shutdownMiniCluster();
132  }
133
134  /**
135   * Tests that RegionServer aborts if we hit an error closing the WAL when
136   * there are unsynced WAL edits.  See HBASE-4282.
137   */
138  @Test
139  public void testRSAbortWithUnflushedEdits() throws Exception {
140    LOG.info("Starting testRSAbortWithUnflushedEdits()");
141
142    // When the hbase:meta table can be opened, the region servers are running
143    TEST_UTIL.getConnection().getTable(TableName.META_TABLE_NAME).close();
144
145    // Create the test table and open it
146    TableName tableName = TableName.valueOf(this.getClass().getSimpleName());
147    TableDescriptor desc = TableDescriptorBuilder.newBuilder(tableName)
148        .setColumnFamily(ColumnFamilyDescriptorBuilder.of(HConstants.CATALOG_FAMILY)).build();
149
150    admin.createTable(desc);
151    Table table = TEST_UTIL.getConnection().getTable(tableName);
152    try {
153      HRegionServer server = TEST_UTIL.getRSForFirstRegionInTable(tableName);
154      WAL log = server.getWAL(null);
155
156      Put p = new Put(Bytes.toBytes("row2001"));
157      p.addColumn(HConstants.CATALOG_FAMILY, Bytes.toBytes("col"), Bytes.toBytes(2001));
158      table.put(p);
159
160      log.sync();
161
162      p = new Put(Bytes.toBytes("row2002"));
163      p.addColumn(HConstants.CATALOG_FAMILY, Bytes.toBytes("col"), Bytes.toBytes(2002));
164      table.put(p);
165
166      dfsCluster.restartDataNodes();
167      LOG.info("Restarted datanodes");
168
169      try {
170        log.rollWriter(true);
171      } catch (FailedLogCloseException flce) {
172        // Expected exception.  We used to expect that there would be unsynced appends but this
173        // not reliable now that sync plays a roll in wall rolling.  The above puts also now call
174        // sync.
175      } catch (Throwable t) {
176        LOG.error(HBaseMarkers.FATAL, "FAILED TEST: Got wrong exception", t);
177      }
178    } finally {
179      table.close();
180    }
181  }
182
183  /**
184   * Tests the case where a RegionServer enters a GC pause,
185   * comes back online after the master declared it dead and started to split.
186   * Want log rolling after a master split to fail. See HBASE-2312.
187   */
188  @Test
189  public void testLogRollAfterSplitStart() throws IOException {
190    LOG.info("Verify wal roll after split starts will fail.");
191    String logName = ServerName.valueOf("testLogRollAfterSplitStart",
192        16010, System.currentTimeMillis()).toString();
193    Path thisTestsDir = new Path(HBASELOGDIR, AbstractFSWALProvider.getWALDirectoryName(logName));
194    final WALFactory wals = new WALFactory(conf, logName);
195
196    try {
197      // put some entries in an WAL
198      TableName tableName =
199          TableName.valueOf(this.getClass().getName());
200      RegionInfo regionInfo = RegionInfoBuilder.newBuilder(tableName).build();
201      WAL log = wals.getWAL(regionInfo);
202      MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl(1);
203
204      int total = 20;
205      for (int i = 0; i < total; i++) {
206        WALEdit kvs = new WALEdit();
207        kvs.add(new KeyValue(Bytes.toBytes(i), tableName.getName(), tableName.getName()));
208        NavigableMap<byte[], Integer> scopes = new TreeMap<>(Bytes.BYTES_COMPARATOR);
209        scopes.put(Bytes.toBytes("column"), 0);
210        log.appendData(regionInfo, new WALKeyImpl(regionInfo.getEncodedNameAsBytes(), tableName,
211          System.currentTimeMillis(), mvcc, scopes), kvs);
212      }
213      // Send the data to HDFS datanodes and close the HDFS writer
214      log.sync();
215      ((AbstractFSWAL<?>) log).replaceWriter(((FSHLog)log).getOldPath(), null, null);
216
217      // code taken from MasterFileSystem.getLogDirs(), which is called from
218      // MasterFileSystem.splitLog() handles RS shutdowns (as observed by the splitting process)
219      // rename the directory so a rogue RS doesn't create more WALs
220      Path rsSplitDir = thisTestsDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
221      if (!fs.rename(thisTestsDir, rsSplitDir)) {
222        throw new IOException("Failed fs.rename for log split: " + thisTestsDir);
223      }
224      LOG.debug("Renamed region directory: " + rsSplitDir);
225
226      LOG.debug("Processing the old log files.");
227      WALSplitter.split(HBASELOGDIR, rsSplitDir, OLDLOGDIR, fs, conf, wals);
228
229      LOG.debug("Trying to roll the WAL.");
230      try {
231        log.rollWriter();
232        Assert.fail("rollWriter() did not throw any exception.");
233      } catch (IOException ioe) {
234        if (ioe.getCause() instanceof FileNotFoundException) {
235          LOG.info("Got the expected exception: ", ioe.getCause());
236        } else {
237          Assert.fail("Unexpected exception: " + ioe);
238        }
239      }
240    } finally {
241      wals.close();
242      if (fs.exists(thisTestsDir)) {
243        fs.delete(thisTestsDir, true);
244      }
245    }
246  }
247}