001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.regionserver.wal; 019 020import static org.junit.Assert.assertThrows; 021import static org.junit.Assert.assertTrue; 022 023import java.io.FileNotFoundException; 024import java.io.IOException; 025import java.util.Arrays; 026import java.util.List; 027import java.util.NavigableMap; 028import java.util.TreeMap; 029import org.apache.hadoop.conf.Configuration; 030import org.apache.hadoop.fs.FileSystem; 031import org.apache.hadoop.fs.Path; 032import org.apache.hadoop.hbase.HBaseClassTestRule; 033import org.apache.hadoop.hbase.HBaseTestingUtil; 034import org.apache.hadoop.hbase.HConstants; 035import org.apache.hadoop.hbase.KeyValue; 036import org.apache.hadoop.hbase.ServerName; 037import org.apache.hadoop.hbase.SingleProcessHBaseCluster; 038import org.apache.hadoop.hbase.TableName; 039import org.apache.hadoop.hbase.client.Admin; 040import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder; 041import org.apache.hadoop.hbase.client.Put; 042import org.apache.hadoop.hbase.client.RegionInfo; 043import org.apache.hadoop.hbase.client.RegionInfoBuilder; 044import org.apache.hadoop.hbase.client.Table; 045import org.apache.hadoop.hbase.client.TableDescriptor; 046import org.apache.hadoop.hbase.client.TableDescriptorBuilder; 047import org.apache.hadoop.hbase.log.HBaseMarkers; 048import org.apache.hadoop.hbase.regionserver.HRegionServer; 049import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl; 050import org.apache.hadoop.hbase.testclassification.MediumTests; 051import org.apache.hadoop.hbase.testclassification.RegionServerTests; 052import org.apache.hadoop.hbase.util.Bytes; 053import org.apache.hadoop.hbase.util.CommonFSUtils; 054import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; 055import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; 056import org.apache.hadoop.hbase.wal.WAL; 057import org.apache.hadoop.hbase.wal.WALEdit; 058import org.apache.hadoop.hbase.wal.WALFactory; 059import org.apache.hadoop.hbase.wal.WALKeyImpl; 060import org.apache.hadoop.hbase.wal.WALProvider; 061import org.apache.hadoop.hbase.wal.WALSplitter; 062import org.apache.hadoop.hdfs.MiniDFSCluster; 063import org.apache.hadoop.ipc.RemoteException; 064import org.junit.After; 065import org.junit.Before; 066import org.junit.BeforeClass; 067import org.junit.ClassRule; 068import org.junit.Test; 069import org.junit.experimental.categories.Category; 070import org.junit.runner.RunWith; 071import org.junit.runners.Parameterized; 072import org.junit.runners.Parameterized.Parameter; 073import org.junit.runners.Parameterized.Parameters; 074import org.slf4j.Logger; 075import org.slf4j.LoggerFactory; 076 077import org.apache.hbase.thirdparty.com.google.common.base.Throwables; 078 079/** 080 * Tests for conditions that should trigger RegionServer aborts when rolling the current WAL fails. 081 */ 082@RunWith(Parameterized.class) 083@Category({ RegionServerTests.class, MediumTests.class }) 084public class TestLogRollAbort { 085 086 @ClassRule 087 public static final HBaseClassTestRule CLASS_RULE = 088 HBaseClassTestRule.forClass(TestLogRollAbort.class); 089 090 private static final Logger LOG = LoggerFactory.getLogger(AbstractTestLogRolling.class); 091 private static MiniDFSCluster dfsCluster; 092 private static Admin admin; 093 private static SingleProcessHBaseCluster cluster; 094 protected final static HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil(); 095 096 /* For the split-then-roll test */ 097 private static final Path HBASEDIR = new Path("/hbase"); 098 private static final Path HBASELOGDIR = new Path("/hbaselog"); 099 private static final Path OLDLOGDIR = new Path(HBASELOGDIR, HConstants.HREGION_OLDLOGDIR_NAME); 100 101 // Need to override this setup so we can edit the config before it gets sent 102 // to the HDFS & HBase cluster startup. 103 @BeforeClass 104 public static void setUpBeforeClass() throws Exception { 105 // Tweak default timeout values down for faster recovery 106 TEST_UTIL.getConfiguration().setInt("hbase.regionserver.logroll.errors.tolerated", 2); 107 TEST_UTIL.getConfiguration().setInt("hbase.rpc.timeout", 10 * 1000); 108 109 // Increase the amount of time between client retries 110 TEST_UTIL.getConfiguration().setLong("hbase.client.pause", 5 * 1000); 111 112 // lower the namenode & datanode heartbeat so the namenode 113 // quickly detects datanode failures 114 TEST_UTIL.getConfiguration().setInt("dfs.namenode.heartbeat.recheck-interval", 5000); 115 TEST_UTIL.getConfiguration().setInt("dfs.heartbeat.interval", 1); 116 // the namenode might still try to choose the recently-dead datanode 117 // for a pipeline, so try to a new pipeline multiple times 118 TEST_UTIL.getConfiguration().setInt("dfs.client.block.write.retries", 10); 119 TEST_UTIL.getConfiguration().set(WALFactory.WAL_PROVIDER, "asyncfs"); 120 } 121 122 @Parameters(name = "{index}: walProvider={0}") 123 public static List<Object[]> params() { 124 return Arrays.asList(new Object[] { "filesystem" }, new Object[] { "asyncfs" }); 125 } 126 127 private Configuration conf; 128 private FileSystem fs; 129 130 @Parameter 131 public String walProvider; 132 133 @Before 134 public void setUp() throws Exception { 135 TEST_UTIL.getConfiguration().set(WALFactory.WAL_PROVIDER, walProvider); 136 TEST_UTIL.startMiniCluster(2); 137 138 cluster = TEST_UTIL.getHBaseCluster(); 139 dfsCluster = TEST_UTIL.getDFSCluster(); 140 admin = TEST_UTIL.getAdmin(); 141 conf = TEST_UTIL.getConfiguration(); 142 fs = TEST_UTIL.getDFSCluster().getFileSystem(); 143 144 // disable region rebalancing (interferes with log watching) 145 cluster.getMaster().balanceSwitch(false); 146 CommonFSUtils.setRootDir(conf, HBASEDIR); 147 CommonFSUtils.setWALRootDir(conf, HBASELOGDIR); 148 } 149 150 @After 151 public void tearDown() throws Exception { 152 TEST_UTIL.shutdownMiniCluster(); 153 } 154 155 /** 156 * Tests that RegionServer aborts if we hit an error closing the WAL when there are unsynced WAL 157 * edits. See HBASE-4282. 158 */ 159 @Test 160 public void testRSAbortWithUnflushedEdits() throws Exception { 161 LOG.info("Starting testRSAbortWithUnflushedEdits()"); 162 163 // When the hbase:meta table can be opened, the region servers are running 164 TEST_UTIL.getConnection().getTable(TableName.META_TABLE_NAME).close(); 165 166 // Create the test table and open it 167 TableName tableName = TableName.valueOf(this.getClass().getSimpleName()); 168 TableDescriptor desc = TableDescriptorBuilder.newBuilder(tableName) 169 .setColumnFamily(ColumnFamilyDescriptorBuilder.of(HConstants.CATALOG_FAMILY)).build(); 170 171 admin.createTable(desc); 172 Table table = TEST_UTIL.getConnection().getTable(tableName); 173 try { 174 HRegionServer server = TEST_UTIL.getRSForFirstRegionInTable(tableName); 175 WAL log = server.getWAL(null); 176 177 Put p = new Put(Bytes.toBytes("row2001")); 178 p.addColumn(HConstants.CATALOG_FAMILY, Bytes.toBytes("col"), Bytes.toBytes(2001)); 179 table.put(p); 180 181 log.sync(); 182 183 p = new Put(Bytes.toBytes("row2002")); 184 p.addColumn(HConstants.CATALOG_FAMILY, Bytes.toBytes("col"), Bytes.toBytes(2002)); 185 table.put(p); 186 187 dfsCluster.restartDataNodes(); 188 LOG.info("Restarted datanodes"); 189 190 try { 191 log.rollWriter(true); 192 } catch (FailedLogCloseException flce) { 193 // Expected exception. We used to expect that there would be unsynced appends but this 194 // not reliable now that sync plays a roll in wall rolling. The above puts also now call 195 // sync. 196 } catch (Throwable t) { 197 LOG.error(HBaseMarkers.FATAL, "FAILED TEST: Got wrong exception", t); 198 } 199 } finally { 200 table.close(); 201 } 202 } 203 204 /** 205 * Tests the case where a RegionServer enters a GC pause, comes back online after the master 206 * declared it dead and started to split. Want log rolling after a master split to fail. See 207 * HBASE-2312. 208 */ 209 @Test 210 public void testLogRollAfterSplitStart() throws IOException { 211 LOG.info("Verify wal roll after split starts will fail."); 212 String logName = 213 ServerName.valueOf("testLogRollAfterSplitStart", 16010, EnvironmentEdgeManager.currentTime()) 214 .toString(); 215 Path thisTestsDir = new Path(HBASELOGDIR, AbstractFSWALProvider.getWALDirectoryName(logName)); 216 final WALFactory wals = new WALFactory(conf, logName); 217 218 try { 219 // put some entries in an WAL 220 TableName tableName = TableName.valueOf(this.getClass().getName()); 221 RegionInfo regionInfo = RegionInfoBuilder.newBuilder(tableName).build(); 222 WAL log = wals.getWAL(regionInfo); 223 MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl(1); 224 225 int total = 20; 226 for (int i = 0; i < total; i++) { 227 WALEdit kvs = new WALEdit(); 228 kvs.add(new KeyValue(Bytes.toBytes(i), tableName.getName(), tableName.getName())); 229 NavigableMap<byte[], Integer> scopes = new TreeMap<>(Bytes.BYTES_COMPARATOR); 230 scopes.put(Bytes.toBytes("column"), 0); 231 log.appendData(regionInfo, new WALKeyImpl(regionInfo.getEncodedNameAsBytes(), tableName, 232 EnvironmentEdgeManager.currentTime(), mvcc, scopes), kvs); 233 } 234 // Send the data to HDFS datanodes and close the HDFS writer 235 log.sync(); 236 closeWriter((AbstractFSWAL<?>) log); 237 238 // code taken from MasterFileSystem.getLogDirs(), which is called from 239 // MasterFileSystem.splitLog() handles RS shutdowns (as observed by the splitting process) 240 // rename the directory so a rogue RS doesn't create more WALs 241 Path rsSplitDir = thisTestsDir.suffix(AbstractFSWALProvider.SPLITTING_EXT); 242 if (!fs.rename(thisTestsDir, rsSplitDir)) { 243 throw new IOException("Failed fs.rename for log split: " + thisTestsDir); 244 } 245 LOG.debug("Renamed region directory: " + rsSplitDir); 246 247 LOG.debug("Processing the old log files."); 248 WALSplitter.split(HBASELOGDIR, rsSplitDir, OLDLOGDIR, fs, conf, wals); 249 250 LOG.debug("Trying to roll the WAL."); 251 IOException error = assertThrows(IOException.class, () -> log.rollWriter()); 252 if (error instanceof RemoteException) { 253 error = ((RemoteException) error).unwrapRemoteException(); 254 } 255 assertTrue("unexpected error: " + Throwables.getStackTraceAsString(error), 256 error instanceof FileNotFoundException 257 || error.getCause() instanceof FileNotFoundException); 258 } finally { 259 wals.close(); 260 if (fs.exists(thisTestsDir)) { 261 fs.delete(thisTestsDir, true); 262 } 263 } 264 } 265 266 private <W extends WALProvider.WriterBase> void closeWriter(AbstractFSWAL<W> wal) { 267 wal.waitForSafePoint(); 268 long oldFileLen = wal.writer.getLength(); 269 wal.closeWriter(wal.writer, wal.getOldPath()); 270 wal.logRollAndSetupWalProps(wal.getOldPath(), null, oldFileLen); 271 wal.writer = null; 272 wal.onWriterReplaced(null); 273 wal.rollRequested.set(false); 274 } 275}