001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.regionserver.wal; 019 020import static org.junit.jupiter.api.Assertions.assertThrows; 021import static org.junit.jupiter.api.Assertions.assertTrue; 022 023import java.io.FileNotFoundException; 024import java.io.IOException; 025import java.util.NavigableMap; 026import java.util.TreeMap; 027import java.util.stream.Stream; 028import org.apache.hadoop.conf.Configuration; 029import org.apache.hadoop.fs.FileSystem; 030import org.apache.hadoop.fs.Path; 031import org.apache.hadoop.hbase.HBaseParameterizedTestTemplate; 032import org.apache.hadoop.hbase.HBaseTestingUtil; 033import org.apache.hadoop.hbase.HConstants; 034import org.apache.hadoop.hbase.KeyValue; 035import org.apache.hadoop.hbase.ServerName; 036import org.apache.hadoop.hbase.SingleProcessHBaseCluster; 037import org.apache.hadoop.hbase.TableName; 038import org.apache.hadoop.hbase.client.Admin; 039import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder; 040import org.apache.hadoop.hbase.client.Put; 041import org.apache.hadoop.hbase.client.RegionInfo; 042import org.apache.hadoop.hbase.client.RegionInfoBuilder; 043import org.apache.hadoop.hbase.client.Table; 044import org.apache.hadoop.hbase.client.TableDescriptor; 045import org.apache.hadoop.hbase.client.TableDescriptorBuilder; 046import org.apache.hadoop.hbase.log.HBaseMarkers; 047import org.apache.hadoop.hbase.regionserver.HRegionServer; 048import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl; 049import org.apache.hadoop.hbase.testclassification.MediumTests; 050import org.apache.hadoop.hbase.testclassification.RegionServerTests; 051import org.apache.hadoop.hbase.util.Bytes; 052import org.apache.hadoop.hbase.util.CommonFSUtils; 053import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; 054import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; 055import org.apache.hadoop.hbase.wal.WAL; 056import org.apache.hadoop.hbase.wal.WALEdit; 057import org.apache.hadoop.hbase.wal.WALEditInternalHelper; 058import org.apache.hadoop.hbase.wal.WALFactory; 059import org.apache.hadoop.hbase.wal.WALKeyImpl; 060import org.apache.hadoop.hbase.wal.WALProvider; 061import org.apache.hadoop.hbase.wal.WALSplitter; 062import org.apache.hadoop.hdfs.MiniDFSCluster; 063import org.apache.hadoop.ipc.RemoteException; 064import org.junit.jupiter.api.AfterEach; 065import org.junit.jupiter.api.BeforeAll; 066import org.junit.jupiter.api.BeforeEach; 067import org.junit.jupiter.api.Tag; 068import org.junit.jupiter.api.TestTemplate; 069import org.junit.jupiter.params.provider.Arguments; 070import org.slf4j.Logger; 071import org.slf4j.LoggerFactory; 072 073import org.apache.hbase.thirdparty.com.google.common.base.Throwables; 074 075/** 076 * Tests for conditions that should trigger RegionServer aborts when rolling the current WAL fails. 077 */ 078@Tag(RegionServerTests.TAG) 079@Tag(MediumTests.TAG) 080@HBaseParameterizedTestTemplate(name = "{index}: walProvider={0}") 081public class TestLogRollAbort { 082 083 private static final Logger LOG = LoggerFactory.getLogger(TestLogRollAbort.class); 084 private static MiniDFSCluster dfsCluster; 085 private static Admin admin; 086 private static SingleProcessHBaseCluster cluster; 087 protected final static HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil(); 088 089 /* For the split-then-roll test */ 090 private static final Path HBASEDIR = new Path("/hbase"); 091 private static final Path HBASELOGDIR = new Path("/hbaselog"); 092 private static final Path OLDLOGDIR = new Path(HBASELOGDIR, HConstants.HREGION_OLDLOGDIR_NAME); 093 094 // Need to override this setup so we can edit the config before it gets sent 095 // to the HDFS & HBase cluster startup. 096 @BeforeAll 097 public static void setUpBeforeClass() throws Exception { 098 // Tweak default timeout values down for faster recovery 099 TEST_UTIL.getConfiguration().setInt("hbase.regionserver.logroll.errors.tolerated", 2); 100 TEST_UTIL.getConfiguration().setInt("hbase.rpc.timeout", 10 * 1000); 101 102 // Increase the amount of time between client retries 103 TEST_UTIL.getConfiguration().setLong("hbase.client.pause", 5 * 1000); 104 105 // lower the namenode & datanode heartbeat so the namenode 106 // quickly detects datanode failures 107 TEST_UTIL.getConfiguration().setInt("dfs.namenode.heartbeat.recheck-interval", 5000); 108 TEST_UTIL.getConfiguration().setInt("dfs.heartbeat.interval", 1); 109 // the namenode might still try to choose the recently-dead datanode 110 // for a pipeline, so try to a new pipeline multiple times 111 TEST_UTIL.getConfiguration().setInt("dfs.client.block.write.retries", 10); 112 TEST_UTIL.getConfiguration().set(WALFactory.WAL_PROVIDER, "asyncfs"); 113 } 114 115 public static Stream<Arguments> parameters() { 116 return Stream.of(Arguments.of("filesystem"), Arguments.of("asyncfs")); 117 } 118 119 private Configuration conf; 120 private FileSystem fs; 121 122 private final String walProvider; 123 124 public TestLogRollAbort(String walProvider) { 125 this.walProvider = walProvider; 126 } 127 128 @BeforeEach 129 public void setUp() throws Exception { 130 TEST_UTIL.getConfiguration().set(WALFactory.WAL_PROVIDER, walProvider); 131 TEST_UTIL.startMiniCluster(2); 132 133 cluster = TEST_UTIL.getHBaseCluster(); 134 dfsCluster = TEST_UTIL.getDFSCluster(); 135 admin = TEST_UTIL.getAdmin(); 136 conf = TEST_UTIL.getConfiguration(); 137 fs = TEST_UTIL.getDFSCluster().getFileSystem(); 138 139 // disable region rebalancing (interferes with log watching) 140 cluster.getMaster().balanceSwitch(false); 141 CommonFSUtils.setRootDir(conf, HBASEDIR); 142 CommonFSUtils.setWALRootDir(conf, HBASELOGDIR); 143 } 144 145 @AfterEach 146 public void tearDown() throws Exception { 147 TEST_UTIL.shutdownMiniCluster(); 148 } 149 150 /** 151 * Tests that RegionServer aborts if we hit an error closing the WAL when there are unsynced WAL 152 * edits. See HBASE-4282. 153 */ 154 @TestTemplate 155 public void testRSAbortWithUnflushedEdits() throws Exception { 156 LOG.info("Starting testRSAbortWithUnflushedEdits()"); 157 158 // When the hbase:meta table can be opened, the region servers are running 159 TEST_UTIL.getConnection().getTable(TableName.META_TABLE_NAME).close(); 160 161 // Create the test table and open it 162 TableName tableName = TableName.valueOf(this.getClass().getSimpleName()); 163 TableDescriptor desc = TableDescriptorBuilder.newBuilder(tableName) 164 .setColumnFamily(ColumnFamilyDescriptorBuilder.of(HConstants.CATALOG_FAMILY)).build(); 165 166 admin.createTable(desc); 167 Table table = TEST_UTIL.getConnection().getTable(tableName); 168 try { 169 HRegionServer server = TEST_UTIL.getRSForFirstRegionInTable(tableName); 170 WAL log = server.getWAL(null); 171 172 Put p = new Put(Bytes.toBytes("row2001")); 173 p.addColumn(HConstants.CATALOG_FAMILY, Bytes.toBytes("col"), Bytes.toBytes(2001)); 174 table.put(p); 175 176 log.sync(); 177 178 p = new Put(Bytes.toBytes("row2002")); 179 p.addColumn(HConstants.CATALOG_FAMILY, Bytes.toBytes("col"), Bytes.toBytes(2002)); 180 table.put(p); 181 182 dfsCluster.restartDataNodes(); 183 LOG.info("Restarted datanodes"); 184 185 try { 186 log.rollWriter(true); 187 } catch (FailedLogCloseException flce) { 188 // Expected exception. We used to expect that there would be unsynced appends but this 189 // not reliable now that sync plays a roll in wall rolling. The above puts also now call 190 // sync. 191 } catch (Throwable t) { 192 LOG.error(HBaseMarkers.FATAL, "FAILED TEST: Got wrong exception", t); 193 } 194 } finally { 195 table.close(); 196 } 197 } 198 199 /** 200 * Tests the case where a RegionServer enters a GC pause, comes back online after the master 201 * declared it dead and started to split. Want log rolling after a master split to fail. See 202 * HBASE-2312. 203 */ 204 @TestTemplate 205 public void testLogRollAfterSplitStart() throws IOException { 206 LOG.info("Verify wal roll after split starts will fail."); 207 String logName = 208 ServerName.valueOf("testLogRollAfterSplitStart", 16010, EnvironmentEdgeManager.currentTime()) 209 .toString(); 210 Path thisTestsDir = new Path(HBASELOGDIR, AbstractFSWALProvider.getWALDirectoryName(logName)); 211 final WALFactory wals = new WALFactory(conf, logName); 212 213 try { 214 // put some entries in an WAL 215 TableName tableName = TableName.valueOf(this.getClass().getName()); 216 RegionInfo regionInfo = RegionInfoBuilder.newBuilder(tableName).build(); 217 WAL log = wals.getWAL(regionInfo); 218 MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl(1); 219 220 int total = 20; 221 for (int i = 0; i < total; i++) { 222 WALEdit kvs = new WALEdit(); 223 WALEditInternalHelper.addExtendedCell(kvs, 224 new KeyValue(Bytes.toBytes(i), tableName.getName(), tableName.getName())); 225 NavigableMap<byte[], Integer> scopes = new TreeMap<>(Bytes.BYTES_COMPARATOR); 226 scopes.put(Bytes.toBytes("column"), 0); 227 log.appendData(regionInfo, new WALKeyImpl(regionInfo.getEncodedNameAsBytes(), tableName, 228 EnvironmentEdgeManager.currentTime(), mvcc, scopes), kvs); 229 } 230 // Send the data to HDFS datanodes and close the HDFS writer 231 log.sync(); 232 closeWriter((AbstractFSWAL<?>) log); 233 234 // code taken from MasterFileSystem.getLogDirs(), which is called from 235 // MasterFileSystem.splitLog() handles RS shutdowns (as observed by the splitting process) 236 // rename the directory so a rogue RS doesn't create more WALs 237 Path rsSplitDir = thisTestsDir.suffix(AbstractFSWALProvider.SPLITTING_EXT); 238 if (!fs.rename(thisTestsDir, rsSplitDir)) { 239 throw new IOException("Failed fs.rename for log split: " + thisTestsDir); 240 } 241 LOG.debug("Renamed region directory: " + rsSplitDir); 242 243 LOG.debug("Processing the old log files."); 244 WALSplitter.split(HBASELOGDIR, rsSplitDir, OLDLOGDIR, fs, conf, wals); 245 246 LOG.debug("Trying to roll the WAL."); 247 IOException error = assertThrows(IOException.class, () -> log.rollWriter()); 248 if (error instanceof RemoteException) { 249 error = ((RemoteException) error).unwrapRemoteException(); 250 } 251 assertTrue( 252 error instanceof FileNotFoundException || error.getCause() instanceof FileNotFoundException, 253 "unexpected error: " + Throwables.getStackTraceAsString(error)); 254 } finally { 255 wals.close(); 256 if (fs.exists(thisTestsDir)) { 257 fs.delete(thisTestsDir, true); 258 } 259 } 260 } 261 262 private <W extends WALProvider.WriterBase> void closeWriter(AbstractFSWAL<W> wal) { 263 wal.waitForSafePoint(); 264 long oldFileLen = wal.writer.getLength(); 265 wal.closeWriter(wal.writer, wal.getOldPath()); 266 wal.logRollAndSetupWalProps(wal.getOldPath(), null, oldFileLen); 267 wal.writer = null; 268 wal.onWriterReplaced(null); 269 wal.rollRequested.set(false); 270 } 271}