001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.regionserver; 019 020import static org.junit.jupiter.api.Assertions.assertTrue; 021import static org.junit.jupiter.api.Assertions.fail; 022import static org.mockito.ArgumentMatchers.any; 023import static org.mockito.ArgumentMatchers.anyString; 024import static org.mockito.Mockito.atLeast; 025import static org.mockito.Mockito.mock; 026import static org.mockito.Mockito.verify; 027import static org.mockito.Mockito.when; 028 029import java.io.IOException; 030import java.util.List; 031import java.util.Map; 032import java.util.concurrent.atomic.AtomicLong; 033import org.apache.hadoop.conf.Configuration; 034import org.apache.hadoop.fs.FileSystem; 035import org.apache.hadoop.fs.Path; 036import org.apache.hadoop.hbase.DroppedSnapshotException; 037import org.apache.hadoop.hbase.HBaseTestingUtil; 038import org.apache.hadoop.hbase.HConstants; 039import org.apache.hadoop.hbase.Server; 040import org.apache.hadoop.hbase.TableName; 041import org.apache.hadoop.hbase.client.Durability; 042import org.apache.hadoop.hbase.client.Put; 043import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL; 044import org.apache.hadoop.hbase.regionserver.wal.FSHLog; 045import org.apache.hadoop.hbase.regionserver.wal.FailedLogCloseException; 046import org.apache.hadoop.hbase.testclassification.SmallTests; 047import org.apache.hadoop.hbase.util.Bytes; 048import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper; 049import org.apache.hadoop.hbase.util.Pair; 050import org.apache.hadoop.hbase.util.Threads; 051import org.apache.hadoop.hbase.wal.WAL; 052import org.apache.hadoop.hbase.wal.WALProvider.Writer; 053import org.junit.jupiter.api.AfterEach; 054import org.junit.jupiter.api.BeforeEach; 055import org.junit.jupiter.api.Tag; 056import org.junit.jupiter.api.Test; 057import org.junit.jupiter.api.TestInfo; 058import org.mockito.exceptions.verification.WantedButNotInvoked; 059import org.slf4j.Logger; 060import org.slf4j.LoggerFactory; 061 062/** 063 * Testing sync/append failures. Copied from TestHRegion. 064 */ 065@Tag(SmallTests.TAG) 066public class TestFailedAppendAndSync { 067 068 private static final Logger LOG = LoggerFactory.getLogger(TestFailedAppendAndSync.class); 069 private String name; 070 071 private static final String COLUMN_FAMILY = "MyCF"; 072 private static final byte[] COLUMN_FAMILY_BYTES = Bytes.toBytes(COLUMN_FAMILY); 073 074 HRegion region = null; 075 // Do not run unit tests in parallel (? Why not? It don't work? Why not? St.Ack) 076 private static HBaseTestingUtil TEST_UTIL; 077 public static Configuration CONF; 078 private String dir; 079 080 // Test names 081 protected TableName tableName; 082 083 @BeforeEach 084 public void setup(TestInfo testInfo) throws IOException { 085 this.name = testInfo.getTestMethod().get().getName(); 086 TEST_UTIL = new HBaseTestingUtil(); 087 CONF = TEST_UTIL.getConfiguration(); 088 // Disable block cache. 089 CONF.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0f); 090 CONF.setLong(AbstractFSWAL.WAL_SYNC_TIMEOUT_MS, 10000); 091 dir = TEST_UTIL.getDataTestDir("TestHRegion").toString(); 092 tableName = TableName.valueOf(name); 093 } 094 095 @AfterEach 096 public void tearDown() throws Exception { 097 EnvironmentEdgeManagerTestHelper.reset(); 098 LOG.info("Cleaning test directory: " + TEST_UTIL.getDataTestDir()); 099 TEST_UTIL.cleanupTestDir(); 100 } 101 102 String getName() { 103 return name; 104 } 105 106 // Dodgy WAL. Will throw exceptions when flags set. 107 class DodgyFSLog extends FSHLog { 108 volatile boolean throwSyncException = false; 109 volatile boolean throwAppendException = false; 110 volatile boolean throwArchiveException = false; 111 112 final AtomicLong rolls = new AtomicLong(0); 113 114 public DodgyFSLog(FileSystem fs, Server server, Path root, String logDir, Configuration conf) 115 throws IOException { 116 super(fs, server, root, logDir, conf); 117 } 118 119 @Override 120 public Map<byte[], List<byte[]>> rollWriter(boolean force) 121 throws FailedLogCloseException, IOException { 122 Map<byte[], List<byte[]>> regions = super.rollWriter(force); 123 rolls.getAndIncrement(); 124 return regions; 125 } 126 127 @Override 128 protected void archiveLogFile(Path p) throws IOException { 129 if (throwArchiveException) { 130 throw new IOException("throw archival exception"); 131 } 132 } 133 134 @Override 135 protected void archive(Pair<Path, Long> localLogsToArchive) { 136 super.archive(localLogsToArchive); 137 } 138 139 @Override 140 protected Writer createWriterInstance(FileSystem fs, Path path) throws IOException { 141 final Writer w = super.createWriterInstance(fs, path); 142 return new Writer() { 143 @Override 144 public void close() throws IOException { 145 w.close(); 146 } 147 148 @Override 149 public void sync(boolean forceSync) throws IOException { 150 if (throwSyncException) { 151 throw new IOException("FAKE! Failed to replace a bad datanode..."); 152 } 153 w.sync(forceSync); 154 } 155 156 @Override 157 public void append(Entry entry) throws IOException { 158 if (throwAppendException) { 159 throw new IOException("FAKE! Failed to replace a bad datanode..."); 160 } 161 w.append(entry); 162 } 163 164 @Override 165 public long getLength() { 166 return w.getLength(); 167 } 168 169 @Override 170 public long getSyncedLength() { 171 return w.getSyncedLength(); 172 } 173 }; 174 } 175 } 176 177 /** 178 * Reproduce locking up that happens when we get an exceptions appending and syncing. See 179 * HBASE-14317. First I need to set up some mocks for Server and RegionServerServices. I also need 180 * to set up a dodgy WAL that will throw an exception when we go to append to it. 181 */ 182 @Test 183 public void testLockupAroundBadAssignSync() throws IOException { 184 // Make up mocked server and services. 185 RegionServerServices services = mock(RegionServerServices.class); 186 when(services.getConfiguration()).thenReturn(CONF); 187 when(services.isStopped()).thenReturn(false); 188 when(services.isAborted()).thenReturn(false); 189 // OK. Now I have my mocked up Server and RegionServerServices and my dodgy WAL, go ahead with 190 // the test. 191 FileSystem fs = FileSystem.get(CONF); 192 Path rootDir = new Path(dir + getName()); 193 fs.mkdirs(new Path(rootDir, getName())); 194 DodgyFSLog dodgyWAL = new DodgyFSLog(fs, (Server) services, rootDir, getName(), CONF); 195 dodgyWAL.init(); 196 LogRoller logRoller = new LogRoller(services); 197 logRoller.addWAL(dodgyWAL); 198 logRoller.start(); 199 200 boolean threwOnSync = false; 201 boolean threwOnAppend = false; 202 boolean threwOnBoth = false; 203 204 HRegion region = initHRegion(tableName, null, null, CONF, dodgyWAL); 205 try { 206 // Get some random bytes. 207 byte[] value = Bytes.toBytes(getName()); 208 try { 209 // First get something into memstore 210 Put put = new Put(value); 211 put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), value); 212 region.put(put); 213 } catch (IOException ioe) { 214 fail(); 215 } 216 long rollsCount = dodgyWAL.rolls.get(); 217 try { 218 dodgyWAL.throwAppendException = true; 219 dodgyWAL.throwSyncException = false; 220 Put put = new Put(value); 221 put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("3"), value); 222 region.put(put); 223 } catch (IOException ioe) { 224 threwOnAppend = true; 225 } 226 while (rollsCount == dodgyWAL.rolls.get()) { 227 Threads.sleep(100); 228 } 229 rollsCount = dodgyWAL.rolls.get(); 230 231 // When we get to here.. we should be ok. A new WAL has been put in place. There were no 232 // appends to sync. We should be able to continue. 233 234 try { 235 dodgyWAL.throwAppendException = true; 236 dodgyWAL.throwSyncException = true; 237 Put put = new Put(value); 238 put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("4"), value); 239 region.put(put); 240 } catch (IOException ioe) { 241 threwOnBoth = true; 242 } 243 while (rollsCount == dodgyWAL.rolls.get()) { 244 Threads.sleep(100); 245 } 246 247 // Again, all should be good. New WAL and no outstanding unsync'd edits so we should be able 248 // to just continue. 249 250 // So, should be no abort at this stage. Verify. 251 verify(services, atLeast(0)).abort(anyString(), any(Throwable.class)); 252 try { 253 dodgyWAL.throwAppendException = false; 254 dodgyWAL.throwSyncException = true; 255 Put put = new Put(value); 256 put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("2"), value); 257 region.rsServices = services; 258 region.put(put); 259 } catch (IOException ioe) { 260 threwOnSync = true; 261 } 262 263 region.rsServices = null; 264 // An append in the WAL but the sync failed is a server abort condition. That is our 265 // current semantic. Verify. 266 verify(services, atLeast(1)).abort(anyString(), any()); 267 try { 268 dodgyWAL.throwAppendException = false; 269 dodgyWAL.throwSyncException = false; 270 dodgyWAL.throwArchiveException = true; 271 Pair<Path, Long> pair = new Pair<Path, Long>(); 272 pair.setFirst(new Path("/a/b/")); 273 pair.setSecond(100L); 274 dodgyWAL.archive(pair); 275 } catch (Throwable ioe) { 276 } 277 while (true) { 278 try { 279 // one more abort needs to be called 280 verify(services, atLeast(2)).abort(anyString(), any()); 281 break; 282 } catch (WantedButNotInvoked t) { 283 Threads.sleep(1); 284 } 285 } 286 } finally { 287 // To stop logRoller, its server has to say it is stopped. 288 when(services.isStopped()).thenReturn(true); 289 if (logRoller != null) logRoller.close(); 290 if (region != null) { 291 try { 292 region.close(true); 293 } catch (DroppedSnapshotException e) { 294 LOG.info("On way out; expected!", e); 295 } 296 } 297 if (dodgyWAL != null) dodgyWAL.close(); 298 assertTrue(threwOnBoth, "The regionserver should have thrown an exception"); 299 assertTrue(threwOnAppend, "The regionserver should have thrown an exception"); 300 assertTrue(threwOnSync, "The regionserver should have thrown an exception"); 301 } 302 } 303 304 /** 305 * @return A region on which you must call {@link HBaseTestingUtil#closeRegionAndWAL(HRegion)} 306 * when done. 307 */ 308 public static HRegion initHRegion(TableName tableName, byte[] startKey, byte[] stopKey, 309 Configuration conf, WAL wal) throws IOException { 310 ChunkCreator.initialize(MemStoreLAB.CHUNK_SIZE_DEFAULT, false, 0, 0, 0, null, 311 MemStoreLAB.INDEX_CHUNK_SIZE_PERCENTAGE_DEFAULT); 312 return TEST_UTIL.createLocalHRegion(tableName, startKey, stopKey, conf, false, 313 Durability.SYNC_WAL, wal, COLUMN_FAMILY_BYTES); 314 } 315}