001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.regionserver;
019
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.atLeast;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.timeout;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.DroppedSnapshotException;
import org.apache.hadoop.hbase.HBaseTestingUtil;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL;
import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
import org.apache.hadoop.hbase.regionserver.wal.FailedLogCloseException;
import org.apache.hadoop.hbase.testclassification.SmallTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.wal.WAL;
import org.apache.hadoop.hbase.wal.WALProvider.Writer;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestInfo;
import org.mockito.exceptions.verification.WantedButNotInvoked;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
061
062/**
063 * Testing sync/append failures. Copied from TestHRegion.
064 */
065@Tag(SmallTests.TAG)
066public class TestFailedAppendAndSync {
067
068  private static final Logger LOG = LoggerFactory.getLogger(TestFailedAppendAndSync.class);
069  private String name;
070
071  private static final String COLUMN_FAMILY = "MyCF";
072  private static final byte[] COLUMN_FAMILY_BYTES = Bytes.toBytes(COLUMN_FAMILY);
073
074  HRegion region = null;
075  // Do not run unit tests in parallel (? Why not? It don't work? Why not? St.Ack)
076  private static HBaseTestingUtil TEST_UTIL;
077  public static Configuration CONF;
078  private String dir;
079
080  // Test names
081  protected TableName tableName;
082
083  @BeforeEach
084  public void setup(TestInfo testInfo) throws IOException {
085    this.name = testInfo.getTestMethod().get().getName();
086    TEST_UTIL = new HBaseTestingUtil();
087    CONF = TEST_UTIL.getConfiguration();
088    // Disable block cache.
089    CONF.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0f);
090    CONF.setLong(AbstractFSWAL.WAL_SYNC_TIMEOUT_MS, 10000);
091    dir = TEST_UTIL.getDataTestDir("TestHRegion").toString();
092    tableName = TableName.valueOf(name);
093  }
094
095  @AfterEach
096  public void tearDown() throws Exception {
097    EnvironmentEdgeManagerTestHelper.reset();
098    LOG.info("Cleaning test directory: " + TEST_UTIL.getDataTestDir());
099    TEST_UTIL.cleanupTestDir();
100  }
101
102  String getName() {
103    return name;
104  }
105
106  // Dodgy WAL. Will throw exceptions when flags set.
107  class DodgyFSLog extends FSHLog {
108    volatile boolean throwSyncException = false;
109    volatile boolean throwAppendException = false;
110    volatile boolean throwArchiveException = false;
111
112    final AtomicLong rolls = new AtomicLong(0);
113
114    public DodgyFSLog(FileSystem fs, Server server, Path root, String logDir, Configuration conf)
115      throws IOException {
116      super(fs, server, root, logDir, conf);
117    }
118
119    @Override
120    public Map<byte[], List<byte[]>> rollWriter(boolean force)
121      throws FailedLogCloseException, IOException {
122      Map<byte[], List<byte[]>> regions = super.rollWriter(force);
123      rolls.getAndIncrement();
124      return regions;
125    }
126
127    @Override
128    protected void archiveLogFile(Path p) throws IOException {
129      if (throwArchiveException) {
130        throw new IOException("throw archival exception");
131      }
132    }
133
134    @Override
135    protected void archive(Pair<Path, Long> localLogsToArchive) {
136      super.archive(localLogsToArchive);
137    }
138
139    @Override
140    protected Writer createWriterInstance(FileSystem fs, Path path) throws IOException {
141      final Writer w = super.createWriterInstance(fs, path);
142      return new Writer() {
143        @Override
144        public void close() throws IOException {
145          w.close();
146        }
147
148        @Override
149        public void sync(boolean forceSync) throws IOException {
150          if (throwSyncException) {
151            throw new IOException("FAKE! Failed to replace a bad datanode...");
152          }
153          w.sync(forceSync);
154        }
155
156        @Override
157        public void append(Entry entry) throws IOException {
158          if (throwAppendException) {
159            throw new IOException("FAKE! Failed to replace a bad datanode...");
160          }
161          w.append(entry);
162        }
163
164        @Override
165        public long getLength() {
166          return w.getLength();
167        }
168
169        @Override
170        public long getSyncedLength() {
171          return w.getSyncedLength();
172        }
173      };
174    }
175  }
176
177  /**
178   * Reproduce locking up that happens when we get an exceptions appending and syncing. See
179   * HBASE-14317. First I need to set up some mocks for Server and RegionServerServices. I also need
180   * to set up a dodgy WAL that will throw an exception when we go to append to it.
181   */
182  @Test
183  public void testLockupAroundBadAssignSync() throws IOException {
184    // Make up mocked server and services.
185    RegionServerServices services = mock(RegionServerServices.class);
186    when(services.getConfiguration()).thenReturn(CONF);
187    when(services.isStopped()).thenReturn(false);
188    when(services.isAborted()).thenReturn(false);
189    // OK. Now I have my mocked up Server and RegionServerServices and my dodgy WAL, go ahead with
190    // the test.
191    FileSystem fs = FileSystem.get(CONF);
192    Path rootDir = new Path(dir + getName());
193    fs.mkdirs(new Path(rootDir, getName()));
194    DodgyFSLog dodgyWAL = new DodgyFSLog(fs, (Server) services, rootDir, getName(), CONF);
195    dodgyWAL.init();
196    LogRoller logRoller = new LogRoller(services);
197    logRoller.addWAL(dodgyWAL);
198    logRoller.start();
199
200    boolean threwOnSync = false;
201    boolean threwOnAppend = false;
202    boolean threwOnBoth = false;
203
204    HRegion region = initHRegion(tableName, null, null, CONF, dodgyWAL);
205    try {
206      // Get some random bytes.
207      byte[] value = Bytes.toBytes(getName());
208      try {
209        // First get something into memstore
210        Put put = new Put(value);
211        put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), value);
212        region.put(put);
213      } catch (IOException ioe) {
214        fail();
215      }
216      long rollsCount = dodgyWAL.rolls.get();
217      try {
218        dodgyWAL.throwAppendException = true;
219        dodgyWAL.throwSyncException = false;
220        Put put = new Put(value);
221        put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("3"), value);
222        region.put(put);
223      } catch (IOException ioe) {
224        threwOnAppend = true;
225      }
226      while (rollsCount == dodgyWAL.rolls.get()) {
227        Threads.sleep(100);
228      }
229      rollsCount = dodgyWAL.rolls.get();
230
231      // When we get to here.. we should be ok. A new WAL has been put in place. There were no
232      // appends to sync. We should be able to continue.
233
234      try {
235        dodgyWAL.throwAppendException = true;
236        dodgyWAL.throwSyncException = true;
237        Put put = new Put(value);
238        put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("4"), value);
239        region.put(put);
240      } catch (IOException ioe) {
241        threwOnBoth = true;
242      }
243      while (rollsCount == dodgyWAL.rolls.get()) {
244        Threads.sleep(100);
245      }
246
247      // Again, all should be good. New WAL and no outstanding unsync'd edits so we should be able
248      // to just continue.
249
250      // So, should be no abort at this stage. Verify.
251      verify(services, atLeast(0)).abort(anyString(), any(Throwable.class));
252      try {
253        dodgyWAL.throwAppendException = false;
254        dodgyWAL.throwSyncException = true;
255        Put put = new Put(value);
256        put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("2"), value);
257        region.rsServices = services;
258        region.put(put);
259      } catch (IOException ioe) {
260        threwOnSync = true;
261      }
262
263      region.rsServices = null;
264      // An append in the WAL but the sync failed is a server abort condition. That is our
265      // current semantic. Verify.
266      verify(services, atLeast(1)).abort(anyString(), any());
267      try {
268        dodgyWAL.throwAppendException = false;
269        dodgyWAL.throwSyncException = false;
270        dodgyWAL.throwArchiveException = true;
271        Pair<Path, Long> pair = new Pair<Path, Long>();
272        pair.setFirst(new Path("/a/b/"));
273        pair.setSecond(100L);
274        dodgyWAL.archive(pair);
275      } catch (Throwable ioe) {
276      }
277      while (true) {
278        try {
279          // one more abort needs to be called
280          verify(services, atLeast(2)).abort(anyString(), any());
281          break;
282        } catch (WantedButNotInvoked t) {
283          Threads.sleep(1);
284        }
285      }
286    } finally {
287      // To stop logRoller, its server has to say it is stopped.
288      when(services.isStopped()).thenReturn(true);
289      if (logRoller != null) logRoller.close();
290      if (region != null) {
291        try {
292          region.close(true);
293        } catch (DroppedSnapshotException e) {
294          LOG.info("On way out; expected!", e);
295        }
296      }
297      if (dodgyWAL != null) dodgyWAL.close();
298      assertTrue(threwOnBoth, "The regionserver should have thrown an exception");
299      assertTrue(threwOnAppend, "The regionserver should have thrown an exception");
300      assertTrue(threwOnSync, "The regionserver should have thrown an exception");
301    }
302  }
303
304  /**
305   * @return A region on which you must call {@link HBaseTestingUtil#closeRegionAndWAL(HRegion)}
306   *         when done.
307   */
308  public static HRegion initHRegion(TableName tableName, byte[] startKey, byte[] stopKey,
309    Configuration conf, WAL wal) throws IOException {
310    ChunkCreator.initialize(MemStoreLAB.CHUNK_SIZE_DEFAULT, false, 0, 0, 0, null,
311      MemStoreLAB.INDEX_CHUNK_SIZE_PERCENTAGE_DEFAULT);
312    return TEST_UTIL.createLocalHRegion(tableName, startKey, stopKey, conf, false,
313      Durability.SYNC_WAL, wal, COLUMN_FAMILY_BYTES);
314  }
315}