001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.regionserver;
019
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.atLeast;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.DroppedSnapshotException;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtil;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
import org.apache.hadoop.hbase.regionserver.wal.FailedLogCloseException;
import org.apache.hadoop.hbase.testclassification.SmallTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.wal.WAL;
import org.apache.hadoop.hbase.wal.WALProvider.Writer;
import org.junit.After;
import org.junit.Before;
import org.junit.ClassRule;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestName;
import org.mockito.exceptions.base.MockitoAssertionError;
import org.mockito.exceptions.verification.WantedButNotInvoked;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
063
064/**
065 * Testing sync/append failures. Copied from TestHRegion.
066 */
067@Category({ SmallTests.class })
068public class TestFailedAppendAndSync {
069
  // Class-level HBase test rule bound to this test class.
  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestFailedAppendAndSync.class);

  private static final Logger LOG = LoggerFactory.getLogger(TestFailedAppendAndSync.class);
  // Exposes the name of the running test method; used for the table name and per-test paths.
  @Rule
  public TestName name = new TestName();

  // Name of the single column family used by the test region, as a String and as bytes.
  private static final String COLUMN_FAMILY = "MyCF";
  private static final byte[] COLUMN_FAMILY_BYTES = Bytes.toBytes(COLUMN_FAMILY);

  // NOTE(review): this field is never referenced in this class; the test method works on a
  // local variable of its own. Confirm nothing outside this file reads it before removing.
  HRegion region = null;
  // Do not run unit tests in parallel. (Why not? Doesn't it work? Why not? St.Ack)
  // Both statics below are re-assigned by setup() before every test method.
  private static HBaseTestingUtil TEST_UTIL;
  public static Configuration CONF;
  // String form of the data test directory obtained in setup().
  private String dir;

  // Table name derived from the running test method's name; assigned in setup().
  protected TableName tableName;
089
090  @Before
091  public void setup() throws IOException {
092    TEST_UTIL = new HBaseTestingUtil();
093    CONF = TEST_UTIL.getConfiguration();
094    // Disable block cache.
095    CONF.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0f);
096    dir = TEST_UTIL.getDataTestDir("TestHRegion").toString();
097    tableName = TableName.valueOf(name.getMethodName());
098  }
099
100  @After
101  public void tearDown() throws Exception {
102    EnvironmentEdgeManagerTestHelper.reset();
103    LOG.info("Cleaning test directory: " + TEST_UTIL.getDataTestDir());
104    TEST_UTIL.cleanupTestDir();
105  }
106
107  String getName() {
108    return name.getMethodName();
109  }
110
111  // Dodgy WAL. Will throw exceptions when flags set.
112  class DodgyFSLog extends FSHLog {
113    volatile boolean throwSyncException = false;
114    volatile boolean throwAppendException = false;
115    volatile boolean throwArchiveException = false;
116
117    final AtomicLong rolls = new AtomicLong(0);
118
119    public DodgyFSLog(FileSystem fs, Server server, Path root, String logDir, Configuration conf)
120      throws IOException {
121      super(fs, server, root, logDir, conf);
122    }
123
124    @Override
125    public Map<byte[], List<byte[]>> rollWriter(boolean force)
126      throws FailedLogCloseException, IOException {
127      Map<byte[], List<byte[]>> regions = super.rollWriter(force);
128      rolls.getAndIncrement();
129      return regions;
130    }
131
132    @Override
133    protected void archiveLogFile(Path p) throws IOException {
134      if (throwArchiveException) {
135        throw new IOException("throw archival exception");
136      }
137    }
138
139    @Override
140    protected void archive(Pair<Path, Long> localLogsToArchive) {
141      super.archive(localLogsToArchive);
142    }
143
144    @Override
145    protected Writer createWriterInstance(Path path) throws IOException {
146      final Writer w = super.createWriterInstance(path);
147      return new Writer() {
148        @Override
149        public void close() throws IOException {
150          w.close();
151        }
152
153        @Override
154        public void sync(boolean forceSync) throws IOException {
155          if (throwSyncException) {
156            throw new IOException("FAKE! Failed to replace a bad datanode...");
157          }
158          w.sync(forceSync);
159        }
160
161        @Override
162        public void append(Entry entry) throws IOException {
163          if (throwAppendException) {
164            throw new IOException("FAKE! Failed to replace a bad datanode...");
165          }
166          w.append(entry);
167        }
168
169        @Override
170        public long getLength() {
171          return w.getLength();
172        }
173
174        @Override
175        public long getSyncedLength() {
176          return w.getSyncedLength();
177        }
178      };
179    }
180  }
181
182  /**
183   * Reproduce locking up that happens when we get an exceptions appending and syncing. See
184   * HBASE-14317. First I need to set up some mocks for Server and RegionServerServices. I also need
185   * to set up a dodgy WAL that will throw an exception when we go to append to it.
186   */
187  @Test
188  public void testLockupAroundBadAssignSync() throws IOException {
189    // Make up mocked server and services.
190    RegionServerServices services = mock(RegionServerServices.class);
191    when(services.getConfiguration()).thenReturn(CONF);
192    when(services.isStopped()).thenReturn(false);
193    when(services.isAborted()).thenReturn(false);
194    // OK. Now I have my mocked up Server and RegionServerServices and my dodgy WAL, go ahead with
195    // the test.
196    FileSystem fs = FileSystem.get(CONF);
197    Path rootDir = new Path(dir + getName());
198    DodgyFSLog dodgyWAL = new DodgyFSLog(fs, (Server) services, rootDir, getName(), CONF);
199    dodgyWAL.init();
200    LogRoller logRoller = new LogRoller(services);
201    logRoller.addWAL(dodgyWAL);
202    logRoller.start();
203
204    boolean threwOnSync = false;
205    boolean threwOnAppend = false;
206    boolean threwOnBoth = false;
207
208    HRegion region = initHRegion(tableName, null, null, CONF, dodgyWAL);
209    try {
210      // Get some random bytes.
211      byte[] value = Bytes.toBytes(getName());
212      try {
213        // First get something into memstore
214        Put put = new Put(value);
215        put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), value);
216        region.put(put);
217      } catch (IOException ioe) {
218        fail();
219      }
220      long rollsCount = dodgyWAL.rolls.get();
221      try {
222        dodgyWAL.throwAppendException = true;
223        dodgyWAL.throwSyncException = false;
224        Put put = new Put(value);
225        put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("3"), value);
226        region.put(put);
227      } catch (IOException ioe) {
228        threwOnAppend = true;
229      }
230      while (rollsCount == dodgyWAL.rolls.get()) {
231        Threads.sleep(100);
232      }
233      rollsCount = dodgyWAL.rolls.get();
234
235      // When we get to here.. we should be ok. A new WAL has been put in place. There were no
236      // appends to sync. We should be able to continue.
237
238      try {
239        dodgyWAL.throwAppendException = true;
240        dodgyWAL.throwSyncException = true;
241        Put put = new Put(value);
242        put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("4"), value);
243        region.put(put);
244      } catch (IOException ioe) {
245        threwOnBoth = true;
246      }
247      while (rollsCount == dodgyWAL.rolls.get()) {
248        Threads.sleep(100);
249      }
250
251      // Again, all should be good. New WAL and no outstanding unsync'd edits so we should be able
252      // to just continue.
253
254      // So, should be no abort at this stage. Verify.
255      verify(services, atLeast(0)).abort(anyString(), any(Throwable.class));
256      try {
257        dodgyWAL.throwAppendException = false;
258        dodgyWAL.throwSyncException = true;
259        Put put = new Put(value);
260        put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("2"), value);
261        region.put(put);
262      } catch (IOException ioe) {
263        threwOnSync = true;
264      }
265      // An append in the WAL but the sync failed is a server abort condition. That is our
266      // current semantic. Verify. It takes a while for abort to be called. Just hang here till it
267      // happens. If it don't we'll timeout the whole test. That is fine.
268      while (true) {
269        try {
270          verify(services, atLeast(1)).abort(anyString(), any(Throwable.class));
271          break;
272        } catch (WantedButNotInvoked t) {
273          Threads.sleep(1);
274        }
275      }
276
277      try {
278        dodgyWAL.throwAppendException = false;
279        dodgyWAL.throwSyncException = false;
280        dodgyWAL.throwArchiveException = true;
281        Pair<Path, Long> pair = new Pair<Path, Long>();
282        pair.setFirst(new Path("/a/b/"));
283        pair.setSecond(100L);
284        dodgyWAL.archive(pair);
285      } catch (Throwable ioe) {
286      }
287      while (true) {
288        try {
289          // one more abort needs to be called
290          verify(services, atLeast(2)).abort(anyString(), any());
291          break;
292        } catch (WantedButNotInvoked t) {
293          Threads.sleep(1);
294        }
295      }
296    } finally {
297      // To stop logRoller, its server has to say it is stopped.
298      when(services.isStopped()).thenReturn(true);
299      if (logRoller != null) logRoller.close();
300      if (region != null) {
301        try {
302          region.close(true);
303        } catch (DroppedSnapshotException e) {
304          LOG.info("On way out; expected!", e);
305        }
306      }
307      if (dodgyWAL != null) dodgyWAL.close();
308      assertTrue("The regionserver should have thrown an exception", threwOnBoth);
309      assertTrue("The regionserver should have thrown an exception", threwOnAppend);
310      assertTrue("The regionserver should have thrown an exception", threwOnSync);
311    }
312  }
313
314  /**
315   * @return A region on which you must call {@link HBaseTestingUtil#closeRegionAndWAL(HRegion)}
316   *         when done.
317   */
318  public static HRegion initHRegion(TableName tableName, byte[] startKey, byte[] stopKey,
319    Configuration conf, WAL wal) throws IOException {
320    ChunkCreator.initialize(MemStoreLAB.CHUNK_SIZE_DEFAULT, false, 0, 0, 0, null,
321      MemStoreLAB.INDEX_CHUNK_SIZE_PERCENTAGE_DEFAULT);
322    return TEST_UTIL.createLocalHRegion(tableName, startKey, stopKey, conf, false,
323      Durability.SYNC_WAL, wal, COLUMN_FAMILY_BYTES);
324  }
325}