001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.wal;
019
020import java.io.IOException;
021import java.util.stream.Stream;
022import org.apache.hadoop.fs.Path;
023import org.apache.hadoop.hbase.HBaseParameterizedTestTemplate;
024import org.apache.hadoop.hbase.HBaseTestingUtil;
025import org.apache.hadoop.hbase.HConstants;
026import org.apache.hadoop.hbase.regionserver.HRegionServer;
027import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL;
028import org.apache.hadoop.hbase.testclassification.LargeTests;
029import org.apache.hadoop.hbase.testclassification.RegionServerTests;
030import org.apache.hadoop.hbase.util.CommonFSUtils;
031import org.junit.jupiter.api.AfterAll;
032import org.junit.jupiter.api.AfterEach;
033import org.junit.jupiter.api.BeforeAll;
034import org.junit.jupiter.api.BeforeEach;
035import org.junit.jupiter.api.Tag;
036import org.junit.jupiter.api.TestTemplate;
037import org.junit.jupiter.params.provider.Arguments;
038
039@Tag(RegionServerTests.TAG)
040@Tag(LargeTests.TAG)
041@HBaseParameterizedTestTemplate
042public class TestWALOpenAfterDNRollingStart {
043
044  private static final HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
045  // Sleep time before restart next dn, we need to wait the current dn to finish start up
046  private static long DN_RESTART_INTERVAL = 15000;
047
048  // interval of checking low replication. The sleep time must smaller than
049  // DataNodeRestartInterval
050  // so a low replication case will be detected and the wal will be rolled
051  private static long CHECK_LOW_REPLICATION_INTERVAL = 10000;
052
053  public String walProvider;
054
055  public TestWALOpenAfterDNRollingStart(String walProvider) {
056    this.walProvider = walProvider;
057  }
058
059  public static Stream<Arguments> parameters() {
060    return Stream.of(Arguments.of("asyncfs"), Arguments.of("filesystem"));
061  }
062
063  @BeforeAll
064  public static void setUpBeforeClass() throws Exception {
065    // don't let hdfs client to choose a new replica when dn down
066    TEST_UTIL.getConfiguration()
067      .setBoolean("dfs.client.block.write.replace-datanode-on-failure.enable", false);
068    TEST_UTIL.getConfiguration().setLong("hbase.regionserver.hlog.check.lowreplication.interval",
069      CHECK_LOW_REPLICATION_INTERVAL);
070    TEST_UTIL.startMiniDFSCluster(3);
071    TEST_UTIL.startMiniZKCluster();
072  }
073
074  @BeforeEach
075  public void setUp() throws IOException, InterruptedException {
076    TEST_UTIL.getConfiguration().set("hbase.wal.provider", walProvider);
077    TEST_UTIL.startMiniHBaseCluster();
078  }
079
080  @AfterEach
081  public void tearDown() throws Exception {
082    TEST_UTIL.shutdownMiniHBaseCluster();
083  }
084
085  @AfterAll
086  public static void tearDownAfterClass() throws Exception {
087    TEST_UTIL.shutdownMiniCluster();
088  }
089
090  /**
091   * see HBASE-18132 This is a test case of failing open a wal(for replication for example) after
092   * all datanode restarted (rolling upgrade, for example). Before this patch, low replication
093   * detection is only used when syncing wal. But if the wal haven't had any entry whiten, it will
094   * never know all the replica of the wal is broken(because of dn restarting). And this wal can
095   * never be open
096   */
097  @TestTemplate
098  public void test() throws Exception {
099    HRegionServer server = TEST_UTIL.getHBaseCluster().getRegionServer(0);
100    AbstractFSWAL<?> wal = (AbstractFSWAL<?>) server.getWAL(null);
101    Path currentFile = wal.getCurrentFileName();
102    // restart every dn to simulate a dn rolling upgrade
103    for (int i = 0, n = TEST_UTIL.getDFSCluster().getDataNodes().size(); i < n; i++) {
104      // This is NOT a bug, when restart dn in miniDFSCluster, it will remove the stopped dn from
105      // the dn list and then add to the tail of this list, we need to always restart the first one
106      // to simulate rolling upgrade of every dn.
107      TEST_UTIL.getDFSCluster().restartDataNode(0);
108      // sleep enough time so log roller can detect the pipeline break and roll log
109      Thread.sleep(DN_RESTART_INTERVAL);
110    }
111
112    if (!server.getFileSystem().exists(currentFile)) {
113      Path walRootDir = CommonFSUtils.getWALRootDir(TEST_UTIL.getConfiguration());
114      final Path oldLogDir = new Path(walRootDir, HConstants.HREGION_OLDLOGDIR_NAME);
115      currentFile = new Path(oldLogDir, currentFile.getName());
116    }
117    // if the log is not rolled, then we can never open this wal forever.
118    try (WALStreamReader reader = NoEOFWALStreamReader.create(TEST_UTIL.getTestFileSystem(),
119      currentFile, TEST_UTIL.getConfiguration())) {
120      reader.next();
121    }
122  }
123}