/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.wal;

import java.io.IOException;
import java.util.stream.Stream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseParameterizedTestTemplate;
import org.apache.hadoop.hbase.HBaseTestingUtil;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.RegionServerTests;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.TestTemplate;
import org.junit.jupiter.params.provider.Arguments;

/**
 * Checks that a WAL file can still be opened for reading after every datanode of the mini DFS
 * cluster has been restarted one after another. The test is run once for each WAL provider name
 * returned by {@link #parameters()}.
 */
@Tag(RegionServerTests.TAG)
@Tag(LargeTests.TAG)
@HBaseParameterizedTestTemplate
public class TestWALOpenAfterDNRollingStart {

  private static final HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();

  // Sleep time (ms) before restarting the next dn; we need to wait for the current dn to finish
  // starting up.
  private static final long DN_RESTART_INTERVAL = 15000;

  // Interval (ms) of checking low replication. It must be smaller than DN_RESTART_INTERVAL so
  // that a low replication case will be detected and the wal will be rolled.
  private static final long CHECK_LOW_REPLICATION_INTERVAL = 10000;

  /** Value written to {@code hbase.wal.provider} before the mini HBase cluster is started. */
  public String walProvider;

  /**
   * @param walProvider the WAL provider name to run this test invocation with
   */
  public TestWALOpenAfterDNRollingStart(String walProvider) {
    this.walProvider = walProvider;
  }

  /**
   * @return one argument set per WAL provider under test ("asyncfs" and "filesystem")
   */
  public static Stream<Arguments> parameters() {
    return Stream.of(Arguments.of("asyncfs"), Arguments.of("filesystem"));
  }

  /**
   * Configures the shared testing util and starts the mini DFS cluster (requested with 3 servers)
   * and the mini ZooKeeper cluster once for the whole class.
   */
  @BeforeAll
  public static void setUpBeforeClass() throws Exception {
    // don't let the hdfs client choose a new replica when a dn is down
    TEST_UTIL.getConfiguration()
      .setBoolean("dfs.client.block.write.replace-datanode-on-failure.enable", false);
    TEST_UTIL.getConfiguration().setLong("hbase.regionserver.hlog.check.lowreplication.interval",
      CHECK_LOW_REPLICATION_INTERVAL);
    TEST_UTIL.startMiniDFSCluster(3);
    TEST_UTIL.startMiniZKCluster();
  }

  /** Selects the WAL provider for this invocation and starts a fresh mini HBase cluster. */
  @BeforeEach
  public void setUp() throws IOException, InterruptedException {
    TEST_UTIL.getConfiguration().set("hbase.wal.provider", walProvider);
    TEST_UTIL.startMiniHBaseCluster();
  }

  /** Shuts down the mini HBase cluster started by {@link #setUp()}. */
  @AfterEach
  public void tearDown() throws Exception {
    TEST_UTIL.shutdownMiniHBaseCluster();
  }

  /** Shuts down the remaining mini clusters once all invocations have finished. */
  @AfterAll
  public static void tearDownAfterClass() throws Exception {
    TEST_UTIL.shutdownMiniCluster();
  }

  /**
   * See HBASE-18132. This is a test case for failing to open a wal (for replication, for example)
   * after all datanodes have been restarted (a rolling upgrade, for example). Before that patch,
   * low replication detection was only used when syncing the wal. But if the wal has not had any
   * entry written, it will never know that all the replicas of the wal are broken (because of the
   * dn restarting), and this wal can never be opened.
   */
  @TestTemplate
  public void test() throws Exception {
    HRegionServer server = TEST_UTIL.getHBaseCluster().getRegionServer(0);
    AbstractFSWAL<?> wal = (AbstractFSWAL<?>) server.getWAL(null);
    Path currentFile = wal.getCurrentFileName();
    // restart every dn to simulate a dn rolling upgrade
    for (int i = 0, n = TEST_UTIL.getDFSCluster().getDataNodes().size(); i < n; i++) {
      // This is NOT a bug: when restarting a dn in miniDFSCluster, it will remove the stopped dn
      // from the dn list and then add it to the tail of this list, so we need to always restart
      // the first one to simulate a rolling upgrade of every dn.
      TEST_UTIL.getDFSCluster().restartDataNode(0);
      // sleep enough time so the log roller can detect the pipeline break and roll the log
      Thread.sleep(DN_RESTART_INTERVAL);
    }

    // If the file is no longer at its original location, look for it under the old logs dir.
    if (!server.getFileSystem().exists(currentFile)) {
      Path walRootDir = CommonFSUtils.getWALRootDir(TEST_UTIL.getConfiguration());
      final Path oldLogDir = new Path(walRootDir, HConstants.HREGION_OLDLOGDIR_NAME);
      currentFile = new Path(oldLogDir, currentFile.getName());
    }
    // If the log was not rolled, then we can never open this wal; creating the reader and calling
    // next() must therefore complete without throwing.
    try (WALStreamReader reader = NoEOFWALStreamReader.create(TEST_UTIL.getTestFileSystem(),
      currentFile, TEST_UTIL.getConfiguration())) {
      reader.next();
    }
  }
}