001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master; 019 020import static org.junit.jupiter.api.Assertions.assertEquals; 021import static org.junit.jupiter.api.Assertions.assertFalse; 022import static org.junit.jupiter.api.Assertions.assertTrue; 023 024import java.io.IOException; 025import java.time.Duration; 026import java.util.List; 027import org.apache.hadoop.fs.Path; 028import org.apache.hadoop.hbase.Cell; 029import org.apache.hadoop.hbase.HBaseTestingUtil; 030import org.apache.hadoop.hbase.ServerName; 031import org.apache.hadoop.hbase.SingleProcessHBaseCluster; 032import org.apache.hadoop.hbase.StartTestingClusterOption; 033import org.apache.hadoop.hbase.TableName; 034import org.apache.hadoop.hbase.Waiter; 035import org.apache.hadoop.hbase.client.Get; 036import org.apache.hadoop.hbase.client.Put; 037import org.apache.hadoop.hbase.client.RegionInfo; 038import org.apache.hadoop.hbase.client.Result; 039import org.apache.hadoop.hbase.client.Table; 040import org.apache.hadoop.hbase.regionserver.HRegionServer; 041import org.apache.hadoop.hbase.testclassification.LargeTests; 042import org.apache.hadoop.hbase.util.Bytes; 043import org.apache.hadoop.hbase.util.CommonFSUtils; 044import org.apache.hadoop.hbase.zookeeper.ZKUtil; 045import org.junit.jupiter.api.AfterEach; 046import org.junit.jupiter.api.BeforeEach; 047import org.junit.jupiter.api.Tag; 048import org.junit.jupiter.api.Test; 049 050/** 051 * Test reuse storefiles within data directory when cluster failover with a set of new region 052 * servers with different hostnames with or without WALs and Zookeeper ZNodes, the master and 053 * cluster should fail respectively if there is any situation considered as not supported. 054 */ 055@Tag(LargeTests.TAG) 056public class TestRecreateCluster { 057 058 private static final HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil(); 059 private static final int NUM_RS = 3; 060 private static final long TIMEOUT_MS = Duration.ofMinutes(1).toMillis(); 061 private static final long MASTER_INIT_TIMEOUT_MS = Duration.ofSeconds(45).toMillis(); 062 063 @BeforeEach 064 public void setup() throws Exception { 065 TEST_UTIL.getConfiguration().setLong("hbase.master.init.timeout.localHBaseCluster", 066 MASTER_INIT_TIMEOUT_MS); 067 TEST_UTIL.startMiniCluster(StartTestingClusterOption.builder().numRegionServers(NUM_RS) 068 .numDataNodes(NUM_RS).createWALDir(true).build()); 069 } 070 071 @AfterEach 072 public void tearDown() throws Exception { 073 TEST_UTIL.shutdownMiniCluster(); 074 } 075 076 @Test 077 public void testRecreateCluster_UserTableDisabled_ReuseWALsAndZNodes() throws Exception { 078 validateRecreateClusterWithUserDisabled(false, false); 079 } 080 081 @Test 082 public void testRecreateCluster_UserTableEnabled_ReuseWALsAndZNodes() throws Exception { 083 validateRecreateClusterWithUserTableEnabled(false, false); 084 } 085 086 @Test 087 public void testRecreateCluster_UserTableEnabled_CleanupZNodes() throws Exception { 088 // this is no longer failing because master region stores the information the region servers 089 // as long as it's gracefully flushed before shutdown 090 validateRecreateClusterWithUserTableEnabled(false, true); 091 } 092 093 @Test 094 public void testRecreateCluster_UserTableEnabled_CleanupWALAndZNodes() throws Exception { 095 validateRecreateClusterWithUserTableEnabled(true, true); 096 } 097 098 private void validateRecreateClusterWithUserDisabled(boolean cleanupWALs, boolean cleanUpZNodes) 099 throws Exception { 100 TableName tableName = TableName.valueOf("t1"); 101 prepareDataBeforeRecreate(TEST_UTIL, tableName); 102 TEST_UTIL.getAdmin().disableTable(tableName); 103 TEST_UTIL.waitTableDisabled(tableName.getName()); 104 restartHBaseCluster(cleanupWALs, cleanUpZNodes); 105 TEST_UTIL.getAdmin().enableTable(tableName); 106 validateDataAfterRecreate(TEST_UTIL, tableName); 107 } 108 109 private void validateRecreateClusterWithUserTableEnabled(boolean cleanupWALs, 110 boolean cleanUpZNodes) throws Exception { 111 TableName tableName = TableName.valueOf("t1"); 112 prepareDataBeforeRecreate(TEST_UTIL, tableName); 113 restartHBaseCluster(cleanupWALs, cleanUpZNodes); 114 validateDataAfterRecreate(TEST_UTIL, tableName); 115 } 116 117 private void restartHBaseCluster(boolean cleanUpWALs, boolean cleanUpZnodes) throws Exception { 118 // flush cache so that everything is on disk 119 TEST_UTIL.getMiniHBaseCluster().flushcache(TableName.META_TABLE_NAME); 120 TEST_UTIL.getMiniHBaseCluster().flushcache(); 121 122 List<ServerName> oldServers = 123 TEST_UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServersList(); 124 125 // make sure there is no procedures pending 126 TEST_UTIL.waitFor(TIMEOUT_MS, () -> TEST_UTIL.getHBaseCluster().getMaster().getProcedures() 127 .stream().filter(p -> p.isFinished()).findAny().isPresent()); 128 129 // shutdown and delete data if needed 130 Path walRootDirPath = TEST_UTIL.getMiniHBaseCluster().getMaster().getWALRootDir(); 131 Path rootDirPath = CommonFSUtils.getRootDir(TEST_UTIL.getConfiguration()); 132 TEST_UTIL.shutdownMiniHBaseCluster(); 133 134 if (cleanUpWALs) { 135 TEST_UTIL.getDFSCluster().getFileSystem().delete(walRootDirPath, true); 136 } 137 138 if (cleanUpZnodes) { 139 // delete all zk data 140 // we cannot keep ZK data because it will hold the meta region states as open and 141 // didn't submit a InitMetaProcedure 142 ZKUtil.deleteChildrenRecursively(TEST_UTIL.getZooKeeperWatcher(), 143 TEST_UTIL.getZooKeeperWatcher().getZNodePaths().baseZNode); 144 TEST_UTIL.shutdownMiniZKCluster(); 145 TEST_UTIL.startMiniZKCluster(); 146 } 147 148 TEST_UTIL.restartHBaseCluster(NUM_RS); 149 TEST_UTIL.waitFor(TIMEOUT_MS, new Waiter.Predicate<Exception>() { 150 @Override 151 public boolean evaluate() throws Exception { 152 return TEST_UTIL.getMiniHBaseCluster().getNumLiveRegionServers() == NUM_RS; 153 } 154 }); 155 156 // make sure we have a new set of region servers with different hostnames and ports 157 List<ServerName> newServers = 158 TEST_UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServersList(); 159 assertFalse(newServers.stream().filter((newServer) -> oldServers.contains(newServer)).findAny() 160 .isPresent()); 161 } 162 163 private void prepareDataBeforeRecreate(HBaseTestingUtil testUtil, TableName tableName) 164 throws Exception { 165 Table table = testUtil.createTable(tableName, "f"); 166 Put put = new Put(Bytes.toBytes("r1")); 167 put.addColumn(Bytes.toBytes("f"), Bytes.toBytes("c"), Bytes.toBytes("v")); 168 table.put(put); 169 170 ensureTableNotColocatedWithSystemTable(tableName, TableName.META_TABLE_NAME); 171 } 172 173 private void ensureTableNotColocatedWithSystemTable(TableName userTable, TableName systemTable) 174 throws IOException, InterruptedException { 175 SingleProcessHBaseCluster hbaseCluster = TEST_UTIL.getHBaseCluster(); 176 assertTrue(hbaseCluster.getRegionServerThreads().size() > 1, 177 "Please start more than 1 regionserver"); 178 int userTableServerNum = getServerNumForTableWithOnlyOneRegion(userTable); 179 int systemTableServerNum = getServerNumForTableWithOnlyOneRegion(systemTable); 180 181 if (userTableServerNum != systemTableServerNum) { 182 // no-ops if user table and system are already on a different host 183 return; 184 } 185 186 int destServerNum = (systemTableServerNum + 1) % NUM_RS; 187 assertTrue(systemTableServerNum != destServerNum); 188 189 HRegionServer systemTableServer = hbaseCluster.getRegionServer(systemTableServerNum); 190 HRegionServer destServer = hbaseCluster.getRegionServer(destServerNum); 191 assertTrue(!systemTableServer.equals(destServer)); 192 // make sure the dest server is live before moving region 193 hbaseCluster.waitForRegionServerToStart(destServer.getServerName().getHostname(), 194 destServer.getServerName().getPort(), TIMEOUT_MS); 195 // move region of userTable to a different regionserver not co-located with system table 196 TEST_UTIL.moveRegionAndWait(TEST_UTIL.getAdmin().getRegions(userTable).get(0), 197 destServer.getServerName()); 198 } 199 200 private int getServerNumForTableWithOnlyOneRegion(TableName tableName) throws IOException { 201 List<RegionInfo> tableRegionInfos = TEST_UTIL.getAdmin().getRegions(tableName); 202 assertEquals(1, tableRegionInfos.size()); 203 return TEST_UTIL.getHBaseCluster().getServerWith(tableRegionInfos.get(0).getRegionName()); 204 } 205 206 private void validateDataAfterRecreate(HBaseTestingUtil testUtil, TableName tableName) 207 throws Exception { 208 Table t1 = testUtil.getConnection().getTable(tableName); 209 Get get = new Get(Bytes.toBytes("r1")); 210 get.addColumn(Bytes.toBytes("f"), Bytes.toBytes("c")); 211 Result result = t1.get(get); 212 assertTrue(result.advance()); 213 Cell cell = result.current(); 214 assertEquals("v", 215 Bytes.toString(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength())); 216 assertFalse(result.advance()); 217 } 218}