001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master; 019 020import static org.junit.Assert.assertEquals; 021import static org.junit.Assert.assertFalse; 022import static org.junit.Assert.assertTrue; 023 024import java.io.IOException; 025import java.time.Duration; 026import java.util.List; 027import org.apache.hadoop.fs.Path; 028import org.apache.hadoop.hbase.Cell; 029import org.apache.hadoop.hbase.HBaseClassTestRule; 030import org.apache.hadoop.hbase.HBaseTestingUtil; 031import org.apache.hadoop.hbase.HConstants; 032import org.apache.hadoop.hbase.ServerName; 033import org.apache.hadoop.hbase.SingleProcessHBaseCluster; 034import org.apache.hadoop.hbase.TableName; 035import org.apache.hadoop.hbase.Waiter; 036import org.apache.hadoop.hbase.client.Get; 037import org.apache.hadoop.hbase.client.Put; 038import org.apache.hadoop.hbase.client.RegionInfo; 039import org.apache.hadoop.hbase.client.Result; 040import org.apache.hadoop.hbase.client.Table; 041import org.apache.hadoop.hbase.master.region.MasterRegionFactory; 042import org.apache.hadoop.hbase.procedure2.store.wal.WALProcedureStore; 043import org.apache.hadoop.hbase.regionserver.HRegionServer; 044import org.apache.hadoop.hbase.testclassification.LargeTests; 045import org.apache.hadoop.hbase.util.Bytes; 046import org.apache.hadoop.hbase.util.CommonFSUtils; 047import org.apache.hadoop.hbase.zookeeper.ZKUtil; 048import org.junit.Before; 049import org.junit.ClassRule; 050import org.junit.Rule; 051import org.junit.Test; 052import org.junit.experimental.categories.Category; 053import org.junit.rules.TestName; 054 055/** 056 * Test reuse storefiles within data directory when cluster failover with a set of new region 057 * servers with different hostnames with or without WALs and Zookeeper ZNodes, the master and 058 * cluster should fail respectively if there is any situation considered as not supported. 059 */ 060@Category({ LargeTests.class }) 061public class TestRecreateCluster { 062 @ClassRule 063 public static final HBaseClassTestRule CLASS_RULE = 064 HBaseClassTestRule.forClass(TestRecreateCluster.class); 065 066 @Rule 067 public TestName name = new TestName(); 068 069 private static final HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil(); 070 private static final int NUM_RS = 3; 071 private static final long TIMEOUT_MS = Duration.ofMinutes(1).toMillis(); 072 private static final long MASTER_INIT_TIMEOUT_MS = Duration.ofSeconds(45).toMillis(); 073 074 @Before 075 public void setup() { 076 TEST_UTIL.getConfiguration().setLong("hbase.master.init.timeout.localHBaseCluster", 077 MASTER_INIT_TIMEOUT_MS); 078 } 079 080 @Test 081 public void testRecreateCluster_UserTableDisabled_ReuseWALsAndZNodes() throws Exception { 082 validateRecreateClusterWithUserDisabled(false, false); 083 } 084 085 @Test 086 public void testRecreateCluster_UserTableEnabled_ReuseWALsAndZNodes() throws Exception { 087 validateRecreateClusterWithUserTableEnabled(false, false); 088 } 089 090 @Test 091 public void testRecreateCluster_UserTableEnabled_CleanupZNodes() throws Exception { 092 // this is no longer failing and is a different behavior compared to branch-2 093 validateRecreateClusterWithUserTableEnabled(false, true); 094 } 095 096 @Test(expected = IOException.class) 097 public void testRecreateCluster_UserTableEnabled_CleanupWALAndZNodes() throws Exception { 098 // master fails with InitMetaProcedure because it cannot delete existing meta table directory, 099 // region server cannot join and time-out the cluster starts. 100 validateRecreateClusterWithUserTableEnabled(true, true); 101 } 102 103 private void validateRecreateClusterWithUserDisabled(boolean cleanupWALs, boolean cleanUpZNodes) 104 throws Exception { 105 TEST_UTIL.startMiniCluster(NUM_RS); 106 try { 107 TableName tableName = TableName.valueOf("t1"); 108 prepareDataBeforeRecreate(TEST_UTIL, tableName); 109 TEST_UTIL.getAdmin().disableTable(tableName); 110 TEST_UTIL.waitTableDisabled(tableName.getName()); 111 restartHBaseCluster(cleanupWALs, cleanUpZNodes); 112 TEST_UTIL.getAdmin().enableTable(tableName); 113 validateDataAfterRecreate(TEST_UTIL, tableName); 114 } finally { 115 TEST_UTIL.shutdownMiniCluster(); 116 } 117 } 118 119 private void validateRecreateClusterWithUserTableEnabled(boolean cleanupWALs, 120 boolean cleanUpZNodes) throws Exception { 121 TEST_UTIL.startMiniCluster(NUM_RS); 122 try { 123 TableName tableName = TableName.valueOf("t1"); 124 prepareDataBeforeRecreate(TEST_UTIL, tableName); 125 restartHBaseCluster(cleanupWALs, cleanUpZNodes); 126 validateDataAfterRecreate(TEST_UTIL, tableName); 127 } finally { 128 TEST_UTIL.shutdownMiniCluster(); 129 } 130 } 131 132 private void restartHBaseCluster(boolean cleanUpWALs, boolean cleanUpZnodes) throws Exception { 133 // flush cache so that everything is on disk 134 TEST_UTIL.getMiniHBaseCluster().flushcache(TableName.META_TABLE_NAME); 135 TEST_UTIL.getMiniHBaseCluster().flushcache(); 136 137 List<ServerName> oldServers = 138 TEST_UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServersList(); 139 140 // make sure there is no procedures pending 141 TEST_UTIL.waitFor(TIMEOUT_MS, () -> TEST_UTIL.getHBaseCluster().getMaster().getProcedures() 142 .stream().filter(p -> p.isFinished()).findAny().isPresent()); 143 144 // shutdown and delete data if needed 145 Path walRootDirPath = TEST_UTIL.getMiniHBaseCluster().getMaster().getWALRootDir(); 146 Path rootDirPath = CommonFSUtils.getRootDir(TEST_UTIL.getConfiguration()); 147 TEST_UTIL.shutdownMiniHBaseCluster(); 148 149 if (cleanUpWALs) { 150 TEST_UTIL.getDFSCluster().getFileSystem() 151 .delete(new Path(rootDirPath, MasterRegionFactory.MASTER_STORE_DIR), true); 152 TEST_UTIL.getDFSCluster().getFileSystem() 153 .delete(new Path(walRootDirPath, MasterRegionFactory.MASTER_STORE_DIR), true); 154 TEST_UTIL.getDFSCluster().getFileSystem() 155 .delete(new Path(walRootDirPath, WALProcedureStore.MASTER_PROCEDURE_LOGDIR), true); 156 157 TEST_UTIL.getDFSCluster().getFileSystem() 158 .delete(new Path(walRootDirPath, HConstants.HREGION_LOGDIR_NAME), true); 159 TEST_UTIL.getDFSCluster().getFileSystem() 160 .delete(new Path(walRootDirPath, HConstants.HREGION_OLDLOGDIR_NAME), true); 161 } 162 163 if (cleanUpZnodes) { 164 // delete all zk data 165 // we cannot keep ZK data because it will hold the meta region states as open and 166 // didn't submit a InitMetaProcedure 167 ZKUtil.deleteChildrenRecursively(TEST_UTIL.getZooKeeperWatcher(), 168 TEST_UTIL.getZooKeeperWatcher().getZNodePaths().baseZNode); 169 TEST_UTIL.shutdownMiniZKCluster(); 170 TEST_UTIL.startMiniZKCluster(); 171 } 172 173 TEST_UTIL.restartHBaseCluster(NUM_RS); 174 TEST_UTIL.waitFor(TIMEOUT_MS, new Waiter.Predicate<Exception>() { 175 @Override 176 public boolean evaluate() throws Exception { 177 return TEST_UTIL.getMiniHBaseCluster().getNumLiveRegionServers() == NUM_RS; 178 } 179 }); 180 181 // make sure we have a new set of region servers with different hostnames and ports 182 List<ServerName> newServers = 183 TEST_UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServersList(); 184 assertFalse(newServers.stream().filter((newServer) -> oldServers.contains(newServer)).findAny() 185 .isPresent()); 186 } 187 188 private void prepareDataBeforeRecreate(HBaseTestingUtil testUtil, TableName tableName) 189 throws Exception { 190 Table table = testUtil.createTable(tableName, "f"); 191 Put put = new Put(Bytes.toBytes("r1")); 192 put.addColumn(Bytes.toBytes("f"), Bytes.toBytes("c"), Bytes.toBytes("v")); 193 table.put(put); 194 195 ensureTableNotColocatedWithSystemTable(tableName, TableName.META_TABLE_NAME); 196 } 197 198 private void ensureTableNotColocatedWithSystemTable(TableName userTable, TableName systemTable) 199 throws IOException, InterruptedException { 200 SingleProcessHBaseCluster hbaseCluster = TEST_UTIL.getHBaseCluster(); 201 assertTrue("Please start more than 1 regionserver", 202 hbaseCluster.getRegionServerThreads().size() > 1); 203 204 int userTableServerNum = getServerNumForTableWithOnlyOneRegion(userTable); 205 int systemTableServerNum = getServerNumForTableWithOnlyOneRegion(systemTable); 206 207 if (userTableServerNum != systemTableServerNum) { 208 // no-ops if user table and system are already on a different host 209 return; 210 } 211 212 int destServerNum = (systemTableServerNum + 1) % NUM_RS; 213 assertTrue(systemTableServerNum != destServerNum); 214 215 HRegionServer systemTableServer = hbaseCluster.getRegionServer(systemTableServerNum); 216 HRegionServer destServer = hbaseCluster.getRegionServer(destServerNum); 217 assertTrue(!systemTableServer.equals(destServer)); 218 // make sure the dest server is live before moving region 219 hbaseCluster.waitForRegionServerToStart(destServer.getServerName().getHostname(), 220 destServer.getServerName().getPort(), TIMEOUT_MS); 221 // move region of userTable to a different regionserver not co-located with system table 222 TEST_UTIL.moveRegionAndWait(TEST_UTIL.getAdmin().getRegions(userTable).get(0), 223 destServer.getServerName()); 224 } 225 226 private int getServerNumForTableWithOnlyOneRegion(TableName tableName) throws IOException { 227 List<RegionInfo> tableRegionInfos = TEST_UTIL.getAdmin().getRegions(tableName); 228 assertEquals(1, tableRegionInfos.size()); 229 return TEST_UTIL.getHBaseCluster().getServerWith(tableRegionInfos.get(0).getRegionName()); 230 } 231 232 private void validateDataAfterRecreate(HBaseTestingUtil testUtil, TableName tableName) 233 throws Exception { 234 Table t1 = testUtil.getConnection().getTable(tableName); 235 Get get = new Get(Bytes.toBytes("r1")); 236 get.addColumn(Bytes.toBytes("f"), Bytes.toBytes("c")); 237 Result result = t1.get(get); 238 assertTrue(result.advance()); 239 Cell cell = result.current(); 240 assertEquals("v", 241 Bytes.toString(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength())); 242 assertFalse(result.advance()); 243 } 244 245}