001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master; 019 020import static org.junit.Assert.assertFalse; 021import static org.junit.Assert.assertNotNull; 022import static org.junit.Assert.assertNull; 023import static org.junit.Assert.assertTrue; 024 025import java.io.IOException; 026import java.util.List; 027import java.util.Optional; 028import java.util.concurrent.CountDownLatch; 029import java.util.stream.Collectors; 030 031import org.apache.hadoop.conf.Configuration; 032import org.apache.hadoop.hbase.HBaseClassTestRule; 033import org.apache.hadoop.hbase.ServerName; 034import org.apache.hadoop.hbase.StartMiniClusterOption; 035import org.apache.hadoop.hbase.TableName; 036import org.apache.hadoop.hbase.client.RegionInfo; 037import org.apache.hadoop.hbase.client.Table; 038import org.apache.hadoop.hbase.master.assignment.AssignmentManager; 039import org.apache.hadoop.hbase.master.assignment.ServerState; 040import org.apache.hadoop.hbase.master.assignment.ServerStateNode; 041import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; 042import org.apache.hadoop.hbase.procedure2.Procedure; 043import 
org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.util.JVMClusterUtil;
import org.apache.zookeeper.KeeperException;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Kills and restarts a mini cluster on the same region server ports while the
 * {@link ServerCrashProcedure} (SCP) of one chosen region server is held on a latch, and checks
 * the {@link ServerStateNode} and {@code expireServer} results observed before and after that
 * SCP finishes. Test for HBASE-22964.
 */
@Category({ MasterTests.class, LargeTests.class })
public class TestClusterRestartFailover extends AbstractTestRestartCluster {

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestClusterRestartFailover.class);

  private static final Logger LOG = LoggerFactory.getLogger(TestClusterRestartFailover.class);

  /** Upper bound, in milliseconds, for every {@code UTIL.waitFor} call in this test. */
  private static final long WAIT_TIMEOUT_MS = 60000;

  // Both fields are written by the test thread and read inside
  // AssignmentManagerForTest#getRegionsOnServer, so they are volatile to make the writes visible.
  private static volatile CountDownLatch SCP_LATCH;
  private static volatile ServerName SERVER_FOR_TEST;

  @Override
  protected boolean splitWALCoordinatedByZk() {
    return true;
  }

  /**
   * Looks up the state node the active master's region states hold for the given server.
   * @param serverName the server to look up
   * @return the node, or {@code null} when the region states have none for {@code serverName}
   */
  private ServerStateNode getServerStateNode(ServerName serverName) {
    return UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates()
      .getServerNode(serverName);
  }

  /**
   * Test for HBASE-22964
   */
  @Test
  public void test() throws Exception {
    setupCluster();
    setupTable();

    // Find a server which does not carry hbase:namespace
    for (JVMClusterUtil.RegionServerThread thread : UTIL.getHBaseCluster()
      .getRegionServerThreads()) {
      if (!thread.getRegionServer().getOnlineTables().contains(TableName.NAMESPACE_TABLE_NAME)) {
        SERVER_FOR_TEST = thread.getRegionServer().getServerName();
        break;
      }
    }
    // Fail with a clear message instead of a later timeout/NPE if no candidate was found.
    assertNotNull("No region server found which does not carry " + TableName.NAMESPACE_TABLE_NAME,
      SERVER_FOR_TEST);
    UTIL.waitFor(WAIT_TIMEOUT_MS, () -> getServerStateNode(SERVER_FOR_TEST) != null);
    ServerStateNode serverNode = getServerStateNode(SERVER_FOR_TEST);
    assertNotNull(serverNode);
    assertTrue("serverNode should be ONLINE when cluster runs normally",
      serverNode.isInState(ServerState.ONLINE));

    // Arm the latch before the restart so getRegionsOnServer blocks for SERVER_FOR_TEST.
    SCP_LATCH = new CountDownLatch(1);

    // Shutdown cluster and restart, reusing the ports of the currently online servers
    List<Integer> ports =
      UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServersList().stream()
        .map(ServerName::getPort).collect(Collectors.toList());
    LOG.info("Shutting down cluster");
    UTIL.getHBaseCluster().killAll();
    UTIL.getHBaseCluster().waitUntilShutDown();
    LOG.info("Restarting cluster");
    UTIL.restartHBaseCluster(StartMiniClusterOption.builder().masterClass(HMasterForTest.class)
      .numMasters(1).numRegionServers(3).rsPorts(ports).build());
    UTIL.waitFor(WAIT_TIMEOUT_MS, () -> UTIL.getHBaseCluster().getMaster().isInitialized());

    UTIL.waitFor(WAIT_TIMEOUT_MS, () -> getServerStateNode(SERVER_FOR_TEST) != null);
    serverNode = getServerStateNode(SERVER_FOR_TEST);
    assertFalse("serverNode should not be ONLINE during SCP processing",
      serverNode.isInState(ServerState.ONLINE));
    Optional<Procedure<?>> procedure = UTIL.getHBaseCluster().getMaster().getProcedures().stream()
      .filter(p -> (p instanceof ServerCrashProcedure) &&
        ((ServerCrashProcedure) p).getServerName().equals(SERVER_FOR_TEST)).findAny();
    assertTrue("Should have one SCP for " + SERVER_FOR_TEST, procedure.isPresent());
    assertTrue("Submit the SCP for the same serverName " + SERVER_FOR_TEST + " which should fail",
      UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST) ==
        Procedure.NO_PROC_ID);

    // Release the latch and wait for the SCP to finish
    SCP_LATCH.countDown();
    UTIL.waitFor(WAIT_TIMEOUT_MS, () -> procedure.get().isFinished());

    // NOTE(review): the message reads as if no further SCP may be scheduled, yet this assertion
    // requires expireServer to return something other than NO_PROC_ID - confirm which is intended.
    assertFalse("Even when the SCP is finished, the duplicate SCP should not be scheduled for " +
        SERVER_FOR_TEST,
      UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST) ==
        Procedure.NO_PROC_ID);
    serverNode = getServerStateNode(SERVER_FOR_TEST);
    assertNull("serverNode should be deleted after SCP finished", serverNode);
  }

  /**
   * Starts a mini cluster with one {@link HMasterForTest} and three region servers, waits until
   * the master is initialized and no {@link ServerCrashProcedure} is listed among its procedures,
   * then calls {@code balanceSwitch(false)} on the master.
   */
  private void setupCluster() throws Exception {
    UTIL.startMiniCluster(
      StartMiniClusterOption.builder().masterClass(HMasterForTest.class).numMasters(1)
        .numRegionServers(3).build());
    UTIL.waitFor(WAIT_TIMEOUT_MS, () -> UTIL.getMiniHBaseCluster().getMaster().isInitialized());
    // wait for all SCPs finished
    UTIL.waitFor(WAIT_TIMEOUT_MS, () -> UTIL.getHBaseCluster().getMaster().getProcedures().stream()
      .noneMatch(p -> p instanceof ServerCrashProcedure));
    UTIL.getHBaseCluster().getMaster().balanceSwitch(false);
  }

  /**
   * Creates the multi-region table {@code TABLES[0]}, waits for it to become available and calls
   * {@code UTIL.loadTable} on it 100 times.
   */
  private void setupTable() throws Exception {
    TableName tableName = TABLES[0];
    UTIL.createMultiRegionTable(tableName, FAMILY);
    UTIL.waitTableAvailable(tableName);
    // try-with-resources so the Table handle is released even if loading fails
    try (Table table = UTIL.getConnection().getTable(tableName)) {
      for (int i = 0; i < 100; i++) {
        UTIL.loadTable(table, FAMILY);
      }
    }
  }

  /**
   * {@link HMaster} whose {@link #createAssignmentManager(MasterServices)} returns an
   * {@link AssignmentManagerForTest}.
   */
  public static final class HMasterForTest extends HMaster {

    public HMasterForTest(Configuration conf) throws IOException, KeeperException {
      super(conf);
    }

    @Override
    protected AssignmentManager createAssignmentManager(MasterServices master) {
      return new AssignmentManagerForTest(master);
    }
  }

  /**
   * {@link AssignmentManager} whose {@link #getRegionsOnServer(ServerName)} blocks on
   * {@code SCP_LATCH} when asked about {@code SERVER_FOR_TEST}.
   */
  private static final class AssignmentManagerForTest extends AssignmentManager {

    public AssignmentManagerForTest(MasterServices master) {
      super(master);
    }

    /**
     * Returns the regions reported by the superclass, after waiting on the latch when one is
     * armed and {@code serverName} is the server under test.
     * @param serverName the server whose regions are requested
     * @return the list obtained from {@code super.getRegionsOnServer(serverName)}
     * @throws RuntimeException wrapping the {@link InterruptedException} if the wait is
     *           interrupted; the thread's interrupt flag is restored first
     */
    @Override
    public List<RegionInfo> getRegionsOnServer(ServerName serverName) {
      List<RegionInfo> regions = super.getRegionsOnServer(serverName);
      // Snapshot the fields once: another caller may null SCP_LATCH between the check and await().
      CountDownLatch latch = SCP_LATCH;
      ServerName serverForTest = SERVER_FOR_TEST;
      // ServerCrashProcedure will call this method, so wait the CountDownLatch here
      if (latch != null && serverForTest != null && serverName.equals(serverForTest)) {
        try {
          LOG.info("ServerCrashProcedure wait the CountDownLatch here");
          latch.await();
          LOG.info("Continue the ServerCrashProcedure");
          SCP_LATCH = null;
        } catch (InterruptedException e) {
          // Preserve the interrupt status for code further up the stack before rethrowing.
          Thread.currentThread().interrupt();
          throw new RuntimeException(e);
        }
      }
      return regions;
    }
  }
}