/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.hbase.ClusterMetrics;
import org.apache.hadoop.hbase.HBaseTestingUtil;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.SingleProcessHBaseCluster;
import org.apache.hadoop.hbase.StartTestingClusterOption;
import org.apache.hadoop.hbase.master.RegionState.State;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.testclassification.FlakeyTests;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestInfo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Master failover tests that run against a single-process mini cluster: one stops masters one at
 * a time and checks which master is active afterwards, the other aborts the only master and
 * checks the hbase:meta region state before and after a replacement master starts.
 */
@Tag(FlakeyTests.TAG)
@Tag(LargeTests.TAG)
public class TestMasterFailover {

  private static final Logger LOG = LoggerFactory.getLogger(TestMasterFailover.class);
  // Name of the test method currently running; set before each test, not read within this class.
  private String testMethodName;

  /**
   * Records the name of the test method that is about to run.
   * @param testInfo JUnit-supplied description of the current test
   */
  @BeforeEach
  public void setTestMethod(TestInfo testInfo) {
    testMethodName = testInfo.getTestMethod().get().getName();
  }

  /**
   * Simple test of master failover.
   * <p>
   * Starts with three masters. Kills a backup master. Then kills the active master. Ensures the
   * final master becomes active and we can still contact the cluster.
   */
  @Test
  public void testSimpleMasterFailover() throws Exception {
    final int NUM_MASTERS = 3;
    final int NUM_RS = 3;

    // Start the cluster
    HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
    try {
      StartTestingClusterOption option = StartTestingClusterOption.builder().numMasters(NUM_MASTERS)
        .numRegionServers(NUM_RS).numDataNodes(NUM_RS).build();
      TEST_UTIL.startMiniCluster(option);
      SingleProcessHBaseCluster cluster = TEST_UTIL.getHBaseCluster();

      // get all the master threads
      List<MasterThread> masterThreads = cluster.getMasterThreads();

      // every master thread must be running
      for (MasterThread mt : masterThreads) {
        assertTrue(mt.isAlive());
      }

      // verify only one is the active master and we have right number
      int numActive = 0;
      int activeIndex = -1;
      ServerName activeName = null;
      HMaster active = null;
      for (int i = 0; i < masterThreads.size(); i++) {
        HMaster candidate = masterThreads.get(i).getMaster();
        if (candidate.isActiveMaster()) {
          numActive++;
          activeIndex = i;
          active = candidate;
          activeName = active.getServerName();
        }
      }
      assertEquals(1, numActive);
      assertEquals(NUM_MASTERS, masterThreads.size());
      LOG.info("Active master {}", activeName);

      // Check that ClusterStatus reports the correct active and backup masters
      assertNotNull(active);
      ClusterMetrics status = active.getClusterMetrics();
      assertEquals(activeName, status.getMasterName());
      assertEquals(2, status.getBackupMasterNames().size());

      // attempt to stop one of the inactive masters
      int backupIndex = (activeIndex == 0 ? 1 : activeIndex - 1);
      HMaster master = cluster.getMaster(backupIndex);
      LOG.debug("\n\nStopping a backup master: {}\n", master.getServerName());
      cluster.stopMaster(backupIndex, false);
      cluster.waitOnMaster(backupIndex);

      // Verify still one active master and it's the same. The count is redone from scratch so the
      // assertion below checks the state after the stop rather than the value computed before it.
      numActive = 0;
      for (int i = 0; i < masterThreads.size(); i++) {
        HMaster candidate = masterThreads.get(i).getMaster();
        if (candidate.isActiveMaster()) {
          numActive++;
          assertEquals(activeName, candidate.getServerName());
          // the index of the active master may have shifted now that a thread is gone
          activeIndex = i;
          active = candidate;
        }
      }
      assertEquals(1, numActive);
      assertEquals(2, masterThreads.size());
      int rsCount = active.getClusterMetrics().getLiveServerMetrics().size();
      LOG.info("Active master {} managing {} regions servers", active.getServerName(), rsCount);
      assertEquals(NUM_RS, rsCount);

      // wait for the active master to acknowledge loss of the backup from ZK
      final HMaster activeFinal = active;
      TEST_UTIL.waitFor(TimeUnit.MINUTES.toMillis(5),
        () -> activeFinal.getBackupMasters().size() == 1);

      // Check that ClusterStatus reports the correct active and backup masters
      status = active.getClusterMetrics();
      assertEquals(activeName, status.getMasterName());
      assertEquals(1, status.getBackupMasterNames().size());

      // kill the active master
      LOG.debug("\n\nStopping the active master {}\n", active.getServerName());
      cluster.stopMaster(activeIndex, false);
      cluster.waitOnMaster(activeIndex);

      // wait for an active master to show up and be ready
      assertTrue(cluster.waitForActiveAndReadyMaster());

      LOG.debug("\n\nVerifying backup master is now active\n");
      // should only have one master now
      assertEquals(1, masterThreads.size());

      // and he should be active
      active = masterThreads.get(0).getMaster();
      assertNotNull(active);
      status = active.getClusterMetrics();
      ServerName masterName = status.getMasterName();
      assertNotNull(masterName);
      assertEquals(active.getServerName(), masterName);
      assertTrue(active.isActiveMaster());
      assertEquals(0, status.getBackupMasterNames().size());
      int rss = status.getLiveServerMetrics().size();
      LOG.info("Active master {} managing {} region servers", masterName.getServerName(), rss);
      assertEquals(NUM_RS, rss);
    } finally {
      // Stop the cluster
      TEST_UTIL.shutdownMiniCluster();
    }
  }

  /**
   * Test meta in transition when master failover. This test used to manipulate region state up in
   * zk. That is not allowed any more in hbase2 so I removed that messing. That makes this test
   * anemic.
   */
  @Test
  public void testMetaInTransitionWhenMasterFailover() throws Exception {
    HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
    try {
      // Start the cluster inside the try so the finally block also runs if startup fails,
      // matching testSimpleMasterFailover.
      TEST_UTIL.startMiniCluster();
      SingleProcessHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
      LOG.info("Cluster started");

      HMaster activeMaster = cluster.getMaster();
      ServerName metaServerName = cluster.getServerHoldingMeta();
      HRegionServer hrs = cluster.getRegionServer(metaServerName);

      // Now kill master, meta should remain on rs, where we placed it before.
      LOG.info("Aborting master");
      activeMaster.abort("test-kill");
      cluster.waitForMasterToStop(activeMaster.getServerName(), 30000);
      LOG.info("Master has aborted");

      // meta should remain where it was
      RegionState metaState = MetaTableLocator.getMetaRegionState(hrs.getZooKeeper());
      assertEquals(metaServerName, metaState.getServerName(), "hbase:meta should be online on RS");
      assertEquals(State.OPEN, metaState.getState(), "hbase:meta should be online on RS");

      // Start up a new master
      LOG.info("Starting up a new master");
      activeMaster = cluster.startMaster().getMaster();
      LOG.info("Waiting for master to be ready");
      assertTrue(cluster.waitForActiveAndReadyMaster());
      LOG.info("Master is ready");

      // ensure meta is still deployed on RS
      metaState = MetaTableLocator.getMetaRegionState(activeMaster.getZooKeeper());
      assertEquals(metaServerName, metaState.getServerName(), "hbase:meta should be online on RS");
      assertEquals(State.OPEN, metaState.getState(), "hbase:meta should be online on RS");
    } finally {
      // Done, shutdown the cluster
      TEST_UTIL.shutdownMiniCluster();
    }
  }
}