001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master; 019 020import static org.junit.Assert.assertEquals; 021import static org.junit.Assert.assertNotNull; 022import static org.junit.Assert.assertTrue; 023import java.util.List; 024import java.util.concurrent.TimeUnit; 025import org.apache.hadoop.hbase.ClusterMetrics; 026import org.apache.hadoop.hbase.HBaseClassTestRule; 027import org.apache.hadoop.hbase.HBaseTestingUtility; 028import org.apache.hadoop.hbase.MiniHBaseCluster; 029import org.apache.hadoop.hbase.ServerName; 030import org.apache.hadoop.hbase.StartMiniClusterOption; 031import org.apache.hadoop.hbase.master.RegionState.State; 032import org.apache.hadoop.hbase.regionserver.HRegionServer; 033import org.apache.hadoop.hbase.testclassification.FlakeyTests; 034import org.apache.hadoop.hbase.testclassification.LargeTests; 035import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; 036import org.apache.hadoop.hbase.zookeeper.MetaTableLocator; 037import org.junit.ClassRule; 038import org.junit.Rule; 039import org.junit.Test; 040import org.junit.experimental.categories.Category; 041import org.junit.rules.TestName; 042import org.slf4j.Logger; 043import org.slf4j.LoggerFactory; 044 045@Category({FlakeyTests.class, LargeTests.class}) 046public class TestMasterFailover { 047 048 @ClassRule 049 public static final HBaseClassTestRule CLASS_RULE = 050 HBaseClassTestRule.forClass(TestMasterFailover.class); 051 052 private static final Logger LOG = LoggerFactory.getLogger(TestMasterFailover.class); 053 @Rule public TestName name = new TestName(); 054 055 /** 056 * Simple test of master failover. 057 * <p> 058 * Starts with three masters. Kills a backup master. Then kills the active 059 * master. Ensures the final master becomes active and we can still contact 060 * the cluster. 061 */ 062 @Test 063 public void testSimpleMasterFailover() throws Exception { 064 final int NUM_MASTERS = 3; 065 final int NUM_RS = 3; 066 067 // Start the cluster 068 HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); 069 try { 070 StartMiniClusterOption option = StartMiniClusterOption.builder() 071 .numMasters(NUM_MASTERS).numRegionServers(NUM_RS).numDataNodes(NUM_RS).build(); 072 TEST_UTIL.startMiniCluster(option); 073 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); 074 075 // get all the master threads 076 List<MasterThread> masterThreads = cluster.getMasterThreads(); 077 078 // wait for each to come online 079 for (MasterThread mt : masterThreads) { 080 assertTrue(mt.isAlive()); 081 } 082 083 // verify only one is the active master and we have right number 084 int numActive = 0; 085 int activeIndex = -1; 086 ServerName activeName = null; 087 HMaster active = null; 088 for (int i = 0; i < masterThreads.size(); i++) { 089 if (masterThreads.get(i).getMaster().isActiveMaster()) { 090 numActive++; 091 activeIndex = i; 092 active = masterThreads.get(activeIndex).getMaster(); 093 activeName = active.getServerName(); 094 } 095 } 096 assertEquals(1, numActive); 097 assertEquals(NUM_MASTERS, masterThreads.size()); 098 LOG.info("Active master " + activeName); 099 100 // Check that ClusterStatus reports the correct active and backup masters 101 assertNotNull(active); 102 ClusterMetrics status = active.getClusterMetrics(); 103 assertEquals(activeName, status.getMasterName()); 104 assertEquals(2, status.getBackupMasterNames().size()); 105 106 // attempt to stop one of the inactive masters 107 int backupIndex = (activeIndex == 0 ? 1 : activeIndex - 1); 108 HMaster master = cluster.getMaster(backupIndex); 109 LOG.debug("\n\nStopping a backup master: " + master.getServerName() + "\n"); 110 cluster.stopMaster(backupIndex, false); 111 cluster.waitOnMaster(backupIndex); 112 113 // Verify still one active master and it's the same 114 for (int i = 0; i < masterThreads.size(); i++) { 115 if (masterThreads.get(i).getMaster().isActiveMaster()) { 116 assertEquals(activeName, masterThreads.get(i).getMaster().getServerName()); 117 activeIndex = i; 118 active = masterThreads.get(activeIndex).getMaster(); 119 } 120 } 121 assertEquals(1, numActive); 122 assertEquals(2, masterThreads.size()); 123 int rsCount = masterThreads.get(activeIndex).getMaster().getClusterMetrics() 124 .getLiveServerMetrics().size(); 125 LOG.info("Active master " + active.getServerName() + " managing " + rsCount + 126 " regions servers"); 127 assertEquals(3, rsCount); 128 129 // wait for the active master to acknowledge loss of the backup from ZK 130 final HMaster activeFinal = active; 131 TEST_UTIL.waitFor( 132 TimeUnit.SECONDS.toMillis(30), () -> activeFinal.getBackupMasters().size() == 1); 133 134 // Check that ClusterStatus reports the correct active and backup masters 135 assertNotNull(active); 136 status = active.getClusterMetrics(); 137 assertEquals(activeName, status.getMasterName()); 138 assertEquals(1, status.getBackupMasterNames().size()); 139 140 // kill the active master 141 LOG.debug("\n\nStopping the active master " + active.getServerName() + "\n"); 142 cluster.stopMaster(activeIndex, false); 143 cluster.waitOnMaster(activeIndex); 144 145 // wait for an active master to show up and be ready 146 assertTrue(cluster.waitForActiveAndReadyMaster()); 147 148 LOG.debug("\n\nVerifying backup master is now active\n"); 149 // should only have one master now 150 assertEquals(1, masterThreads.size()); 151 152 // and he should be active 153 active = masterThreads.get(0).getMaster(); 154 assertNotNull(active); 155 status = active.getClusterMetrics(); 156 ServerName masterName = status.getMasterName(); 157 assertNotNull(masterName); 158 assertEquals(active.getServerName(), masterName); 159 assertTrue(active.isActiveMaster()); 160 assertEquals(0, status.getBackupMasterNames().size()); 161 int rss = status.getLiveServerMetrics().size(); 162 LOG.info("Active master {} managing {} region servers", masterName.getServerName(), rss); 163 assertEquals(3, rss); 164 } finally { 165 // Stop the cluster 166 TEST_UTIL.shutdownMiniCluster(); 167 } 168 } 169 170 /** 171 * Test meta in transition when master failover. 172 * This test used to manipulate region state up in zk. That is not allowed any more in hbase2 173 * so I removed that messing. That makes this test anemic. 174 */ 175 @Test 176 public void testMetaInTransitionWhenMasterFailover() throws Exception { 177 // Start the cluster 178 HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); 179 TEST_UTIL.startMiniCluster(); 180 try { 181 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); 182 LOG.info("Cluster started"); 183 184 HMaster activeMaster = cluster.getMaster(); 185 ServerName metaServerName = cluster.getServerHoldingMeta(); 186 HRegionServer hrs = cluster.getRegionServer(metaServerName); 187 188 // Now kill master, meta should remain on rs, where we placed it before. 189 LOG.info("Aborting master"); 190 activeMaster.abort("test-kill"); 191 cluster.waitForMasterToStop(activeMaster.getServerName(), 30000); 192 LOG.info("Master has aborted"); 193 194 // meta should remain where it was 195 RegionState metaState = MetaTableLocator.getMetaRegionState(hrs.getZooKeeper()); 196 assertEquals("hbase:meta should be online on RS", 197 metaState.getServerName(), metaServerName); 198 assertEquals("hbase:meta should be online on RS", State.OPEN, metaState.getState()); 199 200 // Start up a new master 201 LOG.info("Starting up a new master"); 202 activeMaster = cluster.startMaster().getMaster(); 203 LOG.info("Waiting for master to be ready"); 204 cluster.waitForActiveAndReadyMaster(); 205 LOG.info("Master is ready"); 206 207 // ensure meta is still deployed on RS 208 metaState = MetaTableLocator.getMetaRegionState(activeMaster.getZooKeeper()); 209 assertEquals("hbase:meta should be online on RS", 210 metaState.getServerName(), metaServerName); 211 assertEquals("hbase:meta should be online on RS", State.OPEN, metaState.getState()); 212 213 // Done, shutdown the cluster 214 } finally { 215 TEST_UTIL.shutdownMiniCluster(); 216 } 217 } 218} 219