001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master; 019 020import static org.junit.Assert.assertEquals; 021import static org.junit.Assert.assertNotNull; 022import static org.junit.Assert.assertTrue; 023 024import java.util.List; 025import org.apache.hadoop.hbase.ClusterMetrics; 026import org.apache.hadoop.hbase.HBaseClassTestRule; 027import org.apache.hadoop.hbase.HBaseTestingUtility; 028import org.apache.hadoop.hbase.MiniHBaseCluster; 029import org.apache.hadoop.hbase.ServerName; 030import org.apache.hadoop.hbase.StartMiniClusterOption; 031import org.apache.hadoop.hbase.master.RegionState.State; 032import org.apache.hadoop.hbase.regionserver.HRegionServer; 033import org.apache.hadoop.hbase.testclassification.FlakeyTests; 034import org.apache.hadoop.hbase.testclassification.LargeTests; 035import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; 036import org.apache.hadoop.hbase.zookeeper.MetaTableLocator; 037import org.junit.ClassRule; 038import org.junit.Rule; 039import org.junit.Test; 040import org.junit.experimental.categories.Category; 041import org.junit.rules.TestName; 042import org.slf4j.Logger; 043import org.slf4j.LoggerFactory; 044 045@Category({FlakeyTests.class, LargeTests.class}) 046public class TestMasterFailover { 047 048 @ClassRule 049 public static final HBaseClassTestRule CLASS_RULE = 050 HBaseClassTestRule.forClass(TestMasterFailover.class); 051 052 private static final Logger LOG = LoggerFactory.getLogger(TestMasterFailover.class); 053 @Rule public TestName name = new TestName(); 054 055 /** 056 * Simple test of master failover. 057 * <p> 058 * Starts with three masters. Kills a backup master. Then kills the active 059 * master. Ensures the final master becomes active and we can still contact 060 * the cluster. 061 */ 062 @Test 063 public void testSimpleMasterFailover() throws Exception { 064 final int NUM_MASTERS = 3; 065 final int NUM_RS = 3; 066 067 // Start the cluster 068 HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); 069 try { 070 StartMiniClusterOption option = StartMiniClusterOption.builder() 071 .numMasters(NUM_MASTERS).numRegionServers(NUM_RS).numDataNodes(NUM_RS).build(); 072 TEST_UTIL.startMiniCluster(option); 073 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); 074 075 // get all the master threads 076 List<MasterThread> masterThreads = cluster.getMasterThreads(); 077 078 // wait for each to come online 079 for (MasterThread mt : masterThreads) { 080 assertTrue(mt.isAlive()); 081 } 082 083 // verify only one is the active master and we have right number 084 int numActive = 0; 085 int activeIndex = -1; 086 ServerName activeName = null; 087 HMaster active = null; 088 for (int i = 0; i < masterThreads.size(); i++) { 089 if (masterThreads.get(i).getMaster().isActiveMaster()) { 090 numActive++; 091 activeIndex = i; 092 active = masterThreads.get(activeIndex).getMaster(); 093 activeName = active.getServerName(); 094 } 095 } 096 assertEquals(1, numActive); 097 assertEquals(NUM_MASTERS, masterThreads.size()); 098 LOG.info("Active master " + activeName); 099 100 // Check that ClusterStatus reports the correct active and backup masters 101 assertNotNull(active); 102 ClusterMetrics status = active.getClusterMetrics(); 103 assertTrue(status.getMasterName().equals(activeName)); 104 assertEquals(2, status.getBackupMasterNames().size()); 105 106 // attempt to stop one of the inactive masters 107 int backupIndex = (activeIndex == 0 ? 1 : activeIndex - 1); 108 HMaster master = cluster.getMaster(backupIndex); 109 LOG.debug("\n\nStopping a backup master: " + master.getServerName() + "\n"); 110 cluster.stopMaster(backupIndex, false); 111 cluster.waitOnMaster(backupIndex); 112 113 // Verify still one active master and it's the same 114 for (int i = 0; i < masterThreads.size(); i++) { 115 if (masterThreads.get(i).getMaster().isActiveMaster()) { 116 assertTrue(activeName.equals(masterThreads.get(i).getMaster().getServerName())); 117 activeIndex = i; 118 active = masterThreads.get(activeIndex).getMaster(); 119 } 120 } 121 assertEquals(1, numActive); 122 assertEquals(2, masterThreads.size()); 123 int rsCount = masterThreads.get(activeIndex).getMaster().getClusterMetrics() 124 .getLiveServerMetrics().size(); 125 LOG.info("Active master " + active.getServerName() + " managing " + rsCount + 126 " regions servers"); 127 assertEquals(3, rsCount); 128 129 // Check that ClusterStatus reports the correct active and backup masters 130 assertNotNull(active); 131 status = active.getClusterMetrics(); 132 assertTrue(status.getMasterName().equals(activeName)); 133 assertEquals(1, status.getBackupMasterNames().size()); 134 135 // kill the active master 136 LOG.debug("\n\nStopping the active master " + active.getServerName() + "\n"); 137 cluster.stopMaster(activeIndex, false); 138 cluster.waitOnMaster(activeIndex); 139 140 // wait for an active master to show up and be ready 141 assertTrue(cluster.waitForActiveAndReadyMaster()); 142 143 LOG.debug("\n\nVerifying backup master is now active\n"); 144 // should only have one master now 145 assertEquals(1, masterThreads.size()); 146 147 // and he should be active 148 active = masterThreads.get(0).getMaster(); 149 assertNotNull(active); 150 status = active.getClusterMetrics(); 151 ServerName mastername = status.getMasterName(); 152 assertTrue(mastername.equals(active.getServerName())); 153 assertTrue(active.isActiveMaster()); 154 assertEquals(0, status.getBackupMasterNames().size()); 155 int rss = status.getLiveServerMetrics().size(); 156 LOG.info("Active master " + mastername.getServerName() + " managing " + 157 rss + " region servers"); 158 assertEquals(3, rss); 159 } finally { 160 // Stop the cluster 161 TEST_UTIL.shutdownMiniCluster(); 162 } 163 } 164 165 /** 166 * Test meta in transition when master failover. 167 * This test used to manipulate region state up in zk. That is not allowed any more in hbase2 168 * so I removed that messing. That makes this test anemic. 169 */ 170 @Test 171 public void testMetaInTransitionWhenMasterFailover() throws Exception { 172 // Start the cluster 173 HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); 174 TEST_UTIL.startMiniCluster(); 175 try { 176 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); 177 LOG.info("Cluster started"); 178 179 HMaster activeMaster = cluster.getMaster(); 180 ServerName metaServerName = cluster.getServerHoldingMeta(); 181 HRegionServer hrs = cluster.getRegionServer(metaServerName); 182 183 // Now kill master, meta should remain on rs, where we placed it before. 184 LOG.info("Aborting master"); 185 activeMaster.abort("test-kill"); 186 cluster.waitForMasterToStop(activeMaster.getServerName(), 30000); 187 LOG.info("Master has aborted"); 188 189 // meta should remain where it was 190 RegionState metaState = MetaTableLocator.getMetaRegionState(hrs.getZooKeeper()); 191 assertEquals("hbase:meta should be online on RS", 192 metaState.getServerName(), metaServerName); 193 assertEquals("hbase:meta should be online on RS", State.OPEN, metaState.getState()); 194 195 // Start up a new master 196 LOG.info("Starting up a new master"); 197 activeMaster = cluster.startMaster().getMaster(); 198 LOG.info("Waiting for master to be ready"); 199 cluster.waitForActiveAndReadyMaster(); 200 LOG.info("Master is ready"); 201 202 // ensure meta is still deployed on RS 203 metaState = MetaTableLocator.getMetaRegionState(activeMaster.getZooKeeper()); 204 assertEquals("hbase:meta should be online on RS", 205 metaState.getServerName(), metaServerName); 206 assertEquals("hbase:meta should be online on RS", State.OPEN, metaState.getState()); 207 208 // Done, shutdown the cluster 209 } finally { 210 TEST_UTIL.shutdownMiniCluster(); 211 } 212 } 213} 214