/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.util.List;
import org.apache.hadoop.hbase.ClusterMetrics;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.master.RegionState.State;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.testclassification.FlakeyTests;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
import org.junit.ClassRule;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Master failover tests that run against a mini cluster started by
 * {@link HBaseTestingUtility}. Each test starts and shuts down its own cluster.
 */
@Category({FlakeyTests.class, LargeTests.class})
public class TestMasterFailover {

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
      HBaseClassTestRule.forClass(TestMasterFailover.class);

  private static final Logger LOG = LoggerFactory.getLogger(TestMasterFailover.class);
  @Rule public TestName name = new TestName();

  /**
   * Simple test of master failover.
   * <p>
   * Starts with three masters. Stops a backup master. Then stops the active
   * master. Ensures the final master becomes active and we can still contact
   * the cluster.
   *
   * @throws Exception if the mini cluster fails to start, stop or respond
   */
  @Test
  public void testSimpleMasterFailover() throws Exception {
    final int NUM_MASTERS = 3;
    final int NUM_RS = 3;

    // Start the cluster
    HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
    try {
      TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
      MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();

      // get all the master threads; the size assertions below expect this list to
      // shrink as masters are stopped
      List<MasterThread> masterThreads = cluster.getMasterThreads();

      // every master thread must be running
      for (MasterThread mt : masterThreads) {
        assertTrue(mt.isAlive());
      }

      // verify only one is the active master and we have right number
      int numActive = 0;
      int activeIndex = -1;
      ServerName activeName = null;
      HMaster active = null;
      for (int i = 0; i < masterThreads.size(); i++) {
        if (masterThreads.get(i).getMaster().isActiveMaster()) {
          numActive++;
          activeIndex = i;
          active = masterThreads.get(activeIndex).getMaster();
          activeName = active.getServerName();
        }
      }
      assertEquals(1, numActive);
      assertEquals(NUM_MASTERS, masterThreads.size());
      LOG.info("Active master " + activeName);

      // Check that ClusterStatus reports the correct active and backup masters
      assertNotNull(active);
      ClusterMetrics status = active.getClusterMetrics();
      assertEquals(activeName, status.getMasterName());
      assertEquals(NUM_MASTERS - 1, status.getBackupMasterNames().size());

      // attempt to stop one of the inactive masters
      int backupIndex = (activeIndex == 0 ? 1 : activeIndex - 1);
      HMaster master = cluster.getMaster(backupIndex);
      LOG.debug("\n\nStopping a backup master: " + master.getServerName() + "\n");
      cluster.stopMaster(backupIndex, false);
      cluster.waitOnMaster(backupIndex);

      // Verify still one active master and it's the same. The count is recomputed
      // here so the assertion reflects the cluster state after the backup was
      // stopped; activeIndex is refreshed because the list positions may have moved.
      numActive = 0;
      for (int i = 0; i < masterThreads.size(); i++) {
        if (masterThreads.get(i).getMaster().isActiveMaster()) {
          numActive++;
          assertEquals(activeName, masterThreads.get(i).getMaster().getServerName());
          activeIndex = i;
          active = masterThreads.get(activeIndex).getMaster();
        }
      }
      assertEquals(1, numActive);
      assertEquals(NUM_MASTERS - 1, masterThreads.size());
      assertNotNull(active);
      int rsCount = masterThreads.get(activeIndex).getMaster().getClusterMetrics()
          .getLiveServerMetrics().size();
      LOG.info("Active master " + active.getServerName() + " managing " + rsCount +
          " regions servers");
      assertEquals(NUM_RS, rsCount);

      // Check that ClusterStatus reports the correct active and backup masters
      status = active.getClusterMetrics();
      assertEquals(activeName, status.getMasterName());
      assertEquals(NUM_MASTERS - 2, status.getBackupMasterNames().size());

      // stop the active master
      LOG.debug("\n\nStopping the active master " + active.getServerName() + "\n");
      cluster.stopMaster(activeIndex, false);
      cluster.waitOnMaster(activeIndex);

      // wait for an active master to show up and be ready
      assertTrue(cluster.waitForActiveAndReadyMaster());

      LOG.debug("\n\nVerifying backup master is now active\n");
      // should only have one master now
      assertEquals(1, masterThreads.size());

      // and it should be active
      active = masterThreads.get(0).getMaster();
      assertNotNull(active);
      status = active.getClusterMetrics();
      ServerName mastername = status.getMasterName();
      assertEquals(active.getServerName(), mastername);
      assertTrue(active.isActiveMaster());
      assertEquals(0, status.getBackupMasterNames().size());
      int rss = status.getLiveServerMetrics().size();
      LOG.info("Active master " + mastername.getServerName() + " managing " +
          rss + " region servers");
      assertEquals(NUM_RS, rss);
    } finally {
      // Stop the cluster
      TEST_UTIL.shutdownMiniCluster();
    }
  }

  /**
   * Test meta in transition when master failover.
   * This test used to manipulate region state up in zk. That is not allowed any more in hbase2
   * so that manipulation was removed, which leaves this test with little left to verify: it
   * only checks that hbase:meta stays OPEN on the same region server across a master abort
   * and the start of a replacement master.
   *
   * @throws Exception if the mini cluster fails to start, stop or respond
   */
  @Test
  public void testMetaInTransitionWhenMasterFailover() throws Exception {
    final int NUM_MASTERS = 1;
    final int NUM_RS = 1;

    // Start the cluster
    HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
    try {
      // Started inside the try so a failed startup is still followed by a shutdown
      TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
      MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
      LOG.info("Cluster started");

      HMaster activeMaster = cluster.getMaster();
      ServerName metaServerName = cluster.getServerHoldingMeta();
      HRegionServer hrs = cluster.getRegionServer(metaServerName);

      // Now abort the master; meta should remain on the region server that holds it.
      LOG.info("Aborting master");
      activeMaster.abort("test-kill");
      cluster.waitForMasterToStop(activeMaster.getServerName(), 30000);
      LOG.info("Master has aborted");

      // meta should remain where it was
      RegionState metaState = MetaTableLocator.getMetaRegionState(hrs.getZooKeeper());
      assertEquals("hbase:meta should be online on RS",
          metaServerName, metaState.getServerName());
      assertEquals("hbase:meta should be online on RS", State.OPEN, metaState.getState());

      // Start up a new master
      LOG.info("Starting up a new master");
      activeMaster = cluster.startMaster().getMaster();
      LOG.info("Waiting for master to be ready");
      assertTrue(cluster.waitForActiveAndReadyMaster());
      LOG.info("Master is ready");

      // ensure meta is still deployed on RS
      metaState = MetaTableLocator.getMetaRegionState(activeMaster.getZooKeeper());
      assertEquals("hbase:meta should be online on RS",
          metaServerName, metaState.getServerName());
      assertEquals("hbase:meta should be online on RS", State.OPEN, metaState.getState());

      // Done, shutdown the cluster
    } finally {
      TEST_UTIL.shutdownMiniCluster();
    }
  }
}