001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master;
019
020import static org.junit.Assert.assertEquals;
021import static org.junit.Assert.assertNotNull;
022import static org.junit.Assert.assertTrue;
023
024import java.util.List;
025import org.apache.hadoop.hbase.ClusterMetrics;
026import org.apache.hadoop.hbase.HBaseClassTestRule;
027import org.apache.hadoop.hbase.HBaseTestingUtility;
028import org.apache.hadoop.hbase.MiniHBaseCluster;
029import org.apache.hadoop.hbase.ServerName;
030import org.apache.hadoop.hbase.master.RegionState.State;
031import org.apache.hadoop.hbase.regionserver.HRegionServer;
032import org.apache.hadoop.hbase.testclassification.FlakeyTests;
033import org.apache.hadoop.hbase.testclassification.LargeTests;
034import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
035import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
036import org.junit.ClassRule;
037import org.junit.Rule;
038import org.junit.Test;
039import org.junit.experimental.categories.Category;
040import org.junit.rules.TestName;
041import org.slf4j.Logger;
042import org.slf4j.LoggerFactory;
043
044@Category({FlakeyTests.class, LargeTests.class})
045public class TestMasterFailover {
046
047  @ClassRule
048  public static final HBaseClassTestRule CLASS_RULE =
049      HBaseClassTestRule.forClass(TestMasterFailover.class);
050
051  private static final Logger LOG = LoggerFactory.getLogger(TestMasterFailover.class);
052  @Rule public TestName name = new TestName();
053
054  /**
055   * Simple test of master failover.
056   * <p>
057   * Starts with three masters.  Kills a backup master.  Then kills the active
058   * master.  Ensures the final master becomes active and we can still contact
059   * the cluster.
060   */
061  @Test
062  public void testSimpleMasterFailover() throws Exception {
063    final int NUM_MASTERS = 3;
064    final int NUM_RS = 3;
065
066    // Start the cluster
067    HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
068    try {
069      TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
070      MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
071
072      // get all the master threads
073      List<MasterThread> masterThreads = cluster.getMasterThreads();
074
075      // wait for each to come online
076      for (MasterThread mt : masterThreads) {
077        assertTrue(mt.isAlive());
078      }
079
080      // verify only one is the active master and we have right number
081      int numActive = 0;
082      int activeIndex = -1;
083      ServerName activeName = null;
084      HMaster active = null;
085      for (int i = 0; i < masterThreads.size(); i++) {
086        if (masterThreads.get(i).getMaster().isActiveMaster()) {
087          numActive++;
088          activeIndex = i;
089          active = masterThreads.get(activeIndex).getMaster();
090          activeName = active.getServerName();
091        }
092      }
093      assertEquals(1, numActive);
094      assertEquals(NUM_MASTERS, masterThreads.size());
095      LOG.info("Active master " + activeName);
096
097      // Check that ClusterStatus reports the correct active and backup masters
098      assertNotNull(active);
099      ClusterMetrics status = active.getClusterMetrics();
100      assertTrue(status.getMasterName().equals(activeName));
101      assertEquals(2, status.getBackupMasterNames().size());
102
103      // attempt to stop one of the inactive masters
104      int backupIndex = (activeIndex == 0 ? 1 : activeIndex - 1);
105      HMaster master = cluster.getMaster(backupIndex);
106      LOG.debug("\n\nStopping a backup master: " + master.getServerName() + "\n");
107      cluster.stopMaster(backupIndex, false);
108      cluster.waitOnMaster(backupIndex);
109
110      // Verify still one active master and it's the same
111      for (int i = 0; i < masterThreads.size(); i++) {
112        if (masterThreads.get(i).getMaster().isActiveMaster()) {
113          assertTrue(activeName.equals(masterThreads.get(i).getMaster().getServerName()));
114          activeIndex = i;
115          active = masterThreads.get(activeIndex).getMaster();
116        }
117      }
118      assertEquals(1, numActive);
119      assertEquals(2, masterThreads.size());
120      int rsCount = masterThreads.get(activeIndex).getMaster().getClusterMetrics()
121        .getLiveServerMetrics().size();
122      LOG.info("Active master " + active.getServerName() + " managing " + rsCount +
123          " regions servers");
124      assertEquals(3, rsCount);
125
126      // Check that ClusterStatus reports the correct active and backup masters
127      assertNotNull(active);
128      status = active.getClusterMetrics();
129      assertTrue(status.getMasterName().equals(activeName));
130      assertEquals(1, status.getBackupMasterNames().size());
131
132      // kill the active master
133      LOG.debug("\n\nStopping the active master " + active.getServerName() + "\n");
134      cluster.stopMaster(activeIndex, false);
135      cluster.waitOnMaster(activeIndex);
136
137      // wait for an active master to show up and be ready
138      assertTrue(cluster.waitForActiveAndReadyMaster());
139
140      LOG.debug("\n\nVerifying backup master is now active\n");
141      // should only have one master now
142      assertEquals(1, masterThreads.size());
143
144      // and he should be active
145      active = masterThreads.get(0).getMaster();
146      assertNotNull(active);
147      status = active.getClusterMetrics();
148      ServerName mastername = status.getMasterName();
149      assertTrue(mastername.equals(active.getServerName()));
150      assertTrue(active.isActiveMaster());
151      assertEquals(0, status.getBackupMasterNames().size());
152      int rss = status.getLiveServerMetrics().size();
153      LOG.info("Active master " + mastername.getServerName() + " managing " +
154          rss + " region servers");
155      assertEquals(3, rss);
156    } finally {
157      // Stop the cluster
158      TEST_UTIL.shutdownMiniCluster();
159    }
160  }
161
162  /**
163   * Test meta in transition when master failover.
164   * This test used to manipulate region state up in zk. That is not allowed any more in hbase2
165   * so I removed that messing. That makes this test anemic.
166   */
167  @Test
168  public void testMetaInTransitionWhenMasterFailover() throws Exception {
169    final int NUM_MASTERS = 1;
170    final int NUM_RS = 1;
171
172    // Start the cluster
173    HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
174    TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
175    try {
176      MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
177      LOG.info("Cluster started");
178
179      HMaster activeMaster = cluster.getMaster();
180      ServerName metaServerName = cluster.getServerHoldingMeta();
181      HRegionServer hrs = cluster.getRegionServer(metaServerName);
182
183      // Now kill master, meta should remain on rs, where we placed it before.
184      LOG.info("Aborting master");
185      activeMaster.abort("test-kill");
186      cluster.waitForMasterToStop(activeMaster.getServerName(), 30000);
187      LOG.info("Master has aborted");
188
189      // meta should remain where it was
190      RegionState metaState = MetaTableLocator.getMetaRegionState(hrs.getZooKeeper());
191      assertEquals("hbase:meta should be online on RS",
192          metaState.getServerName(), metaServerName);
193      assertEquals("hbase:meta should be online on RS", State.OPEN, metaState.getState());
194
195      // Start up a new master
196      LOG.info("Starting up a new master");
197      activeMaster = cluster.startMaster().getMaster();
198      LOG.info("Waiting for master to be ready");
199      cluster.waitForActiveAndReadyMaster();
200      LOG.info("Master is ready");
201
202      // ensure meta is still deployed on RS
203      metaState = MetaTableLocator.getMetaRegionState(activeMaster.getZooKeeper());
204      assertEquals("hbase:meta should be online on RS",
205          metaState.getServerName(), metaServerName);
206      assertEquals("hbase:meta should be online on RS", State.OPEN, metaState.getState());
207
208      // Done, shutdown the cluster
209    } finally {
210      TEST_UTIL.shutdownMiniCluster();
211    }
212  }
213}
214