001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master;
019
020import static org.junit.Assert.assertEquals;
021import static org.junit.Assert.assertNotNull;
022import static org.junit.Assert.assertTrue;
023import java.util.List;
024import java.util.concurrent.TimeUnit;
025import org.apache.hadoop.hbase.ClusterMetrics;
026import org.apache.hadoop.hbase.HBaseClassTestRule;
027import org.apache.hadoop.hbase.HBaseTestingUtility;
028import org.apache.hadoop.hbase.MiniHBaseCluster;
029import org.apache.hadoop.hbase.ServerName;
030import org.apache.hadoop.hbase.StartMiniClusterOption;
031import org.apache.hadoop.hbase.master.RegionState.State;
032import org.apache.hadoop.hbase.regionserver.HRegionServer;
033import org.apache.hadoop.hbase.testclassification.FlakeyTests;
034import org.apache.hadoop.hbase.testclassification.LargeTests;
035import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
036import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
037import org.junit.ClassRule;
038import org.junit.Rule;
039import org.junit.Test;
040import org.junit.experimental.categories.Category;
041import org.junit.rules.TestName;
042import org.slf4j.Logger;
043import org.slf4j.LoggerFactory;
044
045@Category({FlakeyTests.class, LargeTests.class})
046public class TestMasterFailover {
047
048  @ClassRule
049  public static final HBaseClassTestRule CLASS_RULE =
050      HBaseClassTestRule.forClass(TestMasterFailover.class);
051
052  private static final Logger LOG = LoggerFactory.getLogger(TestMasterFailover.class);
053  @Rule public TestName name = new TestName();
054
055  /**
056   * Simple test of master failover.
057   * <p>
058   * Starts with three masters.  Kills a backup master.  Then kills the active
059   * master.  Ensures the final master becomes active and we can still contact
060   * the cluster.
061   */
062  @Test
063  public void testSimpleMasterFailover() throws Exception {
064    final int NUM_MASTERS = 3;
065    final int NUM_RS = 3;
066
067    // Start the cluster
068    HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
069    try {
070      StartMiniClusterOption option = StartMiniClusterOption.builder()
071          .numMasters(NUM_MASTERS).numRegionServers(NUM_RS).numDataNodes(NUM_RS).build();
072      TEST_UTIL.startMiniCluster(option);
073      MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
074
075      // get all the master threads
076      List<MasterThread> masterThreads = cluster.getMasterThreads();
077
078      // wait for each to come online
079      for (MasterThread mt : masterThreads) {
080        assertTrue(mt.isAlive());
081      }
082
083      // verify only one is the active master and we have right number
084      int numActive = 0;
085      int activeIndex = -1;
086      ServerName activeName = null;
087      HMaster active = null;
088      for (int i = 0; i < masterThreads.size(); i++) {
089        if (masterThreads.get(i).getMaster().isActiveMaster()) {
090          numActive++;
091          activeIndex = i;
092          active = masterThreads.get(activeIndex).getMaster();
093          activeName = active.getServerName();
094        }
095      }
096      assertEquals(1, numActive);
097      assertEquals(NUM_MASTERS, masterThreads.size());
098      LOG.info("Active master " + activeName);
099
100      // Check that ClusterStatus reports the correct active and backup masters
101      assertNotNull(active);
102      ClusterMetrics status = active.getClusterMetrics();
103      assertEquals(activeName, status.getMasterName());
104      assertEquals(2, status.getBackupMasterNames().size());
105
106      // attempt to stop one of the inactive masters
107      int backupIndex = (activeIndex == 0 ? 1 : activeIndex - 1);
108      HMaster master = cluster.getMaster(backupIndex);
109      LOG.debug("\n\nStopping a backup master: " + master.getServerName() + "\n");
110      cluster.stopMaster(backupIndex, false);
111      cluster.waitOnMaster(backupIndex);
112
113      // Verify still one active master and it's the same
114      for (int i = 0; i < masterThreads.size(); i++) {
115        if (masterThreads.get(i).getMaster().isActiveMaster()) {
116          assertEquals(activeName, masterThreads.get(i).getMaster().getServerName());
117          activeIndex = i;
118          active = masterThreads.get(activeIndex).getMaster();
119        }
120      }
121      assertEquals(1, numActive);
122      assertEquals(2, masterThreads.size());
123      int rsCount = masterThreads.get(activeIndex).getMaster().getClusterMetrics()
124        .getLiveServerMetrics().size();
125      LOG.info("Active master " + active.getServerName() + " managing " + rsCount +
126          " regions servers");
127      assertEquals(3, rsCount);
128
129      // wait for the active master to acknowledge loss of the backup from ZK
130      final HMaster activeFinal = active;
131      TEST_UTIL.waitFor(
132        TimeUnit.SECONDS.toMillis(30), () -> activeFinal.getBackupMasters().size() == 1);
133
134      // Check that ClusterStatus reports the correct active and backup masters
135      assertNotNull(active);
136      status = active.getClusterMetrics();
137      assertEquals(activeName, status.getMasterName());
138      assertEquals(1, status.getBackupMasterNames().size());
139
140      // kill the active master
141      LOG.debug("\n\nStopping the active master " + active.getServerName() + "\n");
142      cluster.stopMaster(activeIndex, false);
143      cluster.waitOnMaster(activeIndex);
144
145      // wait for an active master to show up and be ready
146      assertTrue(cluster.waitForActiveAndReadyMaster());
147
148      LOG.debug("\n\nVerifying backup master is now active\n");
149      // should only have one master now
150      assertEquals(1, masterThreads.size());
151
152      // and he should be active
153      active = masterThreads.get(0).getMaster();
154      assertNotNull(active);
155      status = active.getClusterMetrics();
156      ServerName masterName = status.getMasterName();
157      assertNotNull(masterName);
158      assertEquals(active.getServerName(), masterName);
159      assertTrue(active.isActiveMaster());
160      assertEquals(0, status.getBackupMasterNames().size());
161      int rss = status.getLiveServerMetrics().size();
162      LOG.info("Active master {} managing {} region servers", masterName.getServerName(), rss);
163      assertEquals(3, rss);
164    } finally {
165      // Stop the cluster
166      TEST_UTIL.shutdownMiniCluster();
167    }
168  }
169
170  /**
171   * Test meta in transition when master failover.
172   * This test used to manipulate region state up in zk. That is not allowed any more in hbase2
173   * so I removed that messing. That makes this test anemic.
174   */
175  @Test
176  public void testMetaInTransitionWhenMasterFailover() throws Exception {
177    // Start the cluster
178    HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
179    TEST_UTIL.startMiniCluster();
180    try {
181      MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
182      LOG.info("Cluster started");
183
184      HMaster activeMaster = cluster.getMaster();
185      ServerName metaServerName = cluster.getServerHoldingMeta();
186      HRegionServer hrs = cluster.getRegionServer(metaServerName);
187
188      // Now kill master, meta should remain on rs, where we placed it before.
189      LOG.info("Aborting master");
190      activeMaster.abort("test-kill");
191      cluster.waitForMasterToStop(activeMaster.getServerName(), 30000);
192      LOG.info("Master has aborted");
193
194      // meta should remain where it was
195      RegionState metaState = MetaTableLocator.getMetaRegionState(hrs.getZooKeeper());
196      assertEquals("hbase:meta should be online on RS",
197          metaState.getServerName(), metaServerName);
198      assertEquals("hbase:meta should be online on RS", State.OPEN, metaState.getState());
199
200      // Start up a new master
201      LOG.info("Starting up a new master");
202      activeMaster = cluster.startMaster().getMaster();
203      LOG.info("Waiting for master to be ready");
204      cluster.waitForActiveAndReadyMaster();
205      LOG.info("Master is ready");
206
207      // ensure meta is still deployed on RS
208      metaState = MetaTableLocator.getMetaRegionState(activeMaster.getZooKeeper());
209      assertEquals("hbase:meta should be online on RS",
210          metaState.getServerName(), metaServerName);
211      assertEquals("hbase:meta should be online on RS", State.OPEN, metaState.getState());
212
213      // Done, shutdown the cluster
214    } finally {
215      TEST_UTIL.shutdownMiniCluster();
216    }
217  }
218}
219