/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.hbase.ClusterMetrics;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtil;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.SingleProcessHBaseCluster;
import org.apache.hadoop.hbase.StartTestingClusterOption;
import org.apache.hadoop.hbase.master.RegionState.State;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.testclassification.FlakeyTests;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
import org.junit.ClassRule;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@Category({ FlakeyTests.class, LargeTests.class })
public class TestMasterFailover {

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestMasterFailover.class);

  private static final Logger LOG = LoggerFactory.getLogger(TestMasterFailover.class);

  @Rule
  public TestName name = new TestName();

  /**
   * Simple test of master failover.
   * <p>
   * Starts with three masters. Kills a backup master. Then kills the active master. Ensures the
   * final master becomes active and we can still contact the cluster.
   */
  @Test
  public void testSimpleMasterFailover() throws Exception {
    final int NUM_MASTERS = 3;
    final int NUM_RS = 3;

    // Start the cluster
    HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
    try {
      StartTestingClusterOption option = StartTestingClusterOption.builder().numMasters(NUM_MASTERS)
        .numRegionServers(NUM_RS).numDataNodes(NUM_RS).build();
      TEST_UTIL.startMiniCluster(option);
      SingleProcessHBaseCluster cluster = TEST_UTIL.getHBaseCluster();

      // get all the master threads
      List<MasterThread> masterThreads = cluster.getMasterThreads();

      // make sure every master thread is still running
      for (MasterThread mt : masterThreads) {
        assertTrue(mt.isAlive());
      }

      // verify only one master is active and that we have the right number of masters
      int numActive = 0;
      int activeIndex = -1;
      ServerName activeName = null;
      HMaster active = null;
      for (int i = 0; i < masterThreads.size(); i++) {
        if (masterThreads.get(i).getMaster().isActiveMaster()) {
          numActive++;
          activeIndex = i;
          active = masterThreads.get(activeIndex).getMaster();
          activeName = active.getServerName();
        }
      }
      assertEquals(1, numActive);
      assertEquals(NUM_MASTERS, masterThreads.size());
      LOG.info("Active master {}", activeName);

      // Check that ClusterMetrics reports the correct active and backup masters
      assertNotNull(active);
      ClusterMetrics status = active.getClusterMetrics();
      assertEquals(activeName, status.getMasterName());
      assertEquals(2, status.getBackupMasterNames().size());

      // attempt to stop one of the inactive masters
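      // (pick an index that is guaranteed to differ from the active master's index)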
      int backupIndex = (activeIndex == 0 ? 1 : activeIndex - 1);
      HMaster master = cluster.getMaster(backupIndex);
      LOG.debug("\n\nStopping a backup master: {}\n", master.getServerName());
      cluster.stopMaster(backupIndex, false);
      cluster.waitOnMaster(backupIndex);

      // Verify there is still exactly one active master and that it is the same one
      numActive = 0;
      for (int i = 0; i < masterThreads.size(); i++) {
        if (masterThreads.get(i).getMaster().isActiveMaster()) {
          numActive++;
          assertEquals(activeName, masterThreads.get(i).getMaster().getServerName());
          activeIndex = i;
          active = masterThreads.get(activeIndex).getMaster();
        }
      }
      assertEquals(1, numActive);
      assertEquals(2, masterThreads.size());
      int rsCount = masterThreads.get(activeIndex).getMaster().getClusterMetrics()
        .getLiveServerMetrics().size();
      LOG.info("Active master {} managing {} region servers", active.getServerName(), rsCount);
      assertEquals(3, rsCount);

      // wait for the active master to acknowledge loss of the backup from ZK
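      // (copy into a final local so the lambda below can reference it)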
      final HMaster activeFinal = active;
      TEST_UTIL.waitFor(TimeUnit.MINUTES.toMillis(5),
        () -> activeFinal.getBackupMasters().size() == 1);

      // Check that ClusterMetrics reports the correct active and backup masters
      assertNotNull(active);
      status = active.getClusterMetrics();
      assertEquals(activeName, status.getMasterName());
      assertEquals(1, status.getBackupMasterNames().size());

      // kill the active master
      LOG.debug("\n\nStopping the active master {}\n", active.getServerName());
      cluster.stopMaster(activeIndex, false);
      cluster.waitOnMaster(activeIndex);

      // wait for an active master to show up and be ready
      assertTrue(cluster.waitForActiveAndReadyMaster());

      LOG.debug("\n\nVerifying backup master is now active\n");
      // should only have one master now
      assertEquals(1, masterThreads.size());

      // and it should be the active master
      active = masterThreads.get(0).getMaster();
      assertNotNull(active);
      status = active.getClusterMetrics();
      ServerName masterName = status.getMasterName();
      assertNotNull(masterName);
      assertEquals(active.getServerName(), masterName);
      assertTrue(active.isActiveMaster());
      assertEquals(0, status.getBackupMasterNames().size());
      int rss = status.getLiveServerMetrics().size();
      LOG.info("Active master {} managing {} region servers", masterName.getServerName(), rss);
      assertEquals(3, rss);
    } finally {
      // Stop the cluster
      TEST_UTIL.shutdownMiniCluster();
    }
  }

  /**
   * Test meta in transition during master failover. This test used to manipulate region state up
   * in zk; that is not allowed any more in hbase2, so that manipulation was removed, which leaves
   * this test somewhat anemic.
   */
  @Test
  public void testMetaInTransitionWhenMasterFailover() throws Exception {
    // Start the cluster
    HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
    TEST_UTIL.startMiniCluster();
    try {
      SingleProcessHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
      LOG.info("Cluster started");

      HMaster activeMaster = cluster.getMaster();
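      // Record which region server is hosting hbase:meta before the master goes down.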
      ServerName metaServerName = cluster.getServerHoldingMeta();
      HRegionServer hrs = cluster.getRegionServer(metaServerName);

      // Now kill the master; hbase:meta should remain on the region server hosting it.
      LOG.info("Aborting master");
      activeMaster.abort("test-kill");
      cluster.waitForMasterToStop(activeMaster.getServerName(), 30000);
      LOG.info("Master has aborted");

      // meta should remain where it was
      RegionState metaState = MetaTableLocator.getMetaRegionState(hrs.getZooKeeper());
      assertEquals("hbase:meta should be online on RS", metaServerName, metaState.getServerName());
      assertEquals("hbase:meta should be online on RS", State.OPEN, metaState.getState());

      // Start up a new master
      LOG.info("Starting up a new master");
      activeMaster = cluster.startMaster().getMaster();
      LOG.info("Waiting for master to be ready");
      cluster.waitForActiveAndReadyMaster();
      LOG.info("Master is ready");

      // ensure meta is still deployed on the same RS
      metaState = MetaTableLocator.getMetaRegionState(activeMaster.getZooKeeper());
      assertEquals("hbase:meta should be online on RS", metaServerName, metaState.getServerName());
      assertEquals("hbase:meta should be online on RS", State.OPEN, metaState.getState());

      // Done, shutdown the cluster
    } finally {
      TEST_UTIL.shutdownMiniCluster();
    }
  }
}