001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master;
019
020import static org.junit.Assert.assertEquals;
021import static org.junit.Assert.assertNotNull;
022import static org.junit.Assert.assertNull;
023import static org.junit.Assert.assertTrue;
024
025import java.io.IOException;
026import java.util.List;
027import java.util.Optional;
028import java.util.concurrent.CountDownLatch;
029import java.util.stream.Collectors;
030import org.apache.hadoop.conf.Configuration;
031import org.apache.hadoop.hbase.CompatibilityFactory;
032import org.apache.hadoop.hbase.HBaseClassTestRule;
033import org.apache.hadoop.hbase.ServerName;
034import org.apache.hadoop.hbase.StartTestingClusterOption;
035import org.apache.hadoop.hbase.TableName;
036import org.apache.hadoop.hbase.Waiter.ExplainingPredicate;
037import org.apache.hadoop.hbase.client.RegionInfo;
038import org.apache.hadoop.hbase.client.Table;
039import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
040import org.apache.hadoop.hbase.master.assignment.ServerState;
041import org.apache.hadoop.hbase.master.assignment.ServerStateNode;
042import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
043import org.apache.hadoop.hbase.master.region.MasterRegion;
044import org.apache.hadoop.hbase.procedure2.Procedure;
045import org.apache.hadoop.hbase.test.MetricsAssertHelper;
046import org.apache.hadoop.hbase.testclassification.LargeTests;
047import org.apache.hadoop.hbase.testclassification.MasterTests;
048import org.junit.ClassRule;
049import org.junit.Test;
050import org.junit.experimental.categories.Category;
051import org.slf4j.Logger;
052import org.slf4j.LoggerFactory;
053
054@Category({ MasterTests.class, LargeTests.class })
055public class TestClusterRestartFailover extends AbstractTestRestartCluster {
056
057  @ClassRule
058  public static final HBaseClassTestRule CLASS_RULE =
059    HBaseClassTestRule.forClass(TestClusterRestartFailover.class);
060
061  private static final Logger LOG = LoggerFactory.getLogger(TestClusterRestartFailover.class);
062  private static final MetricsAssertHelper metricsHelper =
063    CompatibilityFactory.getInstance(MetricsAssertHelper.class);
064
065  private volatile static CountDownLatch SCP_LATCH;
066  private static ServerName SERVER_FOR_TEST;
067
068  @Override
069  protected boolean splitWALCoordinatedByZk() {
070    return true;
071  }
072
073  private ServerStateNode getServerStateNode(ServerName serverName) {
074    return UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates()
075      .getServerNode(serverName);
076  }
077
078  /**
079   * Test for HBASE-22964
080   */
081  @Test
082  public void test() throws Exception {
083    setupCluster();
084    setupTable();
085
086    SERVER_FOR_TEST = UTIL.getHBaseCluster().getRegionServer(0).getServerName();
087    UTIL.waitFor(60000, () -> getServerStateNode(SERVER_FOR_TEST) != null);
088    ServerStateNode serverNode = getServerStateNode(SERVER_FOR_TEST);
089    assertNotNull(serverNode);
090    assertTrue("serverNode should be ONLINE when cluster runs normally",
091      serverNode.isInState(ServerState.ONLINE));
092
093    SCP_LATCH = new CountDownLatch(1);
094
095    // Shutdown cluster and restart
096    List<Integer> ports =
097      UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServersList().stream()
098        .map(serverName -> serverName.getPort()).collect(Collectors.toList());
099    LOG.info("Shutting down cluster");
100    UTIL.getHBaseCluster().killAll();
101    UTIL.getHBaseCluster().waitUntilShutDown();
102    LOG.info("Restarting cluster");
103    UTIL.restartHBaseCluster(StartTestingClusterOption.builder().masterClass(HMasterForTest.class)
104      .numMasters(1).numRegionServers(3).rsPorts(ports).build());
105    LOG.info("Started cluster");
106    UTIL.waitFor(60000, () -> UTIL.getHBaseCluster().getMaster().isInitialized());
107    LOG.info("Started cluster master, waiting for {}", SERVER_FOR_TEST);
108    UTIL.waitFor(60000, () -> getServerStateNode(SERVER_FOR_TEST) != null);
109    UTIL.waitFor(30000, new ExplainingPredicate<Exception>() {
110
111      @Override
112      public boolean evaluate() throws Exception {
113        return !getServerStateNode(SERVER_FOR_TEST).isInState(ServerState.ONLINE);
114      }
115
116      @Override
117      public String explainFailure() throws Exception {
118        return "serverNode should not be ONLINE during SCP processing";
119      }
120    });
121    Optional<Procedure<?>> procedure = UTIL.getHBaseCluster().getMaster().getProcedures().stream()
122      .filter(p -> (p instanceof ServerCrashProcedure)
123        && ((ServerCrashProcedure) p).getServerName().equals(SERVER_FOR_TEST))
124      .findAny();
125    assertTrue("Should have one SCP for " + SERVER_FOR_TEST, procedure.isPresent());
126    assertEquals("Submit the SCP for the same serverName " + SERVER_FOR_TEST + " which should fail",
127      Procedure.NO_PROC_ID,
128      UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST));
129
130    // Wait the SCP to finish
131    LOG.info("Waiting on latch");
132    SCP_LATCH.countDown();
133    UTIL.waitFor(60000, () -> procedure.get().isFinished());
134    assertNull("serverNode should be deleted after SCP finished",
135      getServerStateNode(SERVER_FOR_TEST));
136
137    assertEquals(
138      "Even when the SCP is finished, the duplicate SCP should not be scheduled for "
139        + SERVER_FOR_TEST,
140      Procedure.NO_PROC_ID,
141      UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST));
142
143    MetricsMasterSource masterSource =
144      UTIL.getHBaseCluster().getMaster().getMasterMetrics().getMetricsSource();
145    metricsHelper.assertCounter(MetricsMasterSource.SERVER_CRASH_METRIC_PREFIX + "SubmittedCount",
146      3, masterSource);
147  }
148
149  private void setupCluster() throws Exception {
150    LOG.info("Setup cluster");
151    UTIL.startMiniCluster(StartTestingClusterOption.builder().masterClass(HMasterForTest.class)
152      .numMasters(1).numRegionServers(3).build());
153    // this test has been flaky. When it is rerun by surefire, the underlying minicluster isn't
154    // completely cleaned. specifically, the metrics system isn't reset. The result is an otherwise
155    // successful re-run is failed because there's 8 or 12 SCPcounts instead of the 4 that a
156    // single run of the test would otherwise produce. Thus, explicitly reset the metrics source
157    // each time we setup the cluster.
158    UTIL.getMiniHBaseCluster().getMaster().getMasterMetrics().getMetricsSource().init();
159    LOG.info("Cluster is up");
160    UTIL.waitFor(60000, () -> UTIL.getMiniHBaseCluster().getMaster().isInitialized());
161    LOG.info("Master is up");
162    // wait for all SCPs finished
163    UTIL.waitFor(60000, () -> UTIL.getHBaseCluster().getMaster().getProcedures().stream()
164      .noneMatch(p -> p instanceof ServerCrashProcedure));
165    LOG.info("No SCPs");
166  }
167
168  private void setupTable() throws Exception {
169    TableName tableName = TABLES[0];
170    UTIL.createMultiRegionTable(tableName, FAMILY);
171    UTIL.waitTableAvailable(tableName);
172    Table table = UTIL.getConnection().getTable(tableName);
173    for (int i = 0; i < 100; i++) {
174      UTIL.loadTable(table, FAMILY);
175    }
176  }
177
178  public static final class HMasterForTest extends HMaster {
179
180    public HMasterForTest(Configuration conf) throws IOException {
181      super(conf);
182    }
183
184    @Override
185    protected AssignmentManager createAssignmentManager(MasterServices master,
186      MasterRegion masterRegion) {
187      return new AssignmentManagerForTest(master, masterRegion);
188    }
189  }
190
191  private static final class AssignmentManagerForTest extends AssignmentManager {
192
193    public AssignmentManagerForTest(MasterServices master, MasterRegion masterRegion) {
194      super(master, masterRegion);
195    }
196
197    @Override
198    public List<RegionInfo> getRegionsOnServer(ServerName serverName) {
199      List<RegionInfo> regions = super.getRegionsOnServer(serverName);
200      // ServerCrashProcedure will call this method, so wait the CountDownLatch here
201      if (SCP_LATCH != null && SERVER_FOR_TEST != null && serverName.equals(SERVER_FOR_TEST)) {
202        try {
203          LOG.info("ServerCrashProcedure wait the CountDownLatch here");
204          SCP_LATCH.await();
205          LOG.info("Continue the ServerCrashProcedure");
206          SCP_LATCH = null;
207        } catch (InterruptedException e) {
208          throw new RuntimeException(e);
209        }
210      }
211      return regions;
212    }
213  }
214}