/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master.procedure;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.junit.After;
import org.junit.Before;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

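/**
 * Tests of ServerCrashProcedure: kill a RegionServer, optionally the one carrying hbase:meta,
 * and verify the table data is fully recovered. The double-execution variants also step the
 * procedure through each state twice, restarting the executor at every step, to check that the
 * procedure is idempotent and persists all the state it needs.
 */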
@Category({MasterTests.class, LargeTests.class})
public class TestServerCrashProcedure {

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
      HBaseClassTestRule.forClass(TestServerCrashProcedure.class);

  private static final Logger LOG = LoggerFactory.getLogger(TestServerCrashProcedure.class);

  protected HBaseTestingUtility util;

  private ProcedureMetrics serverCrashProcMetrics;
  private long serverCrashSubmittedCount = 0;
  private long serverCrashFailedCount = 0;

  private void setupConf(Configuration conf) {
    // The testXxxDoubleExecution tests require a single procedure worker: one regular worker
    // thread and no urgent workers.
    conf.setInt(MasterProcedureConstants.MASTER_PROCEDURE_THREADS, 1);
    conf.setInt(MasterProcedureConstants.MASTER_URGENT_PROCEDURE_THREADS, 0);
    // Keep regions off the master.
    conf.set("hbase.balancer.tablesOnMaster", "none");
    // Keep client retries low so failures surface quickly.
    conf.setInt("hbase.client.retries.number", 3);
  }

  @Before
  public void setup() throws Exception {
    this.util = new HBaseTestingUtility();
    setupConf(this.util.getConfiguration());
    startMiniCluster();
    ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(
      this.util.getHBaseCluster().getMaster().getMasterProcedureExecutor(), false);
    serverCrashProcMetrics = this.util.getHBaseCluster().getMaster().getMasterMetrics()
        .getServerCrashProcMetrics();
  }

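  /**
   * Starts a three-node mini cluster. Kept protected so subclasses can bring up a different
   * cluster layout.
   */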
  protected void startMiniCluster() throws Exception {
    this.util.startMiniCluster(3);
  }

  @After
  public void tearDown() throws Exception {
    MiniHBaseCluster cluster = this.util.getHBaseCluster();
    HMaster master = cluster == null ? null : cluster.getMaster();
    if (master != null && master.getMasterProcedureExecutor() != null) {
      ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(
        master.getMasterProcedureExecutor(), false);
    }
    this.util.shutdownMiniCluster();
  }

  @Test
  public void testCrashTargetRs() throws Exception {
    testRecoveryAndDoubleExecution(false, false);
  }

  @Test
  public void testRecoveryAndDoubleExecutionOnRsWithMeta() throws Exception {
    testRecoveryAndDoubleExecution(true, true);
  }

  @Test
  public void testRecoveryAndDoubleExecutionOnRsWithoutMeta() throws Exception {
    testRecoveryAndDoubleExecution(false, true);
  }

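  /**
   * Waits for the ServerCrashProcedure to be submitted and returns its procedure id. The parent
   * SCP is queued before any child procedures, so the lowest active proc id is the one we want.
   */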
  private long getSCPProcId(ProcedureExecutor<?> procExec) {
    util.waitFor(30000, () -> !procExec.getProcedures().isEmpty());
    return procExec.getActiveProcIds().stream().mapToLong(Long::longValue).min().getAsLong();
  }

  /**
   * Run server crash procedure steps twice to test idempotency and that we are persisting all
   * needed state.
   */
  private void testRecoveryAndDoubleExecution(boolean carryingMeta, boolean doubleExecution)
      throws Exception {
    final TableName tableName = TableName.valueOf("testRecoveryAndDoubleExecution-carryingMeta-"
        + carryingMeta + "-doubleExecution-" + doubleExecution);
    try (Table t = createTable(tableName)) {
      // Load the table with a bit of data so there are some logs to split and some edits in
      // each region.
      this.util.loadTable(t, HBaseTestingUtility.COLUMNS[0]);
      final int count = util.countRows(t);
      assertTrue("expected some rows", count > 0);
      final String checksum = util.checksumRows(t);
      // Grab the master's procedure executor so the test flags below can mess with how the
      // server crash procedure runs.
      final HMaster master = this.util.getHBaseCluster().getMaster();
      final ProcedureExecutor<MasterProcedureEnv> procExec = master.getMasterProcedureExecutor();
      // Find the first server that matches the request (carrying hbase:meta or not) and run the
      // test against it.
      ServerName rsToKill = null;
      for (RegionInfo hri : util.getAdmin().getRegions(tableName)) {
        final ServerName serverName = AssignmentTestingUtil.getServerHoldingRegion(util, hri);
        if (AssignmentTestingUtil.isServerHoldingMeta(util, serverName) == carryingMeta) {
          rsToKill = serverName;
          break;
        }
      }
      assertNotNull("could not find a RegionServer matching carryingMeta=" + carryingMeta,
        rsToKill);
      // Enable test flags and then queue the crash procedure.
      ProcedureTestingUtility.waitNoProcedureRunning(procExec);
      if (doubleExecution) {
        ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
        // Kill the RS.
        AssignmentTestingUtil.killRs(util, rsToKill);
        long procId = getSCPProcId(procExec);
        // Now run through the procedure twice crashing the executor on each step...
        MasterProcedureTestingUtility.testRecoveryAndDoubleExecution(procExec, procId);
      } else {
        // Kill the RS.
        AssignmentTestingUtil.killRs(util, rsToKill);
        long procId = getSCPProcId(procExec);
        ProcedureTestingUtility.waitProcedure(procExec, procId);
      }
      assertEquals(count, util.countRows(t));
      assertEquals(checksum, util.checksumRows(t));
    } catch (Throwable throwable) {
      LOG.error("Test failed!", throwable);
      throw throwable;
    }
  }

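  /**
   * Creates the test table, pre-split into multiple regions. Kept protected so subclasses can
   * change how the table is created.
   */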
  protected Table createTable(final TableName tableName) throws IOException {
    final Table t = this.util.createTable(tableName, HBaseTestingUtility.COLUMNS,
        HBaseTestingUtility.KEYS_FOR_HBA_CREATE_TABLE);
    return t;
  }

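  /**
   * Records the current ServerCrashProcedure submitted and failed counter values.
   */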
  private void collectMasterMetrics() {
    serverCrashSubmittedCount = serverCrashProcMetrics.getSubmittedCounter().getCount();
    serverCrashFailedCount = serverCrashProcMetrics.getFailedCounter().getCount();
  }
}