001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.procedure;
019
020import static org.junit.Assert.assertEquals;
021import static org.junit.Assert.assertTrue;
022
023import org.apache.hadoop.conf.Configuration;
024import org.apache.hadoop.hbase.HBaseClassTestRule;
025import org.apache.hadoop.hbase.HBaseTestingUtility;
026import org.apache.hadoop.hbase.MiniHBaseCluster;
027import org.apache.hadoop.hbase.ServerName;
028import org.apache.hadoop.hbase.TableName;
029import org.apache.hadoop.hbase.client.RegionInfo;
030import org.apache.hadoop.hbase.client.Table;
031import org.apache.hadoop.hbase.master.HMaster;
032import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil;
033import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
034import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
035import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
036import org.apache.hadoop.hbase.testclassification.LargeTests;
037import org.apache.hadoop.hbase.testclassification.MasterTests;
038import org.junit.After;
039import org.junit.Before;
040import org.junit.ClassRule;
041import org.junit.Test;
042import org.junit.experimental.categories.Category;
043import org.slf4j.Logger;
044import org.slf4j.LoggerFactory;
045
046@Category({MasterTests.class, LargeTests.class})
047public class TestServerCrashProcedure {
048
049  @ClassRule
050  public static final HBaseClassTestRule CLASS_RULE =
051      HBaseClassTestRule.forClass(TestServerCrashProcedure.class);
052
053  private static final Logger LOG = LoggerFactory.getLogger(TestServerCrashProcedure.class);
054
055  private HBaseTestingUtility util;
056
057  private ProcedureMetrics serverCrashProcMetrics;
058  private long serverCrashSubmittedCount = 0;
059  private long serverCrashFailedCount = 0;
060
061  private void setupConf(Configuration conf) {
062    conf.setInt(MasterProcedureConstants.MASTER_PROCEDURE_THREADS, 1);
063    //testxxxDoubleExecution requires only one worker
064    conf.setInt(MasterProcedureConstants.MASTER_URGENT_PROCEDURE_THREADS, 0);
065    conf.set("hbase.balancer.tablesOnMaster", "none");
066    conf.setInt("hbase.client.retries.number", 3);
067  }
068
069  @Before
070  public void setup() throws Exception {
071    this.util = new HBaseTestingUtility();
072    setupConf(this.util.getConfiguration());
073    this.util.startMiniCluster(3);
074    ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(
075      this.util.getHBaseCluster().getMaster().getMasterProcedureExecutor(), false);
076    serverCrashProcMetrics = this.util.getHBaseCluster().getMaster().getMasterMetrics()
077        .getServerCrashProcMetrics();
078  }
079
080  @After
081  public void tearDown() throws Exception {
082    MiniHBaseCluster cluster = this.util.getHBaseCluster();
083    HMaster master = cluster == null? null: cluster.getMaster();
084    if (master != null && master.getMasterProcedureExecutor() != null) {
085      ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(
086        master.getMasterProcedureExecutor(), false);
087    }
088    this.util.shutdownMiniCluster();
089  }
090
091
092  @Test
093  public void testCrashTargetRs() throws Exception {
094    testRecoveryAndDoubleExecution(false, false);
095  }
096
097  @Test
098  public void testRecoveryAndDoubleExecutionOnRsWithMeta() throws Exception {
099    testRecoveryAndDoubleExecution(true, true);
100  }
101
102  @Test
103  public void testRecoveryAndDoubleExecutionOnRsWithoutMeta() throws Exception {
104    testRecoveryAndDoubleExecution(false, true);
105  }
106
107  private long getSCPProcId(ProcedureExecutor<?> procExec) {
108    util.waitFor(30000, () -> !procExec.getProcedures().isEmpty());
109    return procExec.getActiveProcIds().stream().mapToLong(Long::longValue).min().getAsLong();
110  }
111
112  /**
113   * Run server crash procedure steps twice to test idempotency and that we are persisting all
114   * needed state.
115   */
116  private void testRecoveryAndDoubleExecution(boolean carryingMeta, boolean doubleExecution)
117      throws Exception {
118    final TableName tableName = TableName.valueOf(
119      "testRecoveryAndDoubleExecution-carryingMeta-" + carryingMeta);
120    final Table t = this.util.createTable(tableName, HBaseTestingUtility.COLUMNS,
121        HBaseTestingUtility.KEYS_FOR_HBA_CREATE_TABLE);
122    try {
123      // Load the table with a bit of data so some logs to split and some edits in each region.
124      this.util.loadTable(t, HBaseTestingUtility.COLUMNS[0]);
125      final int count = util.countRows(t);
126      assertTrue("expected some rows", count > 0);
127      final String checksum = util.checksumRows(t);
128      // Run the procedure executor outside the master so we can mess with it. Need to disable
129      // Master's running of the server crash processing.
130      final HMaster master = this.util.getHBaseCluster().getMaster();
131      final ProcedureExecutor<MasterProcedureEnv> procExec = master.getMasterProcedureExecutor();
132      // find the first server that match the request and executes the test
133      ServerName rsToKill = null;
134      for (RegionInfo hri : util.getAdmin().getRegions(tableName)) {
135        final ServerName serverName = AssignmentTestingUtil.getServerHoldingRegion(util, hri);
136        if (AssignmentTestingUtil.isServerHoldingMeta(util, serverName) == carryingMeta) {
137          rsToKill = serverName;
138          break;
139        }
140      }
141      // Enable test flags and then queue the crash procedure.
142      ProcedureTestingUtility.waitNoProcedureRunning(procExec);
143      if (doubleExecution) {
144        ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
145        // kill the RS
146        AssignmentTestingUtil.killRs(util, rsToKill);
147        long procId = getSCPProcId(procExec);
148        // Now run through the procedure twice crashing the executor on each step...
149        MasterProcedureTestingUtility.testRecoveryAndDoubleExecution(procExec, procId);
150      } else {
151        // kill the RS
152        AssignmentTestingUtil.killRs(util, rsToKill);
153        long procId = getSCPProcId(procExec);
154        ProcedureTestingUtility.waitProcedure(procExec, procId);
155      }
156      // Assert all data came back.
157      assertEquals(count, util.countRows(t));
158      assertEquals(checksum, util.checksumRows(t));
159    } catch(Throwable throwable) {
160      LOG.error("Test failed!", throwable);
161      throw throwable;
162    } finally {
163      t.close();
164    }
165  }
166
167  private void collectMasterMetrics() {
168    serverCrashSubmittedCount = serverCrashProcMetrics.getSubmittedCounter().getCount();
169    serverCrashFailedCount = serverCrashProcMetrics.getFailedCounter().getCount();
170  }
171}