001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master.procedure; 019 020import static org.junit.Assert.assertEquals; 021import static org.junit.Assert.assertTrue; 022 023import java.io.IOException; 024 025import org.apache.hadoop.conf.Configuration; 026import org.apache.hadoop.hbase.HBaseClassTestRule; 027import org.apache.hadoop.hbase.HBaseTestingUtility; 028import org.apache.hadoop.hbase.MiniHBaseCluster; 029import org.apache.hadoop.hbase.ServerName; 030import org.apache.hadoop.hbase.TableName; 031import org.apache.hadoop.hbase.client.RegionInfo; 032import org.apache.hadoop.hbase.client.Table; 033import org.apache.hadoop.hbase.master.HMaster; 034import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil; 035import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; 036import org.apache.hadoop.hbase.procedure2.ProcedureMetrics; 037import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility; 038import org.apache.hadoop.hbase.testclassification.LargeTests; 039import org.apache.hadoop.hbase.testclassification.MasterTests; 040import org.junit.After; 041import org.junit.Before; 042import org.junit.ClassRule; 043import org.junit.Test; 044import org.junit.experimental.categories.Category; 045import org.slf4j.Logger; 046import org.slf4j.LoggerFactory; 047 048@Category({MasterTests.class, LargeTests.class}) 049public class TestServerCrashProcedure { 050 051 @ClassRule 052 public static final HBaseClassTestRule CLASS_RULE = 053 HBaseClassTestRule.forClass(TestServerCrashProcedure.class); 054 055 private static final Logger LOG = LoggerFactory.getLogger(TestServerCrashProcedure.class); 056 057 protected HBaseTestingUtility util; 058 059 private ProcedureMetrics serverCrashProcMetrics; 060 private long serverCrashSubmittedCount = 0; 061 private long serverCrashFailedCount = 0; 062 063 private void setupConf(Configuration conf) { 064 conf.setInt(MasterProcedureConstants.MASTER_PROCEDURE_THREADS, 1); 065 //testxxxDoubleExecution requires only one worker 066 conf.setInt(MasterProcedureConstants.MASTER_URGENT_PROCEDURE_THREADS, 0); 067 conf.set("hbase.balancer.tablesOnMaster", "none"); 068 conf.setInt("hbase.client.retries.number", 3); 069 } 070 071 @Before 072 public void setup() throws Exception { 073 this.util = new HBaseTestingUtility(); 074 setupConf(this.util.getConfiguration()); 075 startMiniCluster(); 076 ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate( 077 this.util.getHBaseCluster().getMaster().getMasterProcedureExecutor(), false); 078 serverCrashProcMetrics = this.util.getHBaseCluster().getMaster().getMasterMetrics() 079 .getServerCrashProcMetrics(); 080 } 081 082 protected void startMiniCluster() throws Exception { 083 this.util.startMiniCluster(3); 084 } 085 086 @After 087 public void tearDown() throws Exception { 088 MiniHBaseCluster cluster = this.util.getHBaseCluster(); 089 HMaster master = cluster == null? null: cluster.getMaster(); 090 if (master != null && master.getMasterProcedureExecutor() != null) { 091 ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate( 092 master.getMasterProcedureExecutor(), false); 093 } 094 this.util.shutdownMiniCluster(); 095 } 096 097 098 @Test 099 public void testCrashTargetRs() throws Exception { 100 testRecoveryAndDoubleExecution(false, false); 101 } 102 103 @Test 104 public void testRecoveryAndDoubleExecutionOnRsWithMeta() throws Exception { 105 testRecoveryAndDoubleExecution(true, true); 106 } 107 108 @Test 109 public void testRecoveryAndDoubleExecutionOnRsWithoutMeta() throws Exception { 110 testRecoveryAndDoubleExecution(false, true); 111 } 112 113 private long getSCPProcId(ProcedureExecutor<?> procExec) { 114 util.waitFor(30000, () -> !procExec.getProcedures().isEmpty()); 115 return procExec.getActiveProcIds().stream().mapToLong(Long::longValue).min().getAsLong(); 116 } 117 118 /** 119 * Run server crash procedure steps twice to test idempotency and that we are persisting all 120 * needed state. 121 */ 122 private void testRecoveryAndDoubleExecution(boolean carryingMeta, boolean doubleExecution) 123 throws Exception { 124 final TableName tableName = TableName.valueOf("testRecoveryAndDoubleExecution-carryingMeta-" 125 + carryingMeta + "-doubleExecution-" + doubleExecution); 126 try (Table t = createTable(tableName)) { 127 // Load the table with a bit of data so some logs to split and some edits in each region. 128 this.util.loadTable(t, HBaseTestingUtility.COLUMNS[0]); 129 final int count = util.countRows(t); 130 assertTrue("expected some rows", count > 0); 131 final String checksum = util.checksumRows(t); 132 // Run the procedure executor outside the master so we can mess with it. Need to disable 133 // Master's running of the server crash processing. 134 final HMaster master = this.util.getHBaseCluster().getMaster(); 135 final ProcedureExecutor<MasterProcedureEnv> procExec = master.getMasterProcedureExecutor(); 136 // find the first server that match the request and executes the test 137 ServerName rsToKill = null; 138 for (RegionInfo hri : util.getAdmin().getRegions(tableName)) { 139 final ServerName serverName = AssignmentTestingUtil.getServerHoldingRegion(util, hri); 140 if (AssignmentTestingUtil.isServerHoldingMeta(util, serverName) == carryingMeta) { 141 rsToKill = serverName; 142 break; 143 } 144 } 145 // Enable test flags and then queue the crash procedure. 146 ProcedureTestingUtility.waitNoProcedureRunning(procExec); 147 if (doubleExecution) { 148 ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true); 149 // kill the RS 150 AssignmentTestingUtil.killRs(util, rsToKill); 151 long procId = getSCPProcId(procExec); 152 // Now run through the procedure twice crashing the executor on each step... 153 MasterProcedureTestingUtility.testRecoveryAndDoubleExecution(procExec, procId); 154 } else { 155 // kill the RS 156 AssignmentTestingUtil.killRs(util, rsToKill); 157 long procId = getSCPProcId(procExec); 158 ProcedureTestingUtility.waitProcedure(procExec, procId); 159 } 160 assertEquals(count, util.countRows(t)); 161 assertEquals(checksum, util.checksumRows(t)); 162 } catch (Throwable throwable) { 163 LOG.error("Test failed!", throwable); 164 throw throwable; 165 } 166 } 167 168 protected Table createTable(final TableName tableName) throws IOException { 169 final Table t = this.util.createTable(tableName, HBaseTestingUtility.COLUMNS, 170 HBaseTestingUtility.KEYS_FOR_HBA_CREATE_TABLE); 171 return t; 172 } 173 174 private void collectMasterMetrics() { 175 serverCrashSubmittedCount = serverCrashProcMetrics.getSubmittedCounter().getCount(); 176 serverCrashFailedCount = serverCrashProcMetrics.getFailedCounter().getCount(); 177 } 178}