001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master.procedure; 019 020import static org.junit.Assert.assertEquals; 021import static org.junit.Assert.assertTrue; 022 023import org.apache.hadoop.conf.Configuration; 024import org.apache.hadoop.hbase.HBaseClassTestRule; 025import org.apache.hadoop.hbase.HBaseTestingUtility; 026import org.apache.hadoop.hbase.MiniHBaseCluster; 027import org.apache.hadoop.hbase.ServerName; 028import org.apache.hadoop.hbase.TableName; 029import org.apache.hadoop.hbase.client.RegionInfo; 030import org.apache.hadoop.hbase.client.Table; 031import org.apache.hadoop.hbase.master.HMaster; 032import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil; 033import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; 034import org.apache.hadoop.hbase.procedure2.ProcedureMetrics; 035import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility; 036import org.apache.hadoop.hbase.testclassification.LargeTests; 037import org.apache.hadoop.hbase.testclassification.MasterTests; 038import org.junit.After; 039import org.junit.Before; 040import org.junit.ClassRule; 041import org.junit.Test; 042import org.junit.experimental.categories.Category; 043import org.slf4j.Logger; 044import org.slf4j.LoggerFactory; 045 046@Category({MasterTests.class, LargeTests.class}) 047public class TestServerCrashProcedure { 048 049 @ClassRule 050 public static final HBaseClassTestRule CLASS_RULE = 051 HBaseClassTestRule.forClass(TestServerCrashProcedure.class); 052 053 private static final Logger LOG = LoggerFactory.getLogger(TestServerCrashProcedure.class); 054 055 private HBaseTestingUtility util; 056 057 private ProcedureMetrics serverCrashProcMetrics; 058 private long serverCrashSubmittedCount = 0; 059 private long serverCrashFailedCount = 0; 060 061 private void setupConf(Configuration conf) { 062 conf.setInt(MasterProcedureConstants.MASTER_PROCEDURE_THREADS, 1); 063 //testxxxDoubleExecution requires only one worker 064 conf.setInt(MasterProcedureConstants.MASTER_URGENT_PROCEDURE_THREADS, 0); 065 conf.set("hbase.balancer.tablesOnMaster", "none"); 066 conf.setInt("hbase.client.retries.number", 3); 067 } 068 069 @Before 070 public void setup() throws Exception { 071 this.util = new HBaseTestingUtility(); 072 setupConf(this.util.getConfiguration()); 073 this.util.startMiniCluster(3); 074 ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate( 075 this.util.getHBaseCluster().getMaster().getMasterProcedureExecutor(), false); 076 serverCrashProcMetrics = this.util.getHBaseCluster().getMaster().getMasterMetrics() 077 .getServerCrashProcMetrics(); 078 } 079 080 @After 081 public void tearDown() throws Exception { 082 MiniHBaseCluster cluster = this.util.getHBaseCluster(); 083 HMaster master = cluster == null? null: cluster.getMaster(); 084 if (master != null && master.getMasterProcedureExecutor() != null) { 085 ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate( 086 master.getMasterProcedureExecutor(), false); 087 } 088 this.util.shutdownMiniCluster(); 089 } 090 091 092 @Test 093 public void testCrashTargetRs() throws Exception { 094 testRecoveryAndDoubleExecution(false, false); 095 } 096 097 @Test 098 public void testRecoveryAndDoubleExecutionOnRsWithMeta() throws Exception { 099 testRecoveryAndDoubleExecution(true, true); 100 } 101 102 @Test 103 public void testRecoveryAndDoubleExecutionOnRsWithoutMeta() throws Exception { 104 testRecoveryAndDoubleExecution(false, true); 105 } 106 107 private long getSCPProcId(ProcedureExecutor<?> procExec) { 108 util.waitFor(30000, () -> !procExec.getProcedures().isEmpty()); 109 return procExec.getActiveProcIds().stream().mapToLong(Long::longValue).min().getAsLong(); 110 } 111 112 /** 113 * Run server crash procedure steps twice to test idempotency and that we are persisting all 114 * needed state. 115 */ 116 private void testRecoveryAndDoubleExecution(boolean carryingMeta, boolean doubleExecution) 117 throws Exception { 118 final TableName tableName = TableName.valueOf( 119 "testRecoveryAndDoubleExecution-carryingMeta-" + carryingMeta); 120 final Table t = this.util.createTable(tableName, HBaseTestingUtility.COLUMNS, 121 HBaseTestingUtility.KEYS_FOR_HBA_CREATE_TABLE); 122 try { 123 // Load the table with a bit of data so some logs to split and some edits in each region. 124 this.util.loadTable(t, HBaseTestingUtility.COLUMNS[0]); 125 final int count = util.countRows(t); 126 assertTrue("expected some rows", count > 0); 127 final String checksum = util.checksumRows(t); 128 // Run the procedure executor outside the master so we can mess with it. Need to disable 129 // Master's running of the server crash processing. 130 final HMaster master = this.util.getHBaseCluster().getMaster(); 131 final ProcedureExecutor<MasterProcedureEnv> procExec = master.getMasterProcedureExecutor(); 132 // find the first server that match the request and executes the test 133 ServerName rsToKill = null; 134 for (RegionInfo hri : util.getAdmin().getRegions(tableName)) { 135 final ServerName serverName = AssignmentTestingUtil.getServerHoldingRegion(util, hri); 136 if (AssignmentTestingUtil.isServerHoldingMeta(util, serverName) == carryingMeta) { 137 rsToKill = serverName; 138 break; 139 } 140 } 141 // Enable test flags and then queue the crash procedure. 142 ProcedureTestingUtility.waitNoProcedureRunning(procExec); 143 if (doubleExecution) { 144 ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true); 145 // kill the RS 146 AssignmentTestingUtil.killRs(util, rsToKill); 147 long procId = getSCPProcId(procExec); 148 // Now run through the procedure twice crashing the executor on each step... 149 MasterProcedureTestingUtility.testRecoveryAndDoubleExecution(procExec, procId); 150 } else { 151 // kill the RS 152 AssignmentTestingUtil.killRs(util, rsToKill); 153 long procId = getSCPProcId(procExec); 154 ProcedureTestingUtility.waitProcedure(procExec, procId); 155 } 156 // Assert all data came back. 157 assertEquals(count, util.countRows(t)); 158 assertEquals(checksum, util.checksumRows(t)); 159 } catch(Throwable throwable) { 160 LOG.error("Test failed!", throwable); 161 throw throwable; 162 } finally { 163 t.close(); 164 } 165 } 166 167 private void collectMasterMetrics() { 168 serverCrashSubmittedCount = serverCrashProcMetrics.getSubmittedCounter().getCount(); 169 serverCrashFailedCount = serverCrashProcMetrics.getFailedCounter().getCount(); 170 } 171}