001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master.assignment; 019 020import java.io.IOException; 021import java.util.concurrent.CountDownLatch; 022import org.apache.hadoop.hbase.HBaseTestingUtil; 023import org.apache.hadoop.hbase.ProcedureTestUtil; 024import org.apache.hadoop.hbase.ServerName; 025import org.apache.hadoop.hbase.TableName; 026import org.apache.hadoop.hbase.client.Put; 027import org.apache.hadoop.hbase.client.RegionInfo; 028import org.apache.hadoop.hbase.client.Table; 029import org.apache.hadoop.hbase.master.HMaster; 030import org.apache.hadoop.hbase.master.ServerManager; 031import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; 032import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; 033import org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface; 034import org.apache.hadoop.hbase.procedure2.Procedure; 035import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; 036import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; 037import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; 038import org.apache.hadoop.hbase.procedure2.ProcedureYieldException; 039import org.apache.hadoop.hbase.regionserver.HRegionServer; 040import org.apache.hadoop.hbase.testclassification.LargeTests; 041import org.apache.hadoop.hbase.testclassification.MasterTests; 042import org.apache.hadoop.hbase.util.Bytes; 043import org.junit.jupiter.api.AfterAll; 044import org.junit.jupiter.api.BeforeAll; 045import org.junit.jupiter.api.Tag; 046import org.junit.jupiter.api.Test; 047import org.slf4j.Logger; 048import org.slf4j.LoggerFactory; 049 050/** 051 * Confirm that we will do backoff when retrying on closing a region, to avoid consuming all the 052 * CPUs. 053 */ 054@Tag(MasterTests.TAG) 055@Tag(LargeTests.TAG) 056public class TestCloseRegionWhileRSCrash { 057 private static final Logger LOG = LoggerFactory.getLogger(TestCloseRegionWhileRSCrash.class); 058 059 private static final HBaseTestingUtil UTIL = new HBaseTestingUtil(); 060 061 private static TableName TABLE_NAME = TableName.valueOf("Backoff"); 062 063 private static byte[] CF = Bytes.toBytes("cf"); 064 065 private static CountDownLatch ARRIVE = new CountDownLatch(1); 066 067 private static CountDownLatch RESUME = new CountDownLatch(1); 068 069 public static final class DummyServerProcedure extends Procedure<MasterProcedureEnv> 070 implements ServerProcedureInterface { 071 072 private ServerName serverName; 073 074 public DummyServerProcedure() { 075 } 076 077 public DummyServerProcedure(ServerName serverName) { 078 this.serverName = serverName; 079 } 080 081 @Override 082 public ServerName getServerName() { 083 return serverName; 084 } 085 086 @Override 087 public boolean hasMetaTableRegion() { 088 return false; 089 } 090 091 @Override 092 public ServerOperationType getServerOperationType() { 093 return ServerOperationType.CRASH_HANDLER; 094 } 095 096 @Override 097 protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env) 098 throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException { 099 ARRIVE.countDown(); 100 RESUME.await(); 101 return null; 102 } 103 104 @Override 105 protected LockState acquireLock(final MasterProcedureEnv env) { 106 if (env.getProcedureScheduler().waitServerExclusiveLock(this, getServerName())) { 107 return LockState.LOCK_EVENT_WAIT; 108 } 109 return LockState.LOCK_ACQUIRED; 110 } 111 112 @Override 113 protected void releaseLock(final MasterProcedureEnv env) { 114 env.getProcedureScheduler().wakeServerExclusiveLock(this, getServerName()); 115 } 116 117 @Override 118 protected boolean holdLock(MasterProcedureEnv env) { 119 return true; 120 } 121 122 @Override 123 protected void rollback(MasterProcedureEnv env) throws IOException, InterruptedException { 124 } 125 126 @Override 127 protected boolean abort(MasterProcedureEnv env) { 128 return false; 129 } 130 131 @Override 132 protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException { 133 134 } 135 136 @Override 137 protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException { 138 } 139 } 140 141 @BeforeAll 142 public static void setUp() throws Exception { 143 UTIL.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1); 144 UTIL.startMiniCluster(3); 145 UTIL.createTable(TABLE_NAME, CF); 146 UTIL.getAdmin().balancerSwitch(false, true); 147 HRegionServer srcRs = UTIL.getRSForFirstRegionInTable(TABLE_NAME); 148 if (!srcRs.getRegions(TableName.META_TABLE_NAME).isEmpty()) { 149 RegionInfo metaRegion = srcRs.getRegions(TableName.META_TABLE_NAME).get(0).getRegionInfo(); 150 HRegionServer dstRs = UTIL.getOtherRegionServer(srcRs); 151 UTIL.getAdmin().move(metaRegion.getEncodedNameAsBytes(), dstRs.getServerName()); 152 UTIL.waitFor(30000, () -> !dstRs.getRegions(TableName.META_TABLE_NAME).isEmpty()); 153 } 154 } 155 156 @AfterAll 157 public static void tearDown() throws Exception { 158 UTIL.shutdownMiniCluster(); 159 } 160 161 @Test 162 public void testRetryBackoff() throws IOException, InterruptedException { 163 HRegionServer srcRs = UTIL.getRSForFirstRegionInTable(TABLE_NAME); 164 RegionInfo region = srcRs.getRegions(TABLE_NAME).get(0).getRegionInfo(); 165 HRegionServer dstRs = UTIL.getOtherRegionServer(srcRs); 166 ProcedureExecutor<MasterProcedureEnv> procExec = 167 UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(); 168 procExec.submitProcedure(new DummyServerProcedure(srcRs.getServerName())); 169 ARRIVE.await(); 170 UTIL.getMiniHBaseCluster().killRegionServer(srcRs.getServerName()); 171 UTIL.waitFor(30000, 172 () -> procExec.getProcedures().stream().anyMatch(p -> p instanceof ServerCrashProcedure)); 173 Thread t = new Thread(() -> { 174 try { 175 UTIL.getAdmin().move(region.getEncodedNameAsBytes(), dstRs.getServerName()); 176 } catch (IOException e) { 177 LOG.info("Failed move of {}", region.getRegionNameAsString(), e); 178 } 179 }); 180 t.start(); 181 // wait until we enter the WAITING_TIMEOUT state 182 ProcedureTestUtil.waitUntilProcedureWaitingTimeout(UTIL, TransitRegionStateProcedure.class, 183 30000); 184 // wait until the timeout value increase three times 185 ProcedureTestUtil.waitUntilProcedureTimeoutIncrease(UTIL, TransitRegionStateProcedure.class, 3); 186 // close connection to make sure that we can not finish the TRSP 187 final HMaster master = UTIL.getMiniHBaseCluster().getMaster(); 188 master.getConnection().close(); 189 RESUME.countDown(); 190 UTIL.waitFor(30000, () -> !master.isAlive()); 191 // here we start a new master 192 HMaster master2 = UTIL.getMiniHBaseCluster().startMaster().getMaster(); 193 LOG.info("Master2 {}, joining move thread", master2.getServerName()); 194 t.join(); 195 // Make sure that the region is online, it may not on the original target server, as we will set 196 // forceNewPlan to true if there is a server crash 197 try (Table table = UTIL.getConnection().getTable(TABLE_NAME)) { 198 table.put(new Put(Bytes.toBytes(1)).addColumn(CF, Bytes.toBytes("cq"), Bytes.toBytes(1))); 199 } 200 // Make sure that the region is online, it may not be on the original target server, as we will 201 // set forceNewPlan to true if there is a server crash. 202 try (Table table = UTIL.getConnection().getTable(TABLE_NAME)) { 203 table.put(new Put(Bytes.toBytes(1)).addColumn(CF, Bytes.toBytes("cq"), Bytes.toBytes(1))); 204 } 205 } 206}