/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master.assignment;

import java.io.IOException;
import java.util.concurrent.CountDownLatch;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.ProcedureTestUtil;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface;
import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;

/**
 * Confirm that we will do backoff when retrying on closing a region, to avoid consuming all the
 * CPUs.
 * <p>
 * The scenario exercised here: a {@link DummyServerProcedure} is parked on the exclusive lock of
 * the region server that hosts the test table, that region server is then killed, and a region
 * move is requested while the crash is being handled. The test waits for the
 * {@link TransitRegionStateProcedure} to reach the waiting-timeout state and for its timeout to
 * increase, restarts the master, and finally checks that the region is writable again.
 */
@Category({ MasterTests.class, MediumTests.class })
public class TestCloseRegionWhileRSCrash {

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestCloseRegionWhileRSCrash.class);

  private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();

  /** Table created in {@link #setUp()} whose single region is moved by the test. */
  private static final TableName TABLE_NAME = TableName.valueOf("Backoff");

  /** The only column family of {@link #TABLE_NAME}. */
  private static final byte[] CF = Bytes.toBytes("cf");

  /**
   * Counted down by {@link DummyServerProcedure#execute(MasterProcedureEnv)} as soon as it starts
   * running; the test awaits it before killing the region server.
   */
  private static final CountDownLatch ARRIVE = new CountDownLatch(1);

  /**
   * Awaited by {@link DummyServerProcedure#execute(MasterProcedureEnv)}; the test counts it down
   * to let the dummy procedure return.
   */
  private static final CountDownLatch RESUME = new CountDownLatch(1);

  /**
   * A server procedure that performs no real work. It asks the procedure scheduler for the
   * exclusive lock of {@link #getServerName()} and then blocks inside
   * {@link #execute(MasterProcedureEnv)} until {@link #RESUME} is counted down, so the test can
   * control how long that lock stays taken.
   * <p>
   * It has no state to persist, so the (de)serialization hooks are intentionally empty.
   */
  public static final class DummyServerProcedure extends Procedure<MasterProcedureEnv>
    implements ServerProcedureInterface {

    private ServerName serverName;

    /**
     * No-arg constructor; leaves the server name unset. Presumably required so the procedure
     * framework can instantiate the class reflectively -- TODO confirm.
     */
    public DummyServerProcedure() {
    }

    /**
     * @param serverName the server whose exclusive lock this procedure will request
     */
    public DummyServerProcedure(ServerName serverName) {
      this.serverName = serverName;
    }

    @Override
    public ServerName getServerName() {
      return serverName;
    }

    @Override
    public boolean hasMetaTableRegion() {
      return false;
    }

    /** Reports itself as a crash handler operation. */
    @Override
    public ServerOperationType getServerOperationType() {
      return ServerOperationType.CRASH_HANDLER;
    }

    /**
     * Signals {@link #ARRIVE}, then blocks until {@link #RESUME} is counted down.
     * @return always {@code null}, i.e. no sub-procedures are produced
     * @throws InterruptedException if the thread is interrupted while waiting on {@link #RESUME}
     */
    @Override
    protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env)
      throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException {
      ARRIVE.countDown();
      RESUME.await();
      return null;
    }

    /**
     * Requests the exclusive lock of the target server from the scheduler.
     * @return {@link LockState#LOCK_EVENT_WAIT} when the scheduler reports that the procedure has
     *         to wait, {@link LockState#LOCK_ACQUIRED} otherwise
     */
    @Override
    protected LockState acquireLock(final MasterProcedureEnv env) {
      if (env.getProcedureScheduler().waitServerExclusiveLock(this, getServerName())) {
        return LockState.LOCK_EVENT_WAIT;
      }
      return LockState.LOCK_ACQUIRED;
    }

    /** Hands the exclusive lock of the target server back to the scheduler. */
    @Override
    protected void releaseLock(final MasterProcedureEnv env) {
      env.getProcedureScheduler().wakeServerExclusiveLock(this, getServerName());
    }

    // NOTE(review): returning true presumably keeps the lock for the whole life of the procedure
    // rather than per execution step -- confirm against Procedure#holdLock.
    @Override
    protected boolean holdLock(MasterProcedureEnv env) {
      return true;
    }

    /** Nothing to undo. */
    @Override
    protected void rollback(MasterProcedureEnv env) throws IOException, InterruptedException {
    }

    /** Abort is not supported; always returns {@code false}. */
    @Override
    protected boolean abort(MasterProcedureEnv env) {
      return false;
    }

    /** No state to persist. */
    @Override
    protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {
    }

    /** No state to restore. */
    @Override
    protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
    }
  }

  /**
   * Starts the mini cluster, creates the test table, switches the balancer off and, if the region
   * server hosting the test table also hosts a meta region, moves that meta region to another
   * region server and waits until it is online there.
   */
  @BeforeClass
  public static void setUp() throws Exception {
    UTIL.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
    UTIL.startMiniCluster(3);
    UTIL.createTable(TABLE_NAME, CF);
    UTIL.getAdmin().balancerSwitch(false, true);
    HRegionServer srcRs = UTIL.getRSForFirstRegionInTable(TABLE_NAME);
    if (!srcRs.getRegions(TableName.META_TABLE_NAME).isEmpty()) {
      // The test kills srcRs later on, so get meta off it first.
      RegionInfo metaRegion = srcRs.getRegions(TableName.META_TABLE_NAME).get(0).getRegionInfo();
      HRegionServer dstRs = UTIL.getOtherRegionServer(srcRs);
      UTIL.getAdmin().move(metaRegion.getEncodedNameAsBytes(), dstRs.getServerName());
      UTIL.waitFor(30000, () -> !dstRs.getRegions(TableName.META_TABLE_NAME).isEmpty());
    }
  }

  @AfterClass
  public static void tearDown() throws Exception {
    UTIL.shutdownMiniCluster();
  }

  /**
   * Parks a {@link DummyServerProcedure} on the source region server, kills that server, requests
   * a region move from a background thread, waits for the {@link TransitRegionStateProcedure} to
   * back off, then restarts the master and verifies that the region accepts a write.
   */
  @Test
  public void testRetryBackoff() throws IOException, InterruptedException {
    HRegionServer srcRs = UTIL.getRSForFirstRegionInTable(TABLE_NAME);
    RegionInfo region = srcRs.getRegions(TABLE_NAME).get(0).getRegionInfo();
    HRegionServer dstRs = UTIL.getOtherRegionServer(srcRs);
    ProcedureExecutor<MasterProcedureEnv> procExec =
      UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
    // occupy the exclusive lock of the source server before it is killed
    procExec.submitProcedure(new DummyServerProcedure(srcRs.getServerName()));
    ARRIVE.await();
    UTIL.getMiniHBaseCluster().killRegionServer(srcRs.getServerName());
    // wait until the crash of the source server has been noticed and an SCP has been submitted
    UTIL.waitFor(30000,
      () -> procExec.getProcedures().stream().anyMatch(p -> p instanceof ServerCrashProcedure));
    Thread t = new Thread(() -> {
      try {
        UTIL.getAdmin().move(region.getEncodedNameAsBytes(), dstRs.getServerName());
      } catch (IOException ignored) {
        // Deliberately best-effort: the master is brought down below while this call may still
        // be in flight, so a failure here is tolerated. The final put is what verifies that the
        // region ends up online.
      }
    });
    t.start();
    // wait until we enter the WAITING_TIMEOUT state
    ProcedureTestUtil.waitUntilProcedureWaitingTimeout(UTIL, TransitRegionStateProcedure.class,
      30000);
    // wait until the timeout value has increased three times
    ProcedureTestUtil.waitUntilProcedureTimeoutIncrease(UTIL, TransitRegionStateProcedure.class, 3);
    // close the connection to make sure that we can not finish the TRSP
    HMaster master = UTIL.getMiniHBaseCluster().getMaster();
    master.getConnection().close();
    RESUME.countDown();
    UTIL.waitFor(30000, () -> !master.isAlive());
    // here we start a new master
    UTIL.getMiniHBaseCluster().startMaster();
    t.join();
    // Make sure that the region is online. It may not be on the original target server, as we
    // will set forceNewPlan to true if there is a server crash.
    try (Table table = UTIL.getConnection().getTable(TABLE_NAME)) {
      table.put(new Put(Bytes.toBytes(1)).addColumn(CF, Bytes.toBytes("cq"), Bytes.toBytes(1)));
    }
  }
}