/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master.assignment;

import java.io.IOException;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.Future;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseTestingUtil;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.RegionPlan;
import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.master.region.MasterRegion;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FutureUtils;
import org.apache.zookeeper.KeeperException;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;

/**
 * Testcase for HBASE-23594.
 * <p>
 * The test pauses a region move inside {@code regionOpening}, kills the hosting region server so
 * that a {@link ServerCrashProcedure} is scheduled, pauses that procedure inside
 * {@code getRegionsOnServer}, and then resumes both in turn, asserting that each can finish.
 * <p>
 * The pausing is done with static {@link CountDownLatch} fields that are written by the test
 * thread and read by master background threads, hence they are {@code volatile}.
 */
@Tag(MasterTests.TAG)
@Tag(LargeTests.TAG)
public class TestRaceBetweenSCPAndTRSP {

  private static final HBaseTestingUtil UTIL = new HBaseTestingUtil();

  private static final TableName NAME = TableName.valueOf("Race");

  private static final byte[] CF = Bytes.toBytes("cf");

  /** Counted down (and then cleared) by the first {@code regionOpening} call for {@link #NAME}. */
  private static volatile CountDownLatch ARRIVE_REGION_OPENING;

  /** Awaited by the paused {@code regionOpening} call; counted down by the test to resume it. */
  private static volatile CountDownLatch RESUME_REGION_OPENING;

  /** Counted down (and then cleared) by the first {@code getRegionsOnServer} call. */
  private static volatile CountDownLatch ARRIVE_GET_REGIONS_ON_SERVER;

  /** Awaited by the paused {@code getRegionsOnServer} call; counted down by the test. */
  private static volatile CountDownLatch RESUME_GET_REGIONS_ON_SERVER;

  /**
   * An {@link AssignmentManager} that can park its callers on the latches above so the test can
   * control the interleaving between the region move and the server crash handling.
   */
  private static final class AssignmentManagerForTest extends AssignmentManager {

    public AssignmentManagerForTest(MasterServices master, MasterRegion masterRegion) {
      super(master, masterRegion);
    }

    /**
     * Delegates to the parent, waits for the returned future to complete, and then, for the first
     * call that targets a region of {@link #NAME} while the arrive latch is armed, signals arrival
     * and blocks until the test releases {@link #RESUME_REGION_OPENING}.
     * @return the future produced by the parent implementation
     */
    @Override
    CompletableFuture<Void> regionOpening(RegionStateNode regionNode) {
      CompletableFuture<Void> future = super.regionOpening(regionNode);
      try {
        // wait until the operation done, then trigger later processing, to make the test more
        // stable
        FutureUtils.get(future);
      } catch (IOException ignored) {
        // Deliberately ignored: the same future is returned below, so the caller still observes
        // the failure. We only wait here to order the latch handling after the operation.
      }
      // Snapshot the static so a concurrent caller clearing it cannot cause an NPE between the
      // null check and the countDown.
      CountDownLatch arrive = ARRIVE_REGION_OPENING;
      if (arrive != null && regionNode.getRegionInfo().getTable().equals(NAME)) {
        arrive.countDown();
        ARRIVE_REGION_OPENING = null;
        try {
          RESUME_REGION_OPENING.await();
        } catch (InterruptedException e) {
          // Stop waiting, but keep the interrupt visible to the calling thread.
          Thread.currentThread().interrupt();
        }
      }
      return future;
    }

    /**
     * Delegates to the parent and then, for the first call made while the arrive latch is armed,
     * signals arrival and blocks until the test releases {@link #RESUME_GET_REGIONS_ON_SERVER}.
     * @return the regions reported by the parent implementation, captured before blocking
     */
    @Override
    public List<RegionInfo> getRegionsOnServer(ServerName serverName) {
      List<RegionInfo> regions = super.getRegionsOnServer(serverName);
      // Snapshot the static so a concurrent caller clearing it cannot cause an NPE between the
      // null check and the countDown.
      CountDownLatch arrive = ARRIVE_GET_REGIONS_ON_SERVER;
      if (arrive != null) {
        arrive.countDown();
        ARRIVE_GET_REGIONS_ON_SERVER = null;
        try {
          RESUME_GET_REGIONS_ON_SERVER.await();
        } catch (InterruptedException e) {
          // Stop waiting, but keep the interrupt visible to the calling thread.
          Thread.currentThread().interrupt();
        }
      }
      return regions;
    }
  }

  /**
   * An {@link HMaster} that installs {@link AssignmentManagerForTest} as its assignment manager.
   */
  public static final class HMasterForTest extends HMaster {

    public HMasterForTest(Configuration conf) throws IOException, KeeperException {
      super(conf);
    }

    @Override
    protected AssignmentManager createAssignmentManager(MasterServices master,
      MasterRegion masterRegion) {
      return new AssignmentManagerForTest(master, masterRegion);
    }
  }

  @BeforeAll
  public static void setUp() throws Exception {
    UTIL.getConfiguration().setClass(HConstants.MASTER_IMPL, HMasterForTest.class, HMaster.class);
    UTIL.startMiniCluster(2);
    UTIL.createTable(NAME, CF);
    UTIL.waitTableAvailable(NAME);
    UTIL.getAdmin().balancerSwitch(false, true);
  }

  @AfterAll
  public static void tearDown() throws Exception {
    UTIL.shutdownMiniCluster();
  }

  @Test
  public void test() throws Exception {
    RegionInfo region = UTIL.getMiniHBaseCluster().getRegions(NAME).get(0).getRegionInfo();
    AssignmentManager am = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager();
    ServerName sn = am.getRegionStates().getRegionState(region).getServerName();

    // Keep local references to the ARRIVE latches: the statics get nulled in background threads,
    // so awaiting through the static would NPE.
    // Publish each RESUME latch before its ARRIVE latch, so that a background thread which sees
    // the ARRIVE latch armed is guaranteed to also see a non-null RESUME latch to wait on.
    RESUME_REGION_OPENING = new CountDownLatch(1);
    CountDownLatch arriveRegionOpening = new CountDownLatch(1);
    ARRIVE_REGION_OPENING = arriveRegionOpening;
    RESUME_GET_REGIONS_ON_SERVER = new CountDownLatch(1);
    CountDownLatch arriveGetRegionsOnServer = new CountDownLatch(1);
    ARRIVE_GET_REGIONS_ON_SERVER = arriveGetRegionsOnServer;

    Future<byte[]> moveFuture = am.moveAsync(new RegionPlan(region, sn, sn));
    arriveRegionOpening.await();

    // Kill the region server and trigger a SCP
    UTIL.getMiniHBaseCluster().killRegionServer(sn);
    // Wait until the SCP reaches the getRegionsOnServer call
    arriveGetRegionsOnServer.await();
    RSProcedureDispatcher remoteDispatcher = UTIL.getMiniHBaseCluster().getMaster()
      .getMasterProcedureExecutor().getEnvironment().getRemoteDispatcher();
    // This is necessary for making the UT stable. The problem here is that, in
    // ServerManager.expireServer, we will submit the SCP and then the SCP will be executed in
    // another thread (the PEWorker), so when we reach the above getRegionsOnServer call in SCP, it
    // is still possible that the expireServer call has not finished, so the remote dispatcher
    // still thinks it can dispatch the TRSP. In this way we would be in a dead lock, as the TRSP
    // will not schedule a new ORP since it relies on the SCP to wake it up after everything is OK.
    // This is not what we want to test in this UT, so we need to wait here to prevent this from
    // happening. See HBASE-27277 for a more detailed analysis.
    UTIL.waitFor(15000, () -> !remoteDispatcher.hasNode(sn));

    // Resume the TRSP, it should be able to finish
    RESUME_REGION_OPENING.countDown();
    moveFuture.get();

    ProcedureExecutor<?> procExec =
      UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
    long scpProcId =
      procExec.getProcedures().stream().filter(p -> p instanceof ServerCrashProcedure)
        .map(p -> (ServerCrashProcedure) p).findAny()
        .orElseThrow(() -> new AssertionError("No ServerCrashProcedure found for " + sn))
        .getProcId();
    // Resume the SCP and make sure it can finish too
    RESUME_GET_REGIONS_ON_SERVER.countDown();
    UTIL.waitFor(60000, () -> procExec.isFinished(scpProcId));
  }
}