/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master.assignment;

import java.io.IOException;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseTestingUtil;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.PleaseHoldException;
import org.apache.hadoop.hbase.StartTestingClusterOption;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.RegionPlan;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.region.MasterRegion;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.zookeeper.KeeperException;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse;

/**
 * Holds an {@code OPENED} region state transition report inside the master while the master's
 * ZooKeeper connection is closed and another master becomes active and initialized, then lets the
 * report and the old master's abort proceed and verifies that the {@link OpenRegionProcedure} and
 * its parent procedure finish on the new master.
 * <p/>
 * See HBASE-22060 and HBASE-22074 for more details.
 */
@Tag(MasterTests.TAG)
@Tag(MediumTests.TAG)
public class TestOpenRegionProcedureHang {

  private static final Logger LOG = LoggerFactory.getLogger(TestOpenRegionProcedureHang.class);

  // The latches are written by the test thread and read by master RPC handler / abort threads,
  // hence volatile. Readers must copy a field into a local before using it because another
  // thread may reset the field to null at any time.

  /** Counted down by the master when the intercepted OPENED report arrives. */
  private static volatile CountDownLatch ARRIVE;

  /** Counted down by the test to let the intercepted report continue. */
  private static volatile CountDownLatch RESUME;

  /** Counted down by the master once the intercepted report has been processed. */
  private static volatile CountDownLatch FINISH;

  /** Counted down by the test to let a pending {@link HMaster#abort} continue. */
  private static volatile CountDownLatch ABORT;

  /**
   * Assignment manager which parks the OPENED report for the test table until the test resumes it.
   */
  private static final class AssignmentManagerForTest extends AssignmentManager {

    public AssignmentManagerForTest(MasterServices master, MasterRegion masterRegion) {
      super(master, masterRegion);
    }

    /**
     * Delegates to the real implementation, except that while {@link #ARRIVE} is set, an OPENED
     * report for a region of the test table first signals its arrival, then waits for
     * {@link #RESUME}, and finally signals {@link #FINISH} once the real implementation returned
     * or threw.
     */
    @Override
    public ReportRegionStateTransitionResponse reportRegionStateTransition(
      ReportRegionStateTransitionRequest req) throws PleaseHoldException {
      RegionStateTransition transition = req.getTransition(0);
      CountDownLatch arrive = ARRIVE;
      boolean intercept = arrive != null
        && transition.getTransitionCode() == TransitionCode.OPENED
        && ProtobufUtil.toTableName(transition.getRegionInfo(0).getTableName()).equals(NAME);
      if (!intercept) {
        return super.reportRegionStateTransition(req);
      }
      // Snapshot the remaining latches before signalling arrival: once the test thread is woken
      // up it is free to reset the static fields (it nulls FINISH after a timed wait).
      CountDownLatch resume = RESUME;
      CountDownLatch finish = FINISH;
      arrive.countDown();
      try {
        resume.await();
        RESUME = null;
      } catch (InterruptedException e) {
        // keep the interrupt status visible to the caller
        Thread.currentThread().interrupt();
        throw new RuntimeException(e);
      }
      try {
        return super.reportRegionStateTransition(req);
      } finally {
        finish.countDown();
      }
    }
  }

  /**
   * Master which installs {@link AssignmentManagerForTest} and can delay its own abort.
   */
  public static final class HMasterForTest extends HMaster {

    public HMasterForTest(Configuration conf) throws IOException {
      super(conf);
    }

    @Override
    protected AssignmentManager createAssignmentManager(MasterServices master,
      MasterRegion masterRegion) {
      return new AssignmentManagerForTest(master, masterRegion);
    }

    @Override
    public void abort(String reason, Throwable cause) {
      // hang here so we can finish the reportRegionStateTransition call, which is the most
      // important part to reproduce the bug
      CountDownLatch abort = ABORT;
      if (abort != null) {
        try {
          abort.await();
          ABORT = null;
        } catch (InterruptedException e) {
          // keep the interrupt status visible to the caller
          Thread.currentThread().interrupt();
          throw new RuntimeException(e);
        }
      }
      super.abort(reason, cause);
    }
  }

  private static final HBaseTestingUtil UTIL = new HBaseTestingUtil();

  private static final TableName NAME = TableName.valueOf("Open");

  private static final byte[] CF = Bytes.toBytes("cf");

  /**
   * Starts a mini cluster with two masters and three region servers using
   * {@link HMasterForTest}, creates the test table and switches the balancer off.
   */
  @BeforeAll
  public static void setUp() throws Exception {
    Configuration conf = UTIL.getConfiguration();
    conf.setClass(HConstants.MASTER_IMPL, HMasterForTest.class, HMaster.class);

    // make sure we do not timeout when calling reportRegionStateTransition
    conf.setInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 10 * 60 * 1000);
    conf.setInt(HConstants.HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY, 10 * 60 * 1000);
    UTIL.startMiniCluster(
      StartTestingClusterOption.builder().numMasters(2).numRegionServers(3).build());
    UTIL.createTable(NAME, CF);
    UTIL.waitTableAvailable(NAME);
    UTIL.getAdmin().balancerSwitch(false, true);
  }

  @AfterAll
  public static void tearDown() throws Exception {
    UTIL.shutdownMiniCluster();
  }

  @Test
  public void test() throws InterruptedException, KeeperException, IOException {
    RegionInfo region = UTIL.getMiniHBaseCluster().getRegions(NAME).get(0).getRegionInfo();
    AssignmentManager am = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager();

    HRegionServer rs1 = UTIL.getRSForFirstRegionInTable(NAME);
    HRegionServer rs2 = UTIL.getOtherRegionServer(rs1);

    CountDownLatch arrive = new CountDownLatch(1);
    CountDownLatch resume = new CountDownLatch(1);
    CountDownLatch finish = new CountDownLatch(1);
    CountDownLatch abort = new CountDownLatch(1);
    RESUME = resume;
    FINISH = finish;
    ABORT = abort;
    // ARRIVE is what arms the interception, so publish it last; the other latches are then
    // guaranteed to be visible to whichever thread observes it.
    ARRIVE = arrive;
    try {
      am.moveAsync(new RegionPlan(region, rs1.getServerName(), rs2.getServerName()));

      arrive.await();
      ARRIVE = null;
      HMaster master = UTIL.getMiniHBaseCluster().getMaster();
      master.getZooKeeper().close();
      // wait until a different master is active and has finished initialization
      UTIL.waitFor(30000, () -> {
        for (MasterThread mt : UTIL.getMiniHBaseCluster().getMasterThreads()) {
          if (mt.getMaster() != master && mt.getMaster().isActiveMaster()) {
            return mt.getMaster().isInitialized();
          }
        }
        return false;
      });
      ProcedureExecutor<MasterProcedureEnv> procExec =
        UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
      UTIL.waitFor(30000,
        () -> procExec.getProcedures().stream().filter(p -> p instanceof OpenRegionProcedure)
          .map(p -> (OpenRegionProcedure) p).anyMatch(p -> p.region.getTable().equals(NAME)));
      OpenRegionProcedure proc = procExec.getProcedures().stream()
        .filter(p -> p instanceof OpenRegionProcedure).map(p -> (OpenRegionProcedure) p)
        .filter(p -> p.region.getTable().equals(NAME)).findFirst()
        .orElseThrow(() -> new AssertionError("No OpenRegionProcedure found for table " + NAME));
      // wait a bit to let the OpenRegionProcedure send out the request
      Thread.sleep(2000);
      resume.countDown();
      if (!finish.await(15, TimeUnit.SECONDS)) {
        LOG.info("Wait reportRegionStateTransition to finish timed out, this is possible if"
          + " we update the procedure store, as the WALProcedureStore"
          + " will retry forever to roll the writer if it is not closed");
      }
      FINISH = null;
      // if the reportRegionTransition is finished, wait a bit to let it return the data to RS
      Thread.sleep(2000);
      abort.countDown();

      UTIL.waitFor(30000, () -> procExec.isFinished(proc.getProcId()));
      UTIL.waitFor(30000, () -> procExec.isFinished(proc.getParentProcId()));
    } finally {
      // On failure, make sure no master thread stays parked on one of our latches, otherwise
      // the cluster shutdown could block. On success these calls are no-ops.
      ARRIVE = null;
      resume.countDown();
      abort.countDown();
    }
  }
}