/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master.assignment;

import java.io.IOException;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.PleaseHoldException;
import org.apache.hadoop.hbase.StartMiniClusterOption;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.RegionPlan;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.zookeeper.KeeperException;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse;

/**
 * See HBASE-22060 and HBASE-22074 for more details.
 * <p>
 * The test moves a region of the test table and parks the resulting {@code OPENED} report inside
 * the master's assignment manager. While the report is parked it closes the master's ZooKeeper
 * connection, waits for a different master to become active and initialized, and waits for an
 * {@link OpenRegionProcedure} for the test table to show up in the procedure executor. It then
 * lets the parked report and any pending master abort proceed, and finally asserts that the
 * {@link OpenRegionProcedure} and its parent both finish.
 * <p>
 * The test thread and the master threads are coordinated through four static latches. The latch
 * fields are {@code volatile} because they are written by the test thread and read by master
 * threads.
 */
@Category({ MasterTests.class, MediumTests.class })
public class TestOpenRegionProcedureHang {

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestOpenRegionProcedureHang.class);

  private static final Logger LOG = LoggerFactory.getLogger(TestOpenRegionProcedureHang.class);

  /**
   * Counted down by the master once the OPENED report for the test table has arrived. It also acts
   * as the "armed" flag: interception only happens while this field is non-null, so the test
   * assigns it after the other latches.
   */
  private static volatile CountDownLatch ARRIVE;

  /** Released by the test to let the parked report continue into the real implementation. */
  private static volatile CountDownLatch RESUME;

  /** Counted down by the master once the real reportRegionStateTransition call has completed. */
  private static volatile CountDownLatch FINISH;

  /** Released by the test to let a pending {@link HMasterForTest#abort(String, Throwable)} run. */
  private static volatile CountDownLatch ABORT;

  /**
   * Assignment manager which parks the OPENED report for the test table until the test resumes it.
   */
  private static final class AssignmentManagerForTest extends AssignmentManager {

    public AssignmentManagerForTest(MasterServices master) {
      super(master);
    }

    /**
     * Delegates to the real implementation. If the first transition in the request is an OPENED
     * report for the test table and the test has armed {@link #ARRIVE}, the call first signals
     * arrival, then blocks until {@link #RESUME} is released, and signals {@link #FINISH} once the
     * real implementation has returned or thrown.
     */
    @Override
    public ReportRegionStateTransitionResponse reportRegionStateTransition(
      ReportRegionStateTransitionRequest req) throws PleaseHoldException {
      RegionStateTransition transition = req.getTransition(0);
      boolean openedForTestTable = transition.getTransitionCode() == TransitionCode.OPENED &&
        ProtobufUtil.toTableName(transition.getRegionInfo(0).getTableName()).equals(NAME);
      // Work on snapshots of the latches: the test thread nulls the static fields while this call
      // may still be in flight (FINISH is cleared after a timed wait), so dereferencing the fields
      // again later could throw a NullPointerException out of the finally block below.
      CountDownLatch arrive = ARRIVE;
      if (!openedForTestTable || arrive == null) {
        return super.reportRegionStateTransition(req);
      }
      CountDownLatch resume = RESUME;
      CountDownLatch finish = FINISH;
      arrive.countDown();
      try {
        resume.await();
        RESUME = null;
      } catch (InterruptedException e) {
        // keep the interrupt status visible to the caller
        Thread.currentThread().interrupt();
        throw new RuntimeException(e);
      }
      try {
        return super.reportRegionStateTransition(req);
      } finally {
        finish.countDown();
      }
    }
  }

  /**
   * Master which plugs in {@link AssignmentManagerForTest} and holds back aborts until the test
   * releases {@link #ABORT}.
   */
  public static final class HMasterForTest extends HMaster {

    public HMasterForTest(Configuration conf) throws IOException {
      super(conf);
    }

    @Override
    protected AssignmentManager createAssignmentManager(MasterServices master) {
      return new AssignmentManagerForTest(master);
    }

    @Override
    public void abort(String reason, Throwable cause) {
      // hang here so we can finish the reportRegionStateTransition call, which is the most
      // important part to reproduce the bug
      CountDownLatch abort = ABORT;
      if (abort != null) {
        try {
          abort.await();
          ABORT = null;
        } catch (InterruptedException e) {
          // keep the interrupt status visible to the caller
          Thread.currentThread().interrupt();
          throw new RuntimeException(e);
        }
      }
      super.abort(reason, cause);
    }
  }

  private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();

  private static final TableName NAME = TableName.valueOf("Open");

  private static final byte[] CF = Bytes.toBytes("cf");

  /**
   * Starts a mini cluster with two masters and three region servers using
   * {@link HMasterForTest}, creates the test table and switches the balancer off.
   */
  @BeforeClass
  public static void setUp() throws Exception {
    Configuration conf = UTIL.getConfiguration();
    conf.setClass(HConstants.MASTER_IMPL, HMasterForTest.class, HMaster.class);

    // make sure we do not timeout when calling reportRegionStateTransition
    conf.setInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 10 * 60 * 1000);
    conf.setInt(HConstants.HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY, 10 * 60 * 1000);
    UTIL
      .startMiniCluster(StartMiniClusterOption.builder().numMasters(2).numRegionServers(3).build());
    UTIL.createTable(NAME, CF);
    UTIL.waitTableAvailable(NAME);
    UTIL.getAdmin().balancerSwitch(false, true);
  }

  @AfterClass
  public static void tearDown() throws Exception {
    UTIL.shutdownMiniCluster();
  }

  /**
   * Returns the first {@link OpenRegionProcedure} for the test table currently known to the given
   * executor, or {@code null} if there is none.
   */
  private static OpenRegionProcedure
    findOpenRegionProcedure(ProcedureExecutor<MasterProcedureEnv> procExec) {
    return procExec.getProcedures().stream().filter(p -> p instanceof OpenRegionProcedure)
      .map(p -> (OpenRegionProcedure) p).filter(p -> p.region.getTable().equals(NAME)).findFirst()
      .orElse(null);
  }

  @Test
  public void test() throws InterruptedException, KeeperException, IOException {
    RegionInfo region = UTIL.getMiniHBaseCluster().getRegions(NAME).get(0).getRegionInfo();
    AssignmentManager am = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager();

    HRegionServer rs1 = UTIL.getRSForFirstRegionInTable(NAME);
    HRegionServer rs2 = UTIL.getOtherRegionServer(rs1);

    CountDownLatch arrive = new CountDownLatch(1);
    CountDownLatch resume = new CountDownLatch(1);
    CountDownLatch finish = new CountDownLatch(1);
    CountDownLatch abort = new CountDownLatch(1);
    RESUME = resume;
    FINISH = finish;
    ABORT = abort;
    // publish ARRIVE last: the assignment manager only intercepts once it sees it non-null, and by
    // then the other latches are guaranteed to be in place
    ARRIVE = arrive;
    try {
      am.moveAsync(new RegionPlan(region, rs1.getServerName(), rs2.getServerName()));

      arrive.await();
      ARRIVE = null;
      HMaster master = UTIL.getMiniHBaseCluster().getMaster();
      master.getZooKeeper().close();
      UTIL.waitFor(30000, () -> {
        for (MasterThread mt : UTIL.getMiniHBaseCluster().getMasterThreads()) {
          if (mt.getMaster() != master && mt.getMaster().isActiveMaster()) {
            return mt.getMaster().isInitialized();
          }
        }
        return false;
      });
      ProcedureExecutor<MasterProcedureEnv> procExec =
        UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
      UTIL.waitFor(30000, () -> findOpenRegionProcedure(procExec) != null);
      OpenRegionProcedure proc = findOpenRegionProcedure(procExec);
      if (proc == null) {
        throw new AssertionError("OpenRegionProcedure for table " + NAME +
          " is no longer present in the procedure executor");
      }
      // wait a bit to let the OpenRegionProcedure send out the request
      Thread.sleep(2000);
      resume.countDown();
      if (!finish.await(15, TimeUnit.SECONDS)) {
        LOG.info("Wait reportRegionStateTransition to finish timed out, this is possible if" +
          " we update the procedure store, as the WALProcedureStore" +
          " will retry forever to roll the writer if it is not closed");
      }
      FINISH = null;
      // if the reportRegionTransition is finished, wait a bit to let it return the data to RS
      Thread.sleep(2000);
      abort.countDown();

      UTIL.waitFor(30000, () -> procExec.isFinished(proc.getProcId()));
      UTIL.waitFor(30000, () -> procExec.isFinished(proc.getParentProcId()));
    } finally {
      // Never leave master threads parked on our latches if the test bails out early; counting
      // down an already released latch is a no-op on the success path.
      ARRIVE = null;
      resume.countDown();
      abort.countDown();
    }
  }
}