001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.assignment;
019
020import java.io.IOException;
021import java.util.concurrent.CountDownLatch;
022import java.util.concurrent.TimeUnit;
023import org.apache.hadoop.conf.Configuration;
024import org.apache.hadoop.hbase.HBaseTestingUtil;
025import org.apache.hadoop.hbase.HConstants;
026import org.apache.hadoop.hbase.PleaseHoldException;
027import org.apache.hadoop.hbase.StartTestingClusterOption;
028import org.apache.hadoop.hbase.TableName;
029import org.apache.hadoop.hbase.client.RegionInfo;
030import org.apache.hadoop.hbase.master.HMaster;
031import org.apache.hadoop.hbase.master.MasterServices;
032import org.apache.hadoop.hbase.master.RegionPlan;
033import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
034import org.apache.hadoop.hbase.master.region.MasterRegion;
035import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
036import org.apache.hadoop.hbase.regionserver.HRegionServer;
037import org.apache.hadoop.hbase.testclassification.MasterTests;
038import org.apache.hadoop.hbase.testclassification.MediumTests;
039import org.apache.hadoop.hbase.util.Bytes;
040import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
041import org.apache.zookeeper.KeeperException;
042import org.junit.jupiter.api.AfterAll;
043import org.junit.jupiter.api.BeforeAll;
044import org.junit.jupiter.api.Tag;
045import org.junit.jupiter.api.Test;
046import org.slf4j.Logger;
047import org.slf4j.LoggerFactory;
048
049import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
050import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
051import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
052import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
053import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse;
054
055/**
056 * See HBASE-22060 and HBASE-22074 for more details.
057 */
058@Tag(MasterTests.TAG)
059@Tag(MediumTests.TAG)
060public class TestOpenRegionProcedureHang {
061
062  private static final Logger LOG = LoggerFactory.getLogger(TestOpenRegionProcedureHang.class);
063
064  private static CountDownLatch ARRIVE;
065  private static CountDownLatch RESUME;
066
067  private static CountDownLatch FINISH;
068
069  private static CountDownLatch ABORT;
070
071  private static final class AssignmentManagerForTest extends AssignmentManager {
072
073    public AssignmentManagerForTest(MasterServices master, MasterRegion masterRegion) {
074      super(master, masterRegion);
075    }
076
077    @Override
078    public ReportRegionStateTransitionResponse reportRegionStateTransition(
079      ReportRegionStateTransitionRequest req) throws PleaseHoldException {
080      RegionStateTransition transition = req.getTransition(0);
081      if (
082        transition.getTransitionCode() == TransitionCode.OPENED
083          && ProtobufUtil.toTableName(transition.getRegionInfo(0).getTableName()).equals(NAME)
084          && ARRIVE != null
085      ) {
086        ARRIVE.countDown();
087        try {
088          RESUME.await();
089          RESUME = null;
090        } catch (InterruptedException e) {
091          throw new RuntimeException(e);
092        }
093        try {
094          return super.reportRegionStateTransition(req);
095        } finally {
096          FINISH.countDown();
097        }
098      } else {
099        return super.reportRegionStateTransition(req);
100      }
101    }
102  }
103
104  public static final class HMasterForTest extends HMaster {
105
106    public HMasterForTest(Configuration conf) throws IOException {
107      super(conf);
108    }
109
110    @Override
111    protected AssignmentManager createAssignmentManager(MasterServices master,
112      MasterRegion masterRegion) {
113      return new AssignmentManagerForTest(master, masterRegion);
114    }
115
116    @Override
117    public void abort(String reason, Throwable cause) {
118      // hang here so we can finish the reportRegionStateTransition call, which is the most
119      // important part to reproduce the bug
120      if (ABORT != null) {
121        try {
122          ABORT.await();
123          ABORT = null;
124        } catch (InterruptedException e) {
125          throw new RuntimeException(e);
126        }
127      }
128      super.abort(reason, cause);
129    }
130  }
131
132  private static final HBaseTestingUtil UTIL = new HBaseTestingUtil();
133
134  private static TableName NAME = TableName.valueOf("Open");
135
136  private static byte[] CF = Bytes.toBytes("cf");
137
138  @BeforeAll
139  public static void setUp() throws Exception {
140    Configuration conf = UTIL.getConfiguration();
141    conf.setClass(HConstants.MASTER_IMPL, HMasterForTest.class, HMaster.class);
142
143    // make sure we do not timeout when caling reportRegionStateTransition
144    conf.setInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 10 * 60 * 1000);
145    conf.setInt(HConstants.HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY, 10 * 60 * 1000);
146    UTIL.startMiniCluster(
147      StartTestingClusterOption.builder().numMasters(2).numRegionServers(3).build());
148    UTIL.createTable(NAME, CF);
149    UTIL.waitTableAvailable(NAME);
150    UTIL.getAdmin().balancerSwitch(false, true);
151  }
152
153  @AfterAll
154  public static void tearDown() throws Exception {
155    UTIL.shutdownMiniCluster();
156  }
157
158  @Test
159  public void test() throws InterruptedException, KeeperException, IOException {
160    RegionInfo region = UTIL.getMiniHBaseCluster().getRegions(NAME).get(0).getRegionInfo();
161    AssignmentManager am = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager();
162
163    HRegionServer rs1 = UTIL.getRSForFirstRegionInTable(NAME);
164    HRegionServer rs2 = UTIL.getOtherRegionServer(rs1);
165
166    ARRIVE = new CountDownLatch(1);
167    RESUME = new CountDownLatch(1);
168    FINISH = new CountDownLatch(1);
169    ABORT = new CountDownLatch(1);
170    am.moveAsync(new RegionPlan(region, rs1.getServerName(), rs2.getServerName()));
171
172    ARRIVE.await();
173    ARRIVE = null;
174    HMaster master = UTIL.getMiniHBaseCluster().getMaster();
175    master.getZooKeeper().close();
176    UTIL.waitFor(30000, () -> {
177      for (MasterThread mt : UTIL.getMiniHBaseCluster().getMasterThreads()) {
178        if (mt.getMaster() != master && mt.getMaster().isActiveMaster()) {
179          return mt.getMaster().isInitialized();
180        }
181      }
182      return false;
183    });
184    ProcedureExecutor<MasterProcedureEnv> procExec =
185      UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
186    UTIL.waitFor(30000,
187      () -> procExec.getProcedures().stream().filter(p -> p instanceof OpenRegionProcedure)
188        .map(p -> (OpenRegionProcedure) p).anyMatch(p -> p.region.getTable().equals(NAME)));
189    OpenRegionProcedure proc = procExec.getProcedures().stream()
190      .filter(p -> p instanceof OpenRegionProcedure).map(p -> (OpenRegionProcedure) p)
191      .filter(p -> p.region.getTable().equals(NAME)).findFirst().get();
192    // wait a bit to let the OpenRegionProcedure send out the request
193    Thread.sleep(2000);
194    RESUME.countDown();
195    if (!FINISH.await(15, TimeUnit.SECONDS)) {
196      LOG.info("Wait reportRegionStateTransition to finish timed out, this is possible if"
197        + " we update the procedure store, as the WALProcedureStore"
198        + " will retry forever to roll the writer if it is not closed");
199    }
200    FINISH = null;
201    // if the reportRegionTransition is finished, wait a bit to let it return the data to RS
202    Thread.sleep(2000);
203    ABORT.countDown();
204
205    UTIL.waitFor(30000, () -> procExec.isFinished(proc.getProcId()));
206    UTIL.waitFor(30000, () -> procExec.isFinished(proc.getParentProcId()));
207  }
208}