001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.assignment;
019
020import java.io.IOException;
021import java.util.concurrent.CountDownLatch;
022import java.util.concurrent.TimeUnit;
023import org.apache.hadoop.conf.Configuration;
024import org.apache.hadoop.hbase.HBaseClassTestRule;
025import org.apache.hadoop.hbase.HBaseTestingUtil;
026import org.apache.hadoop.hbase.HConstants;
027import org.apache.hadoop.hbase.PleaseHoldException;
028import org.apache.hadoop.hbase.StartTestingClusterOption;
029import org.apache.hadoop.hbase.TableName;
030import org.apache.hadoop.hbase.client.RegionInfo;
031import org.apache.hadoop.hbase.master.HMaster;
032import org.apache.hadoop.hbase.master.MasterServices;
033import org.apache.hadoop.hbase.master.RegionPlan;
034import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
035import org.apache.hadoop.hbase.master.region.MasterRegion;
036import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
037import org.apache.hadoop.hbase.regionserver.HRegionServer;
038import org.apache.hadoop.hbase.testclassification.MasterTests;
039import org.apache.hadoop.hbase.testclassification.MediumTests;
040import org.apache.hadoop.hbase.util.Bytes;
041import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
042import org.apache.zookeeper.KeeperException;
043import org.junit.AfterClass;
044import org.junit.BeforeClass;
045import org.junit.ClassRule;
046import org.junit.Test;
047import org.junit.experimental.categories.Category;
048import org.slf4j.Logger;
049import org.slf4j.LoggerFactory;
050
051import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
052import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
053import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
054import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
055import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse;
056
057/**
058 * See HBASE-22060 and HBASE-22074 for more details.
059 */
060@Category({ MasterTests.class, MediumTests.class })
061public class TestOpenRegionProcedureHang {
062
063  @ClassRule
064  public static final HBaseClassTestRule CLASS_RULE =
065    HBaseClassTestRule.forClass(TestOpenRegionProcedureHang.class);
066
067  private static final Logger LOG = LoggerFactory.getLogger(TestOpenRegionProcedureHang.class);
068
069  private static CountDownLatch ARRIVE;
070  private static CountDownLatch RESUME;
071
072  private static CountDownLatch FINISH;
073
074  private static CountDownLatch ABORT;
075
076  private static final class AssignmentManagerForTest extends AssignmentManager {
077
078    public AssignmentManagerForTest(MasterServices master, MasterRegion masterRegion) {
079      super(master, masterRegion);
080    }
081
082    @Override
083    public ReportRegionStateTransitionResponse reportRegionStateTransition(
084      ReportRegionStateTransitionRequest req) throws PleaseHoldException {
085      RegionStateTransition transition = req.getTransition(0);
086      if (
087        transition.getTransitionCode() == TransitionCode.OPENED
088          && ProtobufUtil.toTableName(transition.getRegionInfo(0).getTableName()).equals(NAME)
089          && ARRIVE != null
090      ) {
091        ARRIVE.countDown();
092        try {
093          RESUME.await();
094          RESUME = null;
095        } catch (InterruptedException e) {
096          throw new RuntimeException(e);
097        }
098        try {
099          return super.reportRegionStateTransition(req);
100        } finally {
101          FINISH.countDown();
102        }
103      } else {
104        return super.reportRegionStateTransition(req);
105      }
106    }
107  }
108
109  public static final class HMasterForTest extends HMaster {
110
111    public HMasterForTest(Configuration conf) throws IOException {
112      super(conf);
113    }
114
115    @Override
116    protected AssignmentManager createAssignmentManager(MasterServices master,
117      MasterRegion masterRegion) {
118      return new AssignmentManagerForTest(master, masterRegion);
119    }
120
121    @Override
122    public void abort(String reason, Throwable cause) {
123      // hang here so we can finish the reportRegionStateTransition call, which is the most
124      // important part to reproduce the bug
125      if (ABORT != null) {
126        try {
127          ABORT.await();
128          ABORT = null;
129        } catch (InterruptedException e) {
130          throw new RuntimeException(e);
131        }
132      }
133      super.abort(reason, cause);
134    }
135  }
136
137  private static final HBaseTestingUtil UTIL = new HBaseTestingUtil();
138
139  private static TableName NAME = TableName.valueOf("Open");
140
141  private static byte[] CF = Bytes.toBytes("cf");
142
143  @BeforeClass
144  public static void setUp() throws Exception {
145    Configuration conf = UTIL.getConfiguration();
146    conf.setClass(HConstants.MASTER_IMPL, HMasterForTest.class, HMaster.class);
147
148    // make sure we do not timeout when caling reportRegionStateTransition
149    conf.setInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 10 * 60 * 1000);
150    conf.setInt(HConstants.HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY, 10 * 60 * 1000);
151    UTIL.startMiniCluster(
152      StartTestingClusterOption.builder().numMasters(2).numRegionServers(3).build());
153    UTIL.createTable(NAME, CF);
154    UTIL.waitTableAvailable(NAME);
155    UTIL.getAdmin().balancerSwitch(false, true);
156  }
157
158  @AfterClass
159  public static void tearDown() throws Exception {
160    UTIL.shutdownMiniCluster();
161  }
162
163  @Test
164  public void test() throws InterruptedException, KeeperException, IOException {
165    RegionInfo region = UTIL.getMiniHBaseCluster().getRegions(NAME).get(0).getRegionInfo();
166    AssignmentManager am = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager();
167
168    HRegionServer rs1 = UTIL.getRSForFirstRegionInTable(NAME);
169    HRegionServer rs2 = UTIL.getOtherRegionServer(rs1);
170
171    ARRIVE = new CountDownLatch(1);
172    RESUME = new CountDownLatch(1);
173    FINISH = new CountDownLatch(1);
174    ABORT = new CountDownLatch(1);
175    am.moveAsync(new RegionPlan(region, rs1.getServerName(), rs2.getServerName()));
176
177    ARRIVE.await();
178    ARRIVE = null;
179    HMaster master = UTIL.getMiniHBaseCluster().getMaster();
180    master.getZooKeeper().close();
181    UTIL.waitFor(30000, () -> {
182      for (MasterThread mt : UTIL.getMiniHBaseCluster().getMasterThreads()) {
183        if (mt.getMaster() != master && mt.getMaster().isActiveMaster()) {
184          return mt.getMaster().isInitialized();
185        }
186      }
187      return false;
188    });
189    ProcedureExecutor<MasterProcedureEnv> procExec =
190      UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
191    UTIL.waitFor(30000,
192      () -> procExec.getProcedures().stream().filter(p -> p instanceof OpenRegionProcedure)
193        .map(p -> (OpenRegionProcedure) p).anyMatch(p -> p.region.getTable().equals(NAME)));
194    OpenRegionProcedure proc = procExec.getProcedures().stream()
195      .filter(p -> p instanceof OpenRegionProcedure).map(p -> (OpenRegionProcedure) p)
196      .filter(p -> p.region.getTable().equals(NAME)).findFirst().get();
197    // wait a bit to let the OpenRegionProcedure send out the request
198    Thread.sleep(2000);
199    RESUME.countDown();
200    if (!FINISH.await(15, TimeUnit.SECONDS)) {
201      LOG.info("Wait reportRegionStateTransition to finish timed out, this is possible if"
202        + " we update the procedure store, as the WALProcedureStore"
203        + " will retry forever to roll the writer if it is not closed");
204    }
205    FINISH = null;
206    // if the reportRegionTransition is finished, wait a bit to let it return the data to RS
207    Thread.sleep(2000);
208    ABORT.countDown();
209
210    UTIL.waitFor(30000, () -> procExec.isFinished(proc.getProcId()));
211    UTIL.waitFor(30000, () -> procExec.isFinished(proc.getParentProcId()));
212  }
213}