001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.assignment;
019
020import java.io.IOException;
021import java.util.concurrent.CountDownLatch;
022import java.util.concurrent.TimeUnit;
023import org.apache.hadoop.conf.Configuration;
024import org.apache.hadoop.hbase.HBaseClassTestRule;
025import org.apache.hadoop.hbase.HBaseTestingUtility;
026import org.apache.hadoop.hbase.HConstants;
027import org.apache.hadoop.hbase.PleaseHoldException;
028import org.apache.hadoop.hbase.StartMiniClusterOption;
029import org.apache.hadoop.hbase.TableName;
030import org.apache.hadoop.hbase.client.RegionInfo;
031import org.apache.hadoop.hbase.master.HMaster;
032import org.apache.hadoop.hbase.master.MasterServices;
033import org.apache.hadoop.hbase.master.RegionPlan;
034import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
035import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
036import org.apache.hadoop.hbase.regionserver.HRegionServer;
037import org.apache.hadoop.hbase.testclassification.MasterTests;
038import org.apache.hadoop.hbase.testclassification.MediumTests;
039import org.apache.hadoop.hbase.util.Bytes;
040import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
041import org.apache.zookeeper.KeeperException;
042import org.junit.AfterClass;
043import org.junit.BeforeClass;
044import org.junit.ClassRule;
045import org.junit.Test;
046import org.junit.experimental.categories.Category;
047import org.slf4j.Logger;
048import org.slf4j.LoggerFactory;
049
050import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
051import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
052import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
053import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
054import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse;
055
056/**
057 * See HBASE-22060 and HBASE-22074 for more details.
058 */
059@Category({ MasterTests.class, MediumTests.class })
060public class TestOpenRegionProcedureHang {
061
062  @ClassRule
063  public static final HBaseClassTestRule CLASS_RULE =
064    HBaseClassTestRule.forClass(TestOpenRegionProcedureHang.class);
065
066  private static final Logger LOG = LoggerFactory.getLogger(TestOpenRegionProcedureHang.class);
067
068  private static CountDownLatch ARRIVE;
069  private static CountDownLatch RESUME;
070
071  private static CountDownLatch FINISH;
072
073  private static CountDownLatch ABORT;
074
075  private static final class AssignmentManagerForTest extends AssignmentManager {
076
077    public AssignmentManagerForTest(MasterServices master) {
078      super(master);
079    }
080
081    @Override
082    public ReportRegionStateTransitionResponse reportRegionStateTransition(
083        ReportRegionStateTransitionRequest req) throws PleaseHoldException {
084      RegionStateTransition transition = req.getTransition(0);
085      if (transition.getTransitionCode() == TransitionCode.OPENED &&
086        ProtobufUtil.toTableName(transition.getRegionInfo(0).getTableName()).equals(NAME) &&
087        ARRIVE != null) {
088        ARRIVE.countDown();
089        try {
090          RESUME.await();
091          RESUME = null;
092        } catch (InterruptedException e) {
093          throw new RuntimeException(e);
094        }
095        try {
096          return super.reportRegionStateTransition(req);
097        } finally {
098          FINISH.countDown();
099        }
100      } else {
101        return super.reportRegionStateTransition(req);
102      }
103    }
104  }
105
106  public static final class HMasterForTest extends HMaster {
107
108    public HMasterForTest(Configuration conf) throws IOException {
109      super(conf);
110    }
111
112    @Override
113    protected AssignmentManager createAssignmentManager(MasterServices master) {
114      return new AssignmentManagerForTest(master);
115    }
116
117    @Override
118    public void abort(String reason, Throwable cause) {
119      // hang here so we can finish the reportRegionStateTransition call, which is the most
120      // important part to reproduce the bug
121      if (ABORT != null) {
122        try {
123          ABORT.await();
124          ABORT = null;
125        } catch (InterruptedException e) {
126          throw new RuntimeException(e);
127        }
128      }
129      super.abort(reason, cause);
130    }
131  }
132
133  private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
134
135  private static TableName NAME = TableName.valueOf("Open");
136
137  private static byte[] CF = Bytes.toBytes("cf");
138
139  @BeforeClass
140  public static void setUp() throws Exception {
141    Configuration conf = UTIL.getConfiguration();
142    conf.setClass(HConstants.MASTER_IMPL, HMasterForTest.class, HMaster.class);
143
144    // make sure we do not timeout when caling reportRegionStateTransition
145    conf.setInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 10 * 60 * 1000);
146    conf.setInt(HConstants.HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY, 10 * 60 * 1000);
147    UTIL
148      .startMiniCluster(StartMiniClusterOption.builder().numMasters(2).numRegionServers(3).build());
149    UTIL.createTable(NAME, CF);
150    UTIL.waitTableAvailable(NAME);
151    UTIL.getAdmin().balancerSwitch(false, true);
152  }
153
154  @AfterClass
155  public static void tearDown() throws Exception {
156    UTIL.shutdownMiniCluster();
157  }
158
159  @Test
160  public void test() throws InterruptedException, KeeperException, IOException {
161    RegionInfo region = UTIL.getMiniHBaseCluster().getRegions(NAME).get(0).getRegionInfo();
162    AssignmentManager am = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager();
163
164    HRegionServer rs1 = UTIL.getRSForFirstRegionInTable(NAME);
165    HRegionServer rs2 = UTIL.getOtherRegionServer(rs1);
166
167    ARRIVE = new CountDownLatch(1);
168    RESUME = new CountDownLatch(1);
169    FINISH = new CountDownLatch(1);
170    ABORT = new CountDownLatch(1);
171    am.moveAsync(new RegionPlan(region, rs1.getServerName(), rs2.getServerName()));
172
173    ARRIVE.await();
174    ARRIVE = null;
175    HMaster master = UTIL.getMiniHBaseCluster().getMaster();
176    master.getZooKeeper().close();
177    UTIL.waitFor(30000, () -> {
178      for (MasterThread mt : UTIL.getMiniHBaseCluster().getMasterThreads()) {
179        if (mt.getMaster() != master && mt.getMaster().isActiveMaster()) {
180          return mt.getMaster().isInitialized();
181        }
182      }
183      return false;
184    });
185    ProcedureExecutor<MasterProcedureEnv> procExec =
186      UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
187    UTIL.waitFor(30000,
188      () -> procExec.getProcedures().stream().filter(p -> p instanceof OpenRegionProcedure)
189        .map(p -> (OpenRegionProcedure) p).anyMatch(p -> p.region.getTable().equals(NAME)));
190    OpenRegionProcedure proc = procExec.getProcedures().stream()
191      .filter(p -> p instanceof OpenRegionProcedure).map(p -> (OpenRegionProcedure) p)
192      .filter(p -> p.region.getTable().equals(NAME)).findFirst().get();
193    // wait a bit to let the OpenRegionProcedure send out the request
194    Thread.sleep(2000);
195    RESUME.countDown();
196    if (!FINISH.await(15, TimeUnit.SECONDS)) {
197      LOG.info("Wait reportRegionStateTransition to finish timed out, this is possible if" +
198        " we update the procedure store, as the WALProcedureStore" +
199        " will retry forever to roll the writer if it is not closed");
200    }
201    FINISH = null;
202    // if the reportRegionTransition is finished, wait a bit to let it return the data to RS
203    Thread.sleep(2000);
204    ABORT.countDown();
205
206    UTIL.waitFor(30000, () -> procExec.isFinished(proc.getProcId()));
207    UTIL.waitFor(30000, () -> procExec.isFinished(proc.getParentProcId()));
208  }
209}