001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.assignment;
019
020import java.io.IOException;
021import java.util.concurrent.CountDownLatch;
022import java.util.concurrent.TimeUnit;
023import org.apache.hadoop.conf.Configuration;
024import org.apache.hadoop.hbase.HBaseClassTestRule;
025import org.apache.hadoop.hbase.HBaseTestingUtil;
026import org.apache.hadoop.hbase.HConstants;
027import org.apache.hadoop.hbase.PleaseHoldException;
028import org.apache.hadoop.hbase.StartTestingClusterOption;
029import org.apache.hadoop.hbase.TableName;
030import org.apache.hadoop.hbase.client.RegionInfo;
031import org.apache.hadoop.hbase.master.HMaster;
032import org.apache.hadoop.hbase.master.MasterServices;
033import org.apache.hadoop.hbase.master.RegionPlan;
034import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
035import org.apache.hadoop.hbase.master.region.MasterRegion;
036import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
037import org.apache.hadoop.hbase.regionserver.HRegionServer;
038import org.apache.hadoop.hbase.testclassification.MasterTests;
039import org.apache.hadoop.hbase.testclassification.MediumTests;
040import org.apache.hadoop.hbase.util.Bytes;
041import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
042import org.apache.zookeeper.KeeperException;
043import org.junit.AfterClass;
044import org.junit.BeforeClass;
045import org.junit.ClassRule;
046import org.junit.Test;
047import org.junit.experimental.categories.Category;
048import org.slf4j.Logger;
049import org.slf4j.LoggerFactory;
050
051import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
052import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
053import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
054import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
055import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse;
056
057/**
058 * See HBASE-22060 and HBASE-22074 for more details.
059 */
060@Category({ MasterTests.class, MediumTests.class })
061public class TestOpenRegionProcedureHang {
062
063  @ClassRule
064  public static final HBaseClassTestRule CLASS_RULE =
065    HBaseClassTestRule.forClass(TestOpenRegionProcedureHang.class);
066
067  private static final Logger LOG = LoggerFactory.getLogger(TestOpenRegionProcedureHang.class);
068
069  private static CountDownLatch ARRIVE;
070  private static CountDownLatch RESUME;
071
072  private static CountDownLatch FINISH;
073
074  private static CountDownLatch ABORT;
075
076  private static final class AssignmentManagerForTest extends AssignmentManager {
077
078    public AssignmentManagerForTest(MasterServices master,MasterRegion masterRegion) {
079      super(master, masterRegion);
080    }
081
082    @Override
083    public ReportRegionStateTransitionResponse reportRegionStateTransition(
084        ReportRegionStateTransitionRequest req) throws PleaseHoldException {
085      RegionStateTransition transition = req.getTransition(0);
086      if (transition.getTransitionCode() == TransitionCode.OPENED &&
087        ProtobufUtil.toTableName(transition.getRegionInfo(0).getTableName()).equals(NAME) &&
088        ARRIVE != null) {
089        ARRIVE.countDown();
090        try {
091          RESUME.await();
092          RESUME = null;
093        } catch (InterruptedException e) {
094          throw new RuntimeException(e);
095        }
096        try {
097          return super.reportRegionStateTransition(req);
098        } finally {
099          FINISH.countDown();
100        }
101      } else {
102        return super.reportRegionStateTransition(req);
103      }
104    }
105  }
106
107  public static final class HMasterForTest extends HMaster {
108
109    public HMasterForTest(Configuration conf) throws IOException {
110      super(conf);
111    }
112
113    @Override
114    protected AssignmentManager createAssignmentManager(MasterServices master,
115      MasterRegion masterRegion) {
116      return new AssignmentManagerForTest(master, masterRegion);
117    }
118
119    @Override
120    public void abort(String reason, Throwable cause) {
121      // hang here so we can finish the reportRegionStateTransition call, which is the most
122      // important part to reproduce the bug
123      if (ABORT != null) {
124        try {
125          ABORT.await();
126          ABORT = null;
127        } catch (InterruptedException e) {
128          throw new RuntimeException(e);
129        }
130      }
131      super.abort(reason, cause);
132    }
133  }
134
135  private static final HBaseTestingUtil UTIL = new HBaseTestingUtil();
136
137  private static TableName NAME = TableName.valueOf("Open");
138
139  private static byte[] CF = Bytes.toBytes("cf");
140
141  @BeforeClass
142  public static void setUp() throws Exception {
143    Configuration conf = UTIL.getConfiguration();
144    conf.setClass(HConstants.MASTER_IMPL, HMasterForTest.class, HMaster.class);
145
146    // make sure we do not timeout when caling reportRegionStateTransition
147    conf.setInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 10 * 60 * 1000);
148    conf.setInt(HConstants.HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY, 10 * 60 * 1000);
149    UTIL.startMiniCluster(
150      StartTestingClusterOption.builder().numMasters(2).numRegionServers(3).build());
151    UTIL.createTable(NAME, CF);
152    UTIL.waitTableAvailable(NAME);
153    UTIL.getAdmin().balancerSwitch(false, true);
154  }
155
156  @AfterClass
157  public static void tearDown() throws Exception {
158    UTIL.shutdownMiniCluster();
159  }
160
161  @Test
162  public void test() throws InterruptedException, KeeperException, IOException {
163    RegionInfo region = UTIL.getMiniHBaseCluster().getRegions(NAME).get(0).getRegionInfo();
164    AssignmentManager am = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager();
165
166    HRegionServer rs1 = UTIL.getRSForFirstRegionInTable(NAME);
167    HRegionServer rs2 = UTIL.getOtherRegionServer(rs1);
168
169    ARRIVE = new CountDownLatch(1);
170    RESUME = new CountDownLatch(1);
171    FINISH = new CountDownLatch(1);
172    ABORT = new CountDownLatch(1);
173    am.moveAsync(new RegionPlan(region, rs1.getServerName(), rs2.getServerName()));
174
175    ARRIVE.await();
176    ARRIVE = null;
177    HMaster master = UTIL.getMiniHBaseCluster().getMaster();
178    master.getZooKeeper().close();
179    UTIL.waitFor(30000, () -> {
180      for (MasterThread mt : UTIL.getMiniHBaseCluster().getMasterThreads()) {
181        if (mt.getMaster() != master && mt.getMaster().isActiveMaster()) {
182          return mt.getMaster().isInitialized();
183        }
184      }
185      return false;
186    });
187    ProcedureExecutor<MasterProcedureEnv> procExec =
188      UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
189    UTIL.waitFor(30000,
190      () -> procExec.getProcedures().stream().filter(p -> p instanceof OpenRegionProcedure)
191        .map(p -> (OpenRegionProcedure) p).anyMatch(p -> p.region.getTable().equals(NAME)));
192    OpenRegionProcedure proc = procExec.getProcedures().stream()
193      .filter(p -> p instanceof OpenRegionProcedure).map(p -> (OpenRegionProcedure) p)
194      .filter(p -> p.region.getTable().equals(NAME)).findFirst().get();
195    // wait a bit to let the OpenRegionProcedure send out the request
196    Thread.sleep(2000);
197    RESUME.countDown();
198    if (!FINISH.await(15, TimeUnit.SECONDS)) {
199      LOG.info("Wait reportRegionStateTransition to finish timed out, this is possible if" +
200        " we update the procedure store, as the WALProcedureStore" +
201        " will retry forever to roll the writer if it is not closed");
202    }
203    FINISH = null;
204    // if the reportRegionTransition is finished, wait a bit to let it return the data to RS
205    Thread.sleep(2000);
206    ABORT.countDown();
207
208    UTIL.waitFor(30000, () -> procExec.isFinished(proc.getProcId()));
209    UTIL.waitFor(30000, () -> procExec.isFinished(proc.getParentProcId()));
210  }
211}