001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.assignment;
019
020import java.io.IOException;
021import java.util.List;
022import java.util.concurrent.CompletableFuture;
023import java.util.concurrent.CountDownLatch;
024import java.util.concurrent.Future;
025import org.apache.hadoop.conf.Configuration;
026import org.apache.hadoop.hbase.HBaseClassTestRule;
027import org.apache.hadoop.hbase.HBaseTestingUtil;
028import org.apache.hadoop.hbase.HConstants;
029import org.apache.hadoop.hbase.ServerName;
030import org.apache.hadoop.hbase.TableName;
031import org.apache.hadoop.hbase.client.RegionInfo;
032import org.apache.hadoop.hbase.master.HMaster;
033import org.apache.hadoop.hbase.master.MasterServices;
034import org.apache.hadoop.hbase.master.RegionPlan;
035import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher;
036import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
037import org.apache.hadoop.hbase.master.region.MasterRegion;
038import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
039import org.apache.hadoop.hbase.testclassification.LargeTests;
040import org.apache.hadoop.hbase.testclassification.MasterTests;
041import org.apache.hadoop.hbase.util.Bytes;
042import org.apache.hadoop.hbase.util.FutureUtils;
043import org.apache.zookeeper.KeeperException;
044import org.junit.AfterClass;
045import org.junit.BeforeClass;
046import org.junit.ClassRule;
047import org.junit.Test;
048import org.junit.experimental.categories.Category;
049
050/**
051 * Testcase for HBASE-23594.
052 */
053@Category({ MasterTests.class, LargeTests.class })
054public class TestRaceBetweenSCPAndTRSP {
055
056  @ClassRule
057  public static final HBaseClassTestRule CLASS_RULE =
058    HBaseClassTestRule.forClass(TestRaceBetweenSCPAndTRSP.class);
059
060  private static final HBaseTestingUtil UTIL = new HBaseTestingUtil();
061
062  private static TableName NAME = TableName.valueOf("Race");
063
064  private static byte[] CF = Bytes.toBytes("cf");
065
066  private static CountDownLatch ARRIVE_REGION_OPENING;
067
068  private static CountDownLatch RESUME_REGION_OPENING;
069
070  private static CountDownLatch ARRIVE_GET_REGIONS_ON_SERVER;
071
072  private static CountDownLatch RESUME_GET_REGIONS_ON_SERVER;
073
074  private static final class AssignmentManagerForTest extends AssignmentManager {
075
076    public AssignmentManagerForTest(MasterServices master, MasterRegion masterRegion) {
077      super(master, masterRegion);
078    }
079
080    @Override
081    CompletableFuture<Void> regionOpening(RegionStateNode regionNode) {
082      CompletableFuture<Void> future = super.regionOpening(regionNode);
083      try {
084        // wait until the operation done, then trigger later processing, to make the test more
085        // stable
086        FutureUtils.get(future);
087      } catch (IOException e) {
088      }
089      if (regionNode.getRegionInfo().getTable().equals(NAME) && ARRIVE_REGION_OPENING != null) {
090        ARRIVE_REGION_OPENING.countDown();
091        ARRIVE_REGION_OPENING = null;
092        try {
093          RESUME_REGION_OPENING.await();
094        } catch (InterruptedException e) {
095        }
096      }
097      return future;
098    }
099
100    @Override
101    public List<RegionInfo> getRegionsOnServer(ServerName serverName) {
102      List<RegionInfo> regions = super.getRegionsOnServer(serverName);
103      if (ARRIVE_GET_REGIONS_ON_SERVER != null) {
104        ARRIVE_GET_REGIONS_ON_SERVER.countDown();
105        ARRIVE_GET_REGIONS_ON_SERVER = null;
106        try {
107          RESUME_GET_REGIONS_ON_SERVER.await();
108        } catch (InterruptedException e) {
109        }
110      }
111      return regions;
112    }
113  }
114
115  public static final class HMasterForTest extends HMaster {
116
117    public HMasterForTest(Configuration conf) throws IOException, KeeperException {
118      super(conf);
119    }
120
121    @Override
122    protected AssignmentManager createAssignmentManager(MasterServices master,
123      MasterRegion masterRegion) {
124      return new AssignmentManagerForTest(master, masterRegion);
125    }
126  }
127
128  @BeforeClass
129  public static void setUp() throws Exception {
130    UTIL.getConfiguration().setClass(HConstants.MASTER_IMPL, HMasterForTest.class, HMaster.class);
131    UTIL.startMiniCluster(2);
132    UTIL.createTable(NAME, CF);
133    UTIL.waitTableAvailable(NAME);
134    UTIL.getAdmin().balancerSwitch(false, true);
135  }
136
137  @AfterClass
138  public static void tearDown() throws Exception {
139    UTIL.shutdownMiniCluster();
140  }
141
142  @Test
143  public void test() throws Exception {
144    RegionInfo region = UTIL.getMiniHBaseCluster().getRegions(NAME).get(0).getRegionInfo();
145    AssignmentManager am = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager();
146    ServerName sn = am.getRegionStates().getRegionState(region).getServerName();
147
148    // Assign the CountDownLatches that get nulled in background threads else we NPE checking
149    // the static.
150    ARRIVE_REGION_OPENING = new CountDownLatch(1);
151    CountDownLatch arriveRegionOpening = ARRIVE_REGION_OPENING;
152    RESUME_REGION_OPENING = new CountDownLatch(1);
153    ARRIVE_GET_REGIONS_ON_SERVER = new CountDownLatch(1);
154    CountDownLatch arriveGetRegionsOnServer = ARRIVE_GET_REGIONS_ON_SERVER;
155    RESUME_GET_REGIONS_ON_SERVER = new CountDownLatch(1);
156
157    Future<byte[]> moveFuture = am.moveAsync(new RegionPlan(region, sn, sn));
158    arriveRegionOpening.await();
159
160    // Kill the region server and trigger a SCP
161    UTIL.getMiniHBaseCluster().killRegionServer(sn);
162    // Wait until the SCP reaches the getRegionsOnServer call
163    arriveGetRegionsOnServer.await();
164    RSProcedureDispatcher remoteDispatcher = UTIL.getMiniHBaseCluster().getMaster()
165      .getMasterProcedureExecutor().getEnvironment().getRemoteDispatcher();
166    // this is necessary for making the UT stable, the problem here is that, in
167    // ServerManager.expireServer, we will submit the SCP and then the SCP will be executed in
168    // another thread(the PEWorker), so when we reach the above getRegionsOnServer call in SCP, it
169    // is still possible that the expireServer call has not been finished so the remote dispatcher
170    // still think it can dispatcher the TRSP, in this way we will be in dead lock as the TRSP will
171    // not schedule a new ORP since it relies on SCP to wake it up after everything is OK. This is
172    // not what we want to test in this UT so we need to wait here to prevent this from happening.
173    // See HBASE-27277 for more detailed analysis.
174    UTIL.waitFor(15000, () -> !remoteDispatcher.hasNode(sn));
175
176    // Resume the TRSP, it should be able to finish
177    RESUME_REGION_OPENING.countDown();
178    moveFuture.get();
179
180    ProcedureExecutor<?> procExec =
181      UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
182    long scpProcId =
183      procExec.getProcedures().stream().filter(p -> p instanceof ServerCrashProcedure)
184        .map(p -> (ServerCrashProcedure) p).findAny().get().getProcId();
185    // Resume the SCP and make sure it can finish too
186    RESUME_GET_REGIONS_ON_SERVER.countDown();
187    UTIL.waitFor(60000, () -> procExec.isFinished(scpProcId));
188  }
189}