001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.assignment;
019
020import java.io.IOException;
021import java.util.concurrent.CountDownLatch;
022import org.apache.hadoop.hbase.HBaseTestingUtil;
023import org.apache.hadoop.hbase.ProcedureTestUtil;
024import org.apache.hadoop.hbase.ServerName;
025import org.apache.hadoop.hbase.TableName;
026import org.apache.hadoop.hbase.client.Put;
027import org.apache.hadoop.hbase.client.RegionInfo;
028import org.apache.hadoop.hbase.client.Table;
029import org.apache.hadoop.hbase.master.HMaster;
030import org.apache.hadoop.hbase.master.ServerManager;
031import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
032import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
033import org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface;
034import org.apache.hadoop.hbase.procedure2.Procedure;
035import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
036import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
037import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
038import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
039import org.apache.hadoop.hbase.regionserver.HRegionServer;
040import org.apache.hadoop.hbase.testclassification.LargeTests;
041import org.apache.hadoop.hbase.testclassification.MasterTests;
042import org.apache.hadoop.hbase.util.Bytes;
043import org.junit.jupiter.api.AfterAll;
044import org.junit.jupiter.api.BeforeAll;
045import org.junit.jupiter.api.Tag;
046import org.junit.jupiter.api.Test;
047import org.slf4j.Logger;
048import org.slf4j.LoggerFactory;
049
050/**
051 * Confirm that we will do backoff when retrying on closing a region, to avoid consuming all the
052 * CPUs.
053 */
054@Tag(MasterTests.TAG)
055@Tag(LargeTests.TAG)
056public class TestCloseRegionWhileRSCrash {
057  private static final Logger LOG = LoggerFactory.getLogger(TestCloseRegionWhileRSCrash.class);
058
059  private static final HBaseTestingUtil UTIL = new HBaseTestingUtil();
060
061  private static TableName TABLE_NAME = TableName.valueOf("Backoff");
062
063  private static byte[] CF = Bytes.toBytes("cf");
064
065  private static CountDownLatch ARRIVE = new CountDownLatch(1);
066
067  private static CountDownLatch RESUME = new CountDownLatch(1);
068
069  public static final class DummyServerProcedure extends Procedure<MasterProcedureEnv>
070    implements ServerProcedureInterface {
071
072    private ServerName serverName;
073
074    public DummyServerProcedure() {
075    }
076
077    public DummyServerProcedure(ServerName serverName) {
078      this.serverName = serverName;
079    }
080
081    @Override
082    public ServerName getServerName() {
083      return serverName;
084    }
085
086    @Override
087    public boolean hasMetaTableRegion() {
088      return false;
089    }
090
091    @Override
092    public ServerOperationType getServerOperationType() {
093      return ServerOperationType.CRASH_HANDLER;
094    }
095
096    @Override
097    protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env)
098      throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException {
099      ARRIVE.countDown();
100      RESUME.await();
101      return null;
102    }
103
104    @Override
105    protected LockState acquireLock(final MasterProcedureEnv env) {
106      if (env.getProcedureScheduler().waitServerExclusiveLock(this, getServerName())) {
107        return LockState.LOCK_EVENT_WAIT;
108      }
109      return LockState.LOCK_ACQUIRED;
110    }
111
112    @Override
113    protected void releaseLock(final MasterProcedureEnv env) {
114      env.getProcedureScheduler().wakeServerExclusiveLock(this, getServerName());
115    }
116
117    @Override
118    protected boolean holdLock(MasterProcedureEnv env) {
119      return true;
120    }
121
122    @Override
123    protected void rollback(MasterProcedureEnv env) throws IOException, InterruptedException {
124    }
125
126    @Override
127    protected boolean abort(MasterProcedureEnv env) {
128      return false;
129    }
130
131    @Override
132    protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {
133
134    }
135
136    @Override
137    protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
138    }
139  }
140
141  @BeforeAll
142  public static void setUp() throws Exception {
143    UTIL.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
144    UTIL.startMiniCluster(3);
145    UTIL.createTable(TABLE_NAME, CF);
146    UTIL.getAdmin().balancerSwitch(false, true);
147    HRegionServer srcRs = UTIL.getRSForFirstRegionInTable(TABLE_NAME);
148    if (!srcRs.getRegions(TableName.META_TABLE_NAME).isEmpty()) {
149      RegionInfo metaRegion = srcRs.getRegions(TableName.META_TABLE_NAME).get(0).getRegionInfo();
150      HRegionServer dstRs = UTIL.getOtherRegionServer(srcRs);
151      UTIL.getAdmin().move(metaRegion.getEncodedNameAsBytes(), dstRs.getServerName());
152      UTIL.waitFor(30000, () -> !dstRs.getRegions(TableName.META_TABLE_NAME).isEmpty());
153    }
154  }
155
156  @AfterAll
157  public static void tearDown() throws Exception {
158    UTIL.shutdownMiniCluster();
159  }
160
161  @Test
162  public void testRetryBackoff() throws IOException, InterruptedException {
163    HRegionServer srcRs = UTIL.getRSForFirstRegionInTable(TABLE_NAME);
164    RegionInfo region = srcRs.getRegions(TABLE_NAME).get(0).getRegionInfo();
165    HRegionServer dstRs = UTIL.getOtherRegionServer(srcRs);
166    ProcedureExecutor<MasterProcedureEnv> procExec =
167      UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
168    procExec.submitProcedure(new DummyServerProcedure(srcRs.getServerName()));
169    ARRIVE.await();
170    UTIL.getMiniHBaseCluster().killRegionServer(srcRs.getServerName());
171    UTIL.waitFor(30000,
172      () -> procExec.getProcedures().stream().anyMatch(p -> p instanceof ServerCrashProcedure));
173    Thread t = new Thread(() -> {
174      try {
175        UTIL.getAdmin().move(region.getEncodedNameAsBytes(), dstRs.getServerName());
176      } catch (IOException e) {
177        LOG.info("Failed move of {}", region.getRegionNameAsString(), e);
178      }
179    });
180    t.start();
181    // wait until we enter the WAITING_TIMEOUT state
182    ProcedureTestUtil.waitUntilProcedureWaitingTimeout(UTIL, TransitRegionStateProcedure.class,
183      30000);
184    // wait until the timeout value increase three times
185    ProcedureTestUtil.waitUntilProcedureTimeoutIncrease(UTIL, TransitRegionStateProcedure.class, 3);
186    // close connection to make sure that we can not finish the TRSP
187    final HMaster master = UTIL.getMiniHBaseCluster().getMaster();
188    master.getConnection().close();
189    RESUME.countDown();
190    UTIL.waitFor(30000, () -> !master.isAlive());
191    // here we start a new master
192    HMaster master2 = UTIL.getMiniHBaseCluster().startMaster().getMaster();
193    LOG.info("Master2 {}, joining move thread", master2.getServerName());
194    t.join();
195    // Make sure that the region is online, it may not on the original target server, as we will set
196    // forceNewPlan to true if there is a server crash
197    try (Table table = UTIL.getConnection().getTable(TABLE_NAME)) {
198      table.put(new Put(Bytes.toBytes(1)).addColumn(CF, Bytes.toBytes("cq"), Bytes.toBytes(1)));
199    }
200    // Make sure that the region is online, it may not be on the original target server, as we will
201    // set forceNewPlan to true if there is a server crash.
202    try (Table table = UTIL.getConnection().getTable(TABLE_NAME)) {
203      table.put(new Put(Bytes.toBytes(1)).addColumn(CF, Bytes.toBytes("cq"), Bytes.toBytes(1)));
204    }
205  }
206}