001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.assignment;
019
020import java.io.IOException;
021import java.util.concurrent.CountDownLatch;
022import org.apache.hadoop.hbase.HBaseClassTestRule;
023import org.apache.hadoop.hbase.HBaseTestingUtility;
024import org.apache.hadoop.hbase.ProcedureTestUtil;
025import org.apache.hadoop.hbase.ServerName;
026import org.apache.hadoop.hbase.TableName;
027import org.apache.hadoop.hbase.client.Put;
028import org.apache.hadoop.hbase.client.RegionInfo;
029import org.apache.hadoop.hbase.client.Table;
030import org.apache.hadoop.hbase.master.HMaster;
031import org.apache.hadoop.hbase.master.ServerManager;
032import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
033import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
034import org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface;
035import org.apache.hadoop.hbase.procedure2.Procedure;
036import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
037import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
038import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
039import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
040import org.apache.hadoop.hbase.regionserver.HRegionServer;
041import org.apache.hadoop.hbase.testclassification.MasterTests;
042import org.apache.hadoop.hbase.testclassification.MediumTests;
043import org.apache.hadoop.hbase.util.Bytes;
044import org.junit.AfterClass;
045import org.junit.BeforeClass;
046import org.junit.ClassRule;
047import org.junit.Test;
048import org.junit.experimental.categories.Category;
049import org.slf4j.Logger;
050import org.slf4j.LoggerFactory;
051
052/**
053 * Confirm that we will do backoff when retrying on closing a region, to avoid consuming all the
054 * CPUs.
055 */
056@Category({ MasterTests.class, MediumTests.class })
057public class TestCloseRegionWhileRSCrash {
058  private static final Logger LOG = LoggerFactory.getLogger(TestCloseRegionWhileRSCrash.class);
059
060  @ClassRule
061  public static final HBaseClassTestRule CLASS_RULE =
062    HBaseClassTestRule.forClass(TestCloseRegionWhileRSCrash.class);
063
064  private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
065
066  private static TableName TABLE_NAME = TableName.valueOf("Backoff");
067
068  private static byte[] CF = Bytes.toBytes("cf");
069
070  private static CountDownLatch ARRIVE = new CountDownLatch(1);
071
072  private static CountDownLatch RESUME = new CountDownLatch(1);
073
074  public static final class DummyServerProcedure extends Procedure<MasterProcedureEnv>
075    implements ServerProcedureInterface {
076
077    private ServerName serverName;
078
079    public DummyServerProcedure() {
080    }
081
082    public DummyServerProcedure(ServerName serverName) {
083      this.serverName = serverName;
084    }
085
086    @Override
087    public ServerName getServerName() {
088      return serverName;
089    }
090
091    @Override
092    public boolean hasMetaTableRegion() {
093      return false;
094    }
095
096    @Override
097    public ServerOperationType getServerOperationType() {
098      return ServerOperationType.CRASH_HANDLER;
099    }
100
101    @Override
102    protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env)
103      throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException {
104      ARRIVE.countDown();
105      RESUME.await();
106      return null;
107    }
108
109    @Override
110    protected LockState acquireLock(final MasterProcedureEnv env) {
111      if (env.getProcedureScheduler().waitServerExclusiveLock(this, getServerName())) {
112        return LockState.LOCK_EVENT_WAIT;
113      }
114      return LockState.LOCK_ACQUIRED;
115    }
116
117    @Override
118    protected void releaseLock(final MasterProcedureEnv env) {
119      env.getProcedureScheduler().wakeServerExclusiveLock(this, getServerName());
120    }
121
122    @Override
123    protected boolean holdLock(MasterProcedureEnv env) {
124      return true;
125    }
126
127    @Override
128    protected void rollback(MasterProcedureEnv env) throws IOException, InterruptedException {
129    }
130
131    @Override
132    protected boolean abort(MasterProcedureEnv env) {
133      return false;
134    }
135
136    @Override
137    protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {
138
139    }
140
141    @Override
142    protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
143    }
144  }
145
146  @BeforeClass
147  public static void setUp() throws Exception {
148    UTIL.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
149    UTIL.startMiniCluster(3);
150    UTIL.createTable(TABLE_NAME, CF);
151    UTIL.getAdmin().balancerSwitch(false, true);
152    HRegionServer srcRs = UTIL.getRSForFirstRegionInTable(TABLE_NAME);
153    if (!srcRs.getRegions(TableName.META_TABLE_NAME).isEmpty()) {
154      RegionInfo metaRegion = srcRs.getRegions(TableName.META_TABLE_NAME).get(0).getRegionInfo();
155      HRegionServer dstRs = UTIL.getOtherRegionServer(srcRs);
156      UTIL.getAdmin().move(metaRegion.getEncodedNameAsBytes(), dstRs.getServerName());
157      UTIL.waitFor(30000, () -> !dstRs.getRegions(TableName.META_TABLE_NAME).isEmpty());
158    }
159  }
160
161  @AfterClass
162  public static void tearDown() throws Exception {
163    UTIL.shutdownMiniCluster();
164  }
165
166  @Test
167  public void testRetryBackoff() throws IOException, InterruptedException {
168    HRegionServer srcRs = UTIL.getRSForFirstRegionInTable(TABLE_NAME);
169    RegionInfo region = srcRs.getRegions(TABLE_NAME).get(0).getRegionInfo();
170    HRegionServer dstRs = UTIL.getOtherRegionServer(srcRs);
171    ProcedureExecutor<MasterProcedureEnv> procExec =
172      UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
173    procExec.submitProcedure(new DummyServerProcedure(srcRs.getServerName()));
174    ARRIVE.await();
175    UTIL.getMiniHBaseCluster().killRegionServer(srcRs.getServerName());
176    UTIL.waitFor(30000,
177      () -> procExec.getProcedures().stream().anyMatch(p -> p instanceof ServerCrashProcedure));
178    Thread t = new Thread(() -> {
179      try {
180        UTIL.getAdmin().move(region.getEncodedNameAsBytes(), dstRs.getServerName());
181      } catch (IOException e) {
182        LOG.info("Failed move of {}", region.getRegionNameAsString(), e);
183      }
184    });
185    t.start();
186    // wait until we enter the WAITING_TIMEOUT state
187    ProcedureTestUtil.waitUntilProcedureWaitingTimeout(UTIL, TransitRegionStateProcedure.class,
188      30000);
189    // wait until the timeout value increase three times
190    ProcedureTestUtil.waitUntilProcedureTimeoutIncrease(UTIL, TransitRegionStateProcedure.class, 3);
191    // close connection to make sure that we can not finish the TRSP
192    final HMaster master = UTIL.getMiniHBaseCluster().getMaster();
193    master.getConnection().close();
194    RESUME.countDown();
195    UTIL.waitFor(30000, () -> !master.isAlive());
196    // here we start a new master
197    HMaster master2 = UTIL.getMiniHBaseCluster().startMaster().getMaster();
198    LOG.info("Master2 {}, joining move thread", master2.getServerName());
199    t.join();
200    // Make sure that the region is online, it may not on the original target server, as we will set
201    // forceNewPlan to true if there is a server crash
202    try (Table table = UTIL.getConnection().getTable(TABLE_NAME)) {
203      table.put(new Put(Bytes.toBytes(1)).addColumn(CF, Bytes.toBytes("cq"), Bytes.toBytes(1)));
204    }
205    // Make sure that the region is online, it may not be on the original target server, as we will
206    // set forceNewPlan to true if there is a server crash.
207    try (Table table = UTIL.getConnection().getTable(TABLE_NAME)) {
208      table.put(new Put(Bytes.toBytes(1)).addColumn(CF, Bytes.toBytes("cq"), Bytes.toBytes(1)));
209    }
210  }
211}