001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.procedure;
019
020import java.io.IOException;
021import java.util.ArrayList;
022import java.util.List;
023import java.util.Optional;
024import java.util.concurrent.Future;
025import java.util.concurrent.Semaphore;
026import java.util.concurrent.TimeUnit;
027import org.apache.hadoop.hbase.Cell;
028import org.apache.hadoop.hbase.HBaseClassTestRule;
029import org.apache.hadoop.hbase.HBaseTestingUtil;
030import org.apache.hadoop.hbase.TableName;
031import org.apache.hadoop.hbase.Waiter.ExplainingPredicate;
032import org.apache.hadoop.hbase.client.AsyncAdmin;
033import org.apache.hadoop.hbase.client.BalanceRequest;
034import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
035import org.apache.hadoop.hbase.client.Durability;
036import org.apache.hadoop.hbase.client.Get;
037import org.apache.hadoop.hbase.client.Put;
038import org.apache.hadoop.hbase.client.Table;
039import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
040import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
041import org.apache.hadoop.hbase.coprocessor.ObserverContext;
042import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor;
043import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
044import org.apache.hadoop.hbase.coprocessor.RegionObserver;
045import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
046import org.apache.hadoop.hbase.regionserver.HRegionServer;
047import org.apache.hadoop.hbase.testclassification.LargeTests;
048import org.apache.hadoop.hbase.testclassification.MasterTests;
049import org.apache.hadoop.hbase.util.Bytes;
050import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
051import org.apache.hadoop.hbase.wal.WALEdit;
052import org.junit.AfterClass;
053import org.junit.BeforeClass;
054import org.junit.ClassRule;
055import org.junit.Test;
056import org.junit.experimental.categories.Category;
057
058/**
059 * Test to ensure that the priority for procedures and stuck checker can partially solve the problem
060 * describe in HBASE-19976, that is, RecoverMetaProcedure can finally be executed within a certain
061 * period of time.
062 */
063@Category({ MasterTests.class, LargeTests.class })
064public class TestProcedurePriority {
065
066  @ClassRule
067  public static final HBaseClassTestRule CLASS_RULE =
068    HBaseClassTestRule.forClass(TestProcedurePriority.class);
069
070  private static final HBaseTestingUtil UTIL = new HBaseTestingUtil();
071
072  private static String TABLE_NAME_PREFIX = "TestProcedurePriority-";
073
074  private static byte[] CF = Bytes.toBytes("cf");
075
076  private static byte[] CQ = Bytes.toBytes("cq");
077
078  private static int CORE_POOL_SIZE;
079
080  private static int TABLE_COUNT;
081
082  private static volatile boolean FAIL = false;
083
084  public static final class MyCP implements RegionObserver, RegionCoprocessor {
085
086    @Override
087    public Optional<RegionObserver> getRegionObserver() {
088      return Optional.of(this);
089    }
090
091    @Override
092    public void preGetOp(ObserverContext<RegionCoprocessorEnvironment> c, Get get,
093      List<Cell> result) throws IOException {
094      if (FAIL && c.getEnvironment().getRegionInfo().isMetaRegion()) {
095        throw new IOException("Inject error");
096      }
097    }
098
099    @Override
100    public void prePut(ObserverContext<RegionCoprocessorEnvironment> c, Put put, WALEdit edit,
101      Durability durability) throws IOException {
102      if (FAIL && c.getEnvironment().getRegionInfo().isMetaRegion()) {
103        throw new IOException("Inject error");
104      }
105    }
106  }
107
108  @BeforeClass
109  public static void setUp() throws Exception {
110    UTIL.getConfiguration().setLong(ProcedureExecutor.WORKER_KEEP_ALIVE_TIME_CONF_KEY, 5000);
111    UTIL.getConfiguration().setInt(MasterProcedureConstants.MASTER_PROCEDURE_THREADS, 4);
112    UTIL.getConfiguration().set(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY, MyCP.class.getName());
113    UTIL.startMiniCluster(3);
114    CORE_POOL_SIZE =
115      UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor().getCorePoolSize();
116    TABLE_COUNT = 50 * CORE_POOL_SIZE;
117    List<Future<?>> futures = new ArrayList<>();
118    AsyncAdmin admin = UTIL.getAsyncConnection().getAdmin();
119    Semaphore concurrency = new Semaphore(10);
120    for (int i = 0; i < TABLE_COUNT; i++) {
121      concurrency.acquire();
122      futures.add(admin
123        .createTable(TableDescriptorBuilder.newBuilder(TableName.valueOf(TABLE_NAME_PREFIX + i))
124          .setColumnFamily(ColumnFamilyDescriptorBuilder.of(CF)).build())
125        .whenComplete((r, e) -> concurrency.release()));
126    }
127    for (Future<?> future : futures) {
128      future.get(3, TimeUnit.MINUTES);
129    }
130    UTIL.getAdmin().balance(BalanceRequest.newBuilder().setIgnoreRegionsInTransition(true).build());
131    UTIL.waitUntilNoRegionsInTransition();
132  }
133
134  @AfterClass
135  public static void tearDown() throws Exception {
136    UTIL.shutdownMiniCluster();
137  }
138
139  @Test
140  public void test() throws Exception {
141    RegionServerThread rsWithMetaThread = UTIL.getMiniHBaseCluster().getRegionServerThreads()
142      .stream().filter(t -> !t.getRegionServer().getRegions(TableName.META_TABLE_NAME).isEmpty())
143      .findAny().get();
144    HRegionServer rsNoMeta = UTIL.getOtherRegionServer(rsWithMetaThread.getRegionServer());
145    FAIL = true;
146    UTIL.getMiniHBaseCluster().killRegionServer(rsNoMeta.getServerName());
147    // wait until all the worker thread are stuck, which means that the stuck checker will start to
148    // add new worker thread.
149    ProcedureExecutor<?> executor =
150      UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
151    UTIL.waitFor(60000, new ExplainingPredicate<Exception>() {
152
153      @Override
154      public boolean evaluate() throws Exception {
155        return executor.getWorkerThreadCount() > CORE_POOL_SIZE;
156      }
157
158      @Override
159      public String explainFailure() throws Exception {
160        return "Stuck checker does not add new worker thread";
161      }
162    });
163    UTIL.getMiniHBaseCluster().killRegionServer(rsWithMetaThread.getRegionServer().getServerName());
164    rsWithMetaThread.join();
165    FAIL = false;
166    // verify that the cluster is back
167    UTIL.waitUntilNoRegionsInTransition(480000);
168    for (int i = 0; i < TABLE_COUNT; i++) {
169      try (Table table = UTIL.getConnection().getTable(TableName.valueOf(TABLE_NAME_PREFIX + i))) {
170        table.put(new Put(Bytes.toBytes(i)).addColumn(CF, CQ, Bytes.toBytes(i)));
171      }
172    }
173    UTIL.waitFor(60000, new ExplainingPredicate<Exception>() {
174
175      @Override
176      public boolean evaluate() throws Exception {
177        return executor.getWorkerThreadCount() == CORE_POOL_SIZE;
178      }
179
180      @Override
181      public String explainFailure() throws Exception {
182        return "The new workers do not timeout";
183      }
184    });
185  }
186}