001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase;
019
020import static org.junit.jupiter.api.Assertions.assertEquals;
021import static org.junit.jupiter.api.Assertions.assertTrue;
022import static org.junit.jupiter.api.Assertions.fail;
023
024import java.io.IOException;
025import java.io.InterruptedIOException;
026import java.util.ArrayList;
027import java.util.Arrays;
028import java.util.List;
029import java.util.stream.Stream;
030import org.apache.hadoop.hbase.client.Admin;
031import org.apache.hadoop.hbase.client.BalanceResponse;
032import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
033import org.apache.hadoop.hbase.client.Connection;
034import org.apache.hadoop.hbase.client.ConnectionFactory;
035import org.apache.hadoop.hbase.client.RegionInfo;
036import org.apache.hadoop.hbase.client.RegionLocator;
037import org.apache.hadoop.hbase.client.TableDescriptor;
038import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
039import org.apache.hadoop.hbase.regionserver.HRegionServer;
040import org.apache.hadoop.hbase.testclassification.FlakeyTests;
041import org.apache.hadoop.hbase.testclassification.LargeTests;
042import org.apache.hadoop.hbase.util.Bytes;
043import org.apache.hadoop.hbase.util.JVMClusterUtil;
044import org.apache.hadoop.hbase.util.Threads;
045import org.junit.jupiter.api.AfterEach;
046import org.junit.jupiter.api.BeforeEach;
047import org.junit.jupiter.api.Tag;
048import org.junit.jupiter.api.TestTemplate;
049import org.junit.jupiter.params.provider.Arguments;
050import org.slf4j.Logger;
051import org.slf4j.LoggerFactory;
052
053import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
054
055/**
056 * Test whether region re-balancing works. (HBASE-71) The test only works for cluster wide
057 * balancing, not per table wide. Increase the margin a little to make StochasticLoadBalancer result
058 * acceptable.
059 */
060@Tag(FlakeyTests.TAG)
061@Tag(LargeTests.TAG)
062@HBaseParameterizedTestTemplate(name = "{index}: balancer = {0}")
063public class TestRegionRebalancing {
064
065  public static Stream<Arguments> parameters() {
066    return Stream.of(Arguments.of("org.apache.hadoop.hbase.master.balancer.SimpleLoadBalancer"),
067      Arguments.of("org.apache.hadoop.hbase.master.balancer.StochasticLoadBalancer"));
068  }
069
070  private static final byte[] FAMILY_NAME = Bytes.toBytes("col");
071  private static final Logger LOG = LoggerFactory.getLogger(TestRegionRebalancing.class);
072  private final HBaseTestingUtil UTIL = new HBaseTestingUtil();
073  private RegionLocator regionLocator;
074  private TableDescriptor tableDescriptor;
075  private String balancerName;
076
077  public TestRegionRebalancing(String balancerName) {
078    this.balancerName = balancerName;
079
080  }
081
082  @AfterEach
083  public void after() throws Exception {
084    UTIL.shutdownMiniCluster();
085  }
086
087  @BeforeEach
088  public void before() throws Exception {
089    UTIL.getConfiguration().set("hbase.master.loadbalancer.class", this.balancerName);
090    // set minCostNeedBalance to 0, make sure balancer run
091    UTIL.startMiniCluster(1);
092
093    this.tableDescriptor = TableDescriptorBuilder.newBuilder(TableName.valueOf("test"))
094      .setColumnFamily(ColumnFamilyDescriptorBuilder.of(FAMILY_NAME)).build();
095  }
096
097  /**
098   * For HBASE-71. Try a few different configurations of starting and stopping region servers to see
099   * if the assignment or regions is pretty balanced.
100   */
101  @TestTemplate
102  public void testRebalanceOnRegionServerNumberChange() throws IOException, InterruptedException {
103    try (Connection connection = ConnectionFactory.createConnection(UTIL.getConfiguration());
104      Admin admin = connection.getAdmin()) {
105      admin.createTable(this.tableDescriptor,
106        Arrays.copyOfRange(HBaseTestingUtil.KEYS, 1, HBaseTestingUtil.KEYS.length));
107      this.regionLocator = connection.getRegionLocator(this.tableDescriptor.getTableName());
108
109      MetaTableAccessor.fullScanMetaAndPrint(admin.getConnection());
110
111      assertEquals(HBaseTestingUtil.KEYS.length, this.regionLocator.getStartKeys().length,
112        "Test table should have right number of regions");
113
114      // verify that the region assignments are balanced to start out
115      assertRegionsAreBalanced();
116
117      // add a region server - total of 2
118      LOG.info("Started second server="
119        + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
120      UTIL.getHBaseCluster().getMaster().balance();
121      assertRegionsAreBalanced();
122
123      // On a balanced cluster, calling balance() should return true
124      BalanceResponse response = UTIL.getHBaseCluster().getMaster().balance();
125      assertTrue(response.isBalancerRan());
126      assertEquals(0, response.getMovesCalculated());
127      assertEquals(0, response.getMovesExecuted());
128
129      // if we add a server, then the balance() call should return true
130      // add a region server - total of 3
131      LOG.info("Started third server="
132        + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
133      waitForAllRegionsAssigned();
134
135      response = UTIL.getHBaseCluster().getMaster().balance();
136      assertTrue(response.isBalancerRan());
137      assertTrue(response.getMovesCalculated() > 0);
138      assertEquals(response.getMovesCalculated(), response.getMovesExecuted());
139      assertRegionsAreBalanced();
140
141      // kill a region server - total of 2
142      LOG.info("Stopped third server=" + UTIL.getHBaseCluster().stopRegionServer(2, false));
143      UTIL.getHBaseCluster().waitOnRegionServer(2);
144      waitOnCrashProcessing();
145      UTIL.getHBaseCluster().getMaster().balance();
146      assertRegionsAreBalanced();
147
148      // start two more region servers - total of 4
149      LOG.info("Readding third server="
150        + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
151      LOG.info("Added fourth server="
152        + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
153      waitOnCrashProcessing();
154      waitForAllRegionsAssigned();
155
156      response = UTIL.getHBaseCluster().getMaster().balance();
157      assertTrue(response.isBalancerRan());
158      assertTrue(response.getMovesCalculated() > 0);
159      assertEquals(response.getMovesCalculated(), response.getMovesExecuted());
160
161      assertRegionsAreBalanced();
162      for (int i = 0; i < 6; i++) {
163        LOG.info("Adding " + (i + 5) + "th region server");
164        UTIL.getHBaseCluster().startRegionServer();
165      }
166      waitForAllRegionsAssigned();
167
168      response = UTIL.getHBaseCluster().getMaster().balance();
169      assertTrue(response.isBalancerRan());
170      assertTrue(response.getMovesCalculated() > 0);
171      assertEquals(response.getMovesCalculated(), response.getMovesExecuted());
172
173      assertRegionsAreBalanced();
174      regionLocator.close();
175    }
176  }
177
178  /**
179   * Wait on crash processing. Balancer won't run if processing a crashed server.
180   */
181  private void waitOnCrashProcessing() throws IOException {
182    while (UTIL.getHBaseCluster().getMaster().getServerManager().areDeadServersInProgress()) {
183      LOG.info("Waiting on processing of crashed server before proceeding...");
184      Threads.sleep(1000);
185    }
186  }
187
188  /**
189   * Determine if regions are balanced. Figure out the total, divide by the number of online
190   * servers, then test if each server is +/- 1 of average rounded up.
191   */
192  private void assertRegionsAreBalanced() throws IOException {
193    // TODO: Fix this test. Old balancer used to run with 'slop'. New
194    // balancer does not.
195    boolean success = false;
196    float slop = (float) UTIL.getConfiguration().getFloat("hbase.regions.slop", 0.1f);
197    if (slop <= 0) slop = 1;
198
199    for (int i = 0; i < 5; i++) {
200      success = true;
201      // make sure all the regions are reassigned before we test balance
202      waitForAllRegionsAssigned();
203
204      long regionCount = UTIL.getMiniHBaseCluster().countServedRegions();
205      List<HRegionServer> servers = getOnlineRegionServers();
206      double avg = (double) regionCount / (double) servers.size();
207      int avgLoadPlusSlop = (int) Math.ceil(avg * (1 + slop));
208      int avgLoadMinusSlop = (int) Math.floor(avg * (1 - slop)) - 1;
209      // Increase the margin a little to accommodate StochasticLoadBalancer
210      if (this.balancerName.contains("StochasticLoadBalancer")) {
211        avgLoadPlusSlop++;
212        avgLoadMinusSlop--;
213      }
214      LOG.debug("There are " + servers.size() + " servers and " + regionCount
215        + " regions. Load Average: " + avg + " low border: " + avgLoadMinusSlop + ", up border: "
216        + avgLoadPlusSlop + "; attempt: " + i);
217
218      for (HRegionServer server : servers) {
219        int serverLoad = ProtobufUtil.getOnlineRegions(server.getRSRpcServices()).size();
220        LOG.debug(server.getServerName() + " Avg: " + avg + " actual: " + serverLoad);
221        if (!(avg > 2.0 && serverLoad <= avgLoadPlusSlop && serverLoad >= avgLoadMinusSlop)) {
222          for (RegionInfo hri : ProtobufUtil.getOnlineRegions(server.getRSRpcServices())) {
223            if (hri.isMetaRegion()) serverLoad--;
224            // LOG.debug(hri.getRegionNameAsString());
225          }
226          if (!(serverLoad <= avgLoadPlusSlop && serverLoad >= avgLoadMinusSlop)) {
227            LOG.debug(server.getServerName() + " Isn't balanced!!! Avg: " + avg + " actual: "
228              + serverLoad + " slop: " + slop);
229            success = false;
230            break;
231          }
232        }
233      }
234
235      if (!success) {
236        // one or more servers are not balanced. sleep a little to give it a
237        // chance to catch up. then, go back to the retry loop.
238        try {
239          Thread.sleep(10000);
240        } catch (InterruptedException e) {
241        }
242
243        UTIL.getHBaseCluster().getMaster().balance();
244        continue;
245      }
246
247      // if we get here, all servers were balanced, so we should just return.
248      return;
249    }
250    // if we get here, we tried 5 times and never got to short circuit out of
251    // the retry loop, so this is a failure.
252    fail("After 5 attempts, region assignments were not balanced.");
253  }
254
255  private List<HRegionServer> getOnlineRegionServers() {
256    List<HRegionServer> list = new ArrayList<>();
257    for (JVMClusterUtil.RegionServerThread rst : UTIL.getHBaseCluster().getRegionServerThreads()) {
258      if (rst.getRegionServer().isOnline()) {
259        list.add(rst.getRegionServer());
260      }
261    }
262    return list;
263  }
264
265  /**
266   * Wait until all the regions are assigned.
267   */
268  private void waitForAllRegionsAssigned() throws IOException {
269    int totalRegions = HBaseTestingUtil.KEYS.length;
270    try {
271      Thread.sleep(200);
272    } catch (InterruptedException e) {
273      throw new InterruptedIOException();
274    }
275    while (UTIL.getMiniHBaseCluster().countServedRegions() < totalRegions) {
276      // while (!cluster.getMaster().allRegionsAssigned()) {
277      LOG.debug("Waiting for there to be " + totalRegions + " regions, but there are "
278        + UTIL.getMiniHBaseCluster().countServedRegions() + " right now.");
279      try {
280        Thread.sleep(200);
281      } catch (InterruptedException e) {
282        throw new InterruptedIOException();
283      }
284    }
285    UTIL.waitUntilNoRegionsInTransition();
286  }
287
288}