001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase;
019
020import static org.junit.Assert.assertEquals;
021import static org.junit.Assert.assertTrue;
022import static org.junit.Assert.fail;
023
024import java.io.IOException;
025import java.io.InterruptedIOException;
026import java.util.ArrayList;
027import java.util.Arrays;
028import java.util.Collection;
029import java.util.List;
030import org.apache.hadoop.hbase.client.Admin;
031import org.apache.hadoop.hbase.client.BalanceResponse;
032import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
033import org.apache.hadoop.hbase.client.Connection;
034import org.apache.hadoop.hbase.client.ConnectionFactory;
035import org.apache.hadoop.hbase.client.RegionInfo;
036import org.apache.hadoop.hbase.client.RegionLocator;
037import org.apache.hadoop.hbase.client.TableDescriptor;
038import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
039import org.apache.hadoop.hbase.regionserver.HRegionServer;
040import org.apache.hadoop.hbase.testclassification.FlakeyTests;
041import org.apache.hadoop.hbase.testclassification.LargeTests;
042import org.apache.hadoop.hbase.util.Bytes;
043import org.apache.hadoop.hbase.util.JVMClusterUtil;
044import org.apache.hadoop.hbase.util.Threads;
045import org.junit.After;
046import org.junit.Before;
047import org.junit.ClassRule;
048import org.junit.Test;
049import org.junit.experimental.categories.Category;
050import org.junit.runner.RunWith;
051import org.junit.runners.Parameterized;
052import org.junit.runners.Parameterized.Parameters;
053import org.slf4j.Logger;
054import org.slf4j.LoggerFactory;
055
056import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
057
058/**
059 * Test whether region re-balancing works. (HBASE-71) The test only works for cluster wide
060 * balancing, not per table wide. Increase the margin a little to make StochasticLoadBalancer result
061 * acceptable.
062 */
063@Category({ FlakeyTests.class, LargeTests.class })
064@RunWith(value = Parameterized.class)
065public class TestRegionRebalancing {
066
067  @ClassRule
068  public static final HBaseClassTestRule CLASS_RULE =
069    HBaseClassTestRule.forClass(TestRegionRebalancing.class);
070
071  @Parameters
072  public static Collection<Object[]> data() {
073    Object[][] balancers =
074      new String[][] { { "org.apache.hadoop.hbase.master.balancer.SimpleLoadBalancer" },
075        { "org.apache.hadoop.hbase.master.balancer.StochasticLoadBalancer" } };
076    return Arrays.asList(balancers);
077  }
078
079  private static final byte[] FAMILY_NAME = Bytes.toBytes("col");
080  private static final Logger LOG = LoggerFactory.getLogger(TestRegionRebalancing.class);
081  private final HBaseTestingUtil UTIL = new HBaseTestingUtil();
082  private RegionLocator regionLocator;
083  private TableDescriptor tableDescriptor;
084  private String balancerName;
085
086  public TestRegionRebalancing(String balancerName) {
087    this.balancerName = balancerName;
088
089  }
090
091  @After
092  public void after() throws Exception {
093    UTIL.shutdownMiniCluster();
094  }
095
096  @Before
097  public void before() throws Exception {
098    UTIL.getConfiguration().set("hbase.master.loadbalancer.class", this.balancerName);
099    // set minCostNeedBalance to 0, make sure balancer run
100    UTIL.startMiniCluster(1);
101
102    this.tableDescriptor = TableDescriptorBuilder.newBuilder(TableName.valueOf("test"))
103      .setColumnFamily(ColumnFamilyDescriptorBuilder.of(FAMILY_NAME)).build();
104  }
105
106  /**
107   * For HBASE-71. Try a few different configurations of starting and stopping region servers to see
108   * if the assignment or regions is pretty balanced.
109   */
110  @Test
111  public void testRebalanceOnRegionServerNumberChange() throws IOException, InterruptedException {
112    try (Connection connection = ConnectionFactory.createConnection(UTIL.getConfiguration());
113      Admin admin = connection.getAdmin()) {
114      admin.createTable(this.tableDescriptor,
115        Arrays.copyOfRange(HBaseTestingUtil.KEYS, 1, HBaseTestingUtil.KEYS.length));
116      this.regionLocator = connection.getRegionLocator(this.tableDescriptor.getTableName());
117
118      MetaTableAccessor.fullScanMetaAndPrint(admin.getConnection());
119
120      assertEquals("Test table should have right number of regions", HBaseTestingUtil.KEYS.length,
121        this.regionLocator.getStartKeys().length);
122
123      // verify that the region assignments are balanced to start out
124      assertRegionsAreBalanced();
125
126      // add a region server - total of 2
127      LOG.info("Started second server="
128        + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
129      UTIL.getHBaseCluster().getMaster().balance();
130      assertRegionsAreBalanced();
131
132      // On a balanced cluster, calling balance() should return true
133      BalanceResponse response = UTIL.getHBaseCluster().getMaster().balance();
134      assertTrue(response.isBalancerRan());
135      assertEquals(0, response.getMovesCalculated());
136      assertEquals(0, response.getMovesExecuted());
137
138      // if we add a server, then the balance() call should return true
139      // add a region server - total of 3
140      LOG.info("Started third server="
141        + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
142      waitForAllRegionsAssigned();
143
144      response = UTIL.getHBaseCluster().getMaster().balance();
145      assertTrue(response.isBalancerRan());
146      assertTrue(response.getMovesCalculated() > 0);
147      assertEquals(response.getMovesCalculated(), response.getMovesExecuted());
148      assertRegionsAreBalanced();
149
150      // kill a region server - total of 2
151      LOG.info("Stopped third server=" + UTIL.getHBaseCluster().stopRegionServer(2, false));
152      UTIL.getHBaseCluster().waitOnRegionServer(2);
153      waitOnCrashProcessing();
154      UTIL.getHBaseCluster().getMaster().balance();
155      assertRegionsAreBalanced();
156
157      // start two more region servers - total of 4
158      LOG.info("Readding third server="
159        + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
160      LOG.info("Added fourth server="
161        + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
162      waitOnCrashProcessing();
163      waitForAllRegionsAssigned();
164
165      response = UTIL.getHBaseCluster().getMaster().balance();
166      assertTrue(response.isBalancerRan());
167      assertTrue(response.getMovesCalculated() > 0);
168      assertEquals(response.getMovesCalculated(), response.getMovesExecuted());
169
170      assertRegionsAreBalanced();
171      for (int i = 0; i < 6; i++) {
172        LOG.info("Adding " + (i + 5) + "th region server");
173        UTIL.getHBaseCluster().startRegionServer();
174      }
175      waitForAllRegionsAssigned();
176
177      response = UTIL.getHBaseCluster().getMaster().balance();
178      assertTrue(response.isBalancerRan());
179      assertTrue(response.getMovesCalculated() > 0);
180      assertEquals(response.getMovesCalculated(), response.getMovesExecuted());
181
182      assertRegionsAreBalanced();
183      regionLocator.close();
184    }
185  }
186
187  /**
188   * Wait on crash processing. Balancer won't run if processing a crashed server.
189   */
190  private void waitOnCrashProcessing() throws IOException {
191    while (UTIL.getHBaseCluster().getMaster().getServerManager().areDeadServersInProgress()) {
192      LOG.info("Waiting on processing of crashed server before proceeding...");
193      Threads.sleep(1000);
194    }
195  }
196
197  /**
198   * Determine if regions are balanced. Figure out the total, divide by the number of online
199   * servers, then test if each server is +/- 1 of average rounded up.
200   */
201  private void assertRegionsAreBalanced() throws IOException {
202    // TODO: Fix this test. Old balancer used to run with 'slop'. New
203    // balancer does not.
204    boolean success = false;
205    float slop = (float) UTIL.getConfiguration().getFloat("hbase.regions.slop", 0.1f);
206    if (slop <= 0) slop = 1;
207
208    for (int i = 0; i < 5; i++) {
209      success = true;
210      // make sure all the regions are reassigned before we test balance
211      waitForAllRegionsAssigned();
212
213      long regionCount = UTIL.getMiniHBaseCluster().countServedRegions();
214      List<HRegionServer> servers = getOnlineRegionServers();
215      double avg = (double) regionCount / (double) servers.size();
216      int avgLoadPlusSlop = (int) Math.ceil(avg * (1 + slop));
217      int avgLoadMinusSlop = (int) Math.floor(avg * (1 - slop)) - 1;
218      // Increase the margin a little to accommodate StochasticLoadBalancer
219      if (this.balancerName.contains("StochasticLoadBalancer")) {
220        avgLoadPlusSlop++;
221        avgLoadMinusSlop--;
222      }
223      LOG.debug("There are " + servers.size() + " servers and " + regionCount
224        + " regions. Load Average: " + avg + " low border: " + avgLoadMinusSlop + ", up border: "
225        + avgLoadPlusSlop + "; attempt: " + i);
226
227      for (HRegionServer server : servers) {
228        int serverLoad = ProtobufUtil.getOnlineRegions(server.getRSRpcServices()).size();
229        LOG.debug(server.getServerName() + " Avg: " + avg + " actual: " + serverLoad);
230        if (!(avg > 2.0 && serverLoad <= avgLoadPlusSlop && serverLoad >= avgLoadMinusSlop)) {
231          for (RegionInfo hri : ProtobufUtil.getOnlineRegions(server.getRSRpcServices())) {
232            if (hri.isMetaRegion()) serverLoad--;
233            // LOG.debug(hri.getRegionNameAsString());
234          }
235          if (!(serverLoad <= avgLoadPlusSlop && serverLoad >= avgLoadMinusSlop)) {
236            LOG.debug(server.getServerName() + " Isn't balanced!!! Avg: " + avg + " actual: "
237              + serverLoad + " slop: " + slop);
238            success = false;
239            break;
240          }
241        }
242      }
243
244      if (!success) {
245        // one or more servers are not balanced. sleep a little to give it a
246        // chance to catch up. then, go back to the retry loop.
247        try {
248          Thread.sleep(10000);
249        } catch (InterruptedException e) {
250        }
251
252        UTIL.getHBaseCluster().getMaster().balance();
253        continue;
254      }
255
256      // if we get here, all servers were balanced, so we should just return.
257      return;
258    }
259    // if we get here, we tried 5 times and never got to short circuit out of
260    // the retry loop, so this is a failure.
261    fail("After 5 attempts, region assignments were not balanced.");
262  }
263
264  private List<HRegionServer> getOnlineRegionServers() {
265    List<HRegionServer> list = new ArrayList<>();
266    for (JVMClusterUtil.RegionServerThread rst : UTIL.getHBaseCluster().getRegionServerThreads()) {
267      if (rst.getRegionServer().isOnline()) {
268        list.add(rst.getRegionServer());
269      }
270    }
271    return list;
272  }
273
274  /**
275   * Wait until all the regions are assigned.
276   */
277  private void waitForAllRegionsAssigned() throws IOException {
278    int totalRegions = HBaseTestingUtil.KEYS.length;
279    try {
280      Thread.sleep(200);
281    } catch (InterruptedException e) {
282      throw new InterruptedIOException();
283    }
284    while (UTIL.getMiniHBaseCluster().countServedRegions() < totalRegions) {
285      // while (!cluster.getMaster().allRegionsAssigned()) {
286      LOG.debug("Waiting for there to be " + totalRegions + " regions, but there are "
287        + UTIL.getMiniHBaseCluster().countServedRegions() + " right now.");
288      try {
289        Thread.sleep(200);
290      } catch (InterruptedException e) {
291        throw new InterruptedIOException();
292      }
293    }
294    UTIL.waitUntilNoRegionsInTransition();
295  }
296
297}