001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase;
019
020import static org.junit.Assert.assertEquals;
021import static org.junit.Assert.fail;
022
023import java.io.IOException;
024import java.io.InterruptedIOException;
025import java.util.ArrayList;
026import java.util.Arrays;
027import java.util.Collection;
028import java.util.List;
029import org.apache.hadoop.hbase.client.Admin;
030import org.apache.hadoop.hbase.client.Connection;
031import org.apache.hadoop.hbase.client.ConnectionFactory;
032import org.apache.hadoop.hbase.client.RegionInfo;
033import org.apache.hadoop.hbase.client.RegionLocator;
034import org.apache.hadoop.hbase.regionserver.HRegionServer;
035import org.apache.hadoop.hbase.testclassification.FlakeyTests;
036import org.apache.hadoop.hbase.testclassification.LargeTests;
037import org.apache.hadoop.hbase.util.Bytes;
038import org.apache.hadoop.hbase.util.JVMClusterUtil;
039import org.apache.hadoop.hbase.util.Threads;
040import org.junit.After;
041import org.junit.Before;
042import org.junit.ClassRule;
043import org.junit.Test;
044import org.junit.experimental.categories.Category;
045import org.junit.runner.RunWith;
046import org.junit.runners.Parameterized;
047import org.junit.runners.Parameterized.Parameters;
048import org.slf4j.Logger;
049import org.slf4j.LoggerFactory;
050
051import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
052
053/**
054 * Test whether region re-balancing works. (HBASE-71)
055 * The test only works for cluster wide balancing, not per table wide.
056 * Increase the margin a little to make StochasticLoadBalancer result acceptable.
057 */
058@Category({FlakeyTests.class, LargeTests.class})
059@RunWith(value = Parameterized.class)
060public class TestRegionRebalancing {
061
062  @ClassRule
063  public static final HBaseClassTestRule CLASS_RULE =
064      HBaseClassTestRule.forClass(TestRegionRebalancing.class);
065
066  @Parameters
067  public static Collection<Object[]> data() {
068    Object[][] balancers =
069        new String[][] { { "org.apache.hadoop.hbase.master.balancer.SimpleLoadBalancer" },
070            { "org.apache.hadoop.hbase.master.balancer.StochasticLoadBalancer" } };
071    return Arrays.asList(balancers);
072  }
073
074  private static final byte[] FAMILY_NAME = Bytes.toBytes("col");
075  private static final Logger LOG = LoggerFactory.getLogger(TestRegionRebalancing.class);
076  private final HBaseTestingUtility UTIL = new HBaseTestingUtility();
077  private RegionLocator regionLocator;
078  private HTableDescriptor desc;
079  private String balancerName;
080
081  public TestRegionRebalancing(String balancerName) {
082    this.balancerName = balancerName;
083
084  }
085
086  @After
087  public void after() throws Exception {
088    UTIL.shutdownMiniCluster();
089  }
090
091  @Before
092  public void before() throws Exception {
093    UTIL.getConfiguration().set("hbase.master.loadbalancer.class", this.balancerName);
094    // set minCostNeedBalance to 0, make sure balancer run
095    UTIL.startMiniCluster(1);
096    this.desc = new HTableDescriptor(TableName.valueOf("test"));
097    this.desc.addFamily(new HColumnDescriptor(FAMILY_NAME));
098  }
099
100  /**
101   * For HBASE-71. Try a few different configurations of starting and stopping
102   * region servers to see if the assignment or regions is pretty balanced.
103   * @throws IOException
104   * @throws InterruptedException
105   */
106  @Test
107  public void testRebalanceOnRegionServerNumberChange()
108  throws IOException, InterruptedException {
109    try(Connection connection = ConnectionFactory.createConnection(UTIL.getConfiguration());
110        Admin admin = connection.getAdmin()) {
111      admin.createTable(this.desc, Arrays.copyOfRange(HBaseTestingUtility.KEYS,
112          1, HBaseTestingUtility.KEYS.length));
113      this.regionLocator = connection.getRegionLocator(this.desc.getTableName());
114
115      MetaTableAccessor.fullScanMetaAndPrint(admin.getConnection());
116
117      assertEquals("Test table should have right number of regions",
118        HBaseTestingUtility.KEYS.length,
119        this.regionLocator.getStartKeys().length);
120
121      // verify that the region assignments are balanced to start out
122      assertRegionsAreBalanced();
123
124      // add a region server - total of 2
125      LOG.info("Started second server=" +
126        UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
127      UTIL.getHBaseCluster().getMaster().balance();
128      assertRegionsAreBalanced();
129
130      // On a balanced cluster, calling balance() should return true
131      assert(UTIL.getHBaseCluster().getMaster().balance() == true);
132
133      // if we add a server, then the balance() call should return true
134      // add a region server - total of 3
135      LOG.info("Started third server=" +
136          UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
137      waitForAllRegionsAssigned();
138      assert(UTIL.getHBaseCluster().getMaster().balance() == true);
139      assertRegionsAreBalanced();
140
141      // kill a region server - total of 2
142      LOG.info("Stopped third server=" + UTIL.getHBaseCluster().stopRegionServer(2, false));
143      UTIL.getHBaseCluster().waitOnRegionServer(2);
144      waitOnCrashProcessing();
145      UTIL.getHBaseCluster().getMaster().balance();
146      assertRegionsAreBalanced();
147
148      // start two more region servers - total of 4
149      LOG.info("Readding third server=" +
150          UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
151      LOG.info("Added fourth server=" +
152          UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
153      waitOnCrashProcessing();
154      waitForAllRegionsAssigned();
155      assert(UTIL.getHBaseCluster().getMaster().balance() == true);
156      assertRegionsAreBalanced();
157      for (int i = 0; i < 6; i++){
158        LOG.info("Adding " + (i + 5) + "th region server");
159        UTIL.getHBaseCluster().startRegionServer();
160      }
161      waitForAllRegionsAssigned();
162      assert(UTIL.getHBaseCluster().getMaster().balance() == true);
163      assertRegionsAreBalanced();
164      regionLocator.close();
165    }
166  }
167
168  /**
169   * Wait on crash processing. Balancer won't run if processing a crashed server.
170   */
171  private void waitOnCrashProcessing() {
172    while (UTIL.getHBaseCluster().getMaster().getServerManager().areDeadServersInProgress()) {
173      LOG.info("Waiting on processing of crashed server before proceeding...");
174      Threads.sleep(1000);
175    }
176  }
177
178  /**
179   * Determine if regions are balanced. Figure out the total, divide by the
180   * number of online servers, then test if each server is +/- 1 of average
181   * rounded up.
182   */
183  private void assertRegionsAreBalanced() throws IOException {
184    // TODO: Fix this test.  Old balancer used to run with 'slop'.  New
185    // balancer does not.
186    boolean success = false;
187    float slop = (float)UTIL.getConfiguration().getFloat("hbase.regions.slop", 0.1f);
188    if (slop <= 0) slop = 1;
189
190    for (int i = 0; i < 5; i++) {
191      success = true;
192      // make sure all the regions are reassigned before we test balance
193      waitForAllRegionsAssigned();
194
195      long regionCount = UTIL.getMiniHBaseCluster().countServedRegions();
196      List<HRegionServer> servers = getOnlineRegionServers();
197      double avg = (double)regionCount / (double)servers.size();
198      int avgLoadPlusSlop = (int)Math.ceil(avg * (1 + slop));
199      int avgLoadMinusSlop = (int)Math.floor(avg * (1 - slop)) - 1;
200      // Increase the margin a little to accommodate StochasticLoadBalancer
201      if (this.balancerName.contains("StochasticLoadBalancer")) {
202        avgLoadPlusSlop++;
203        avgLoadMinusSlop--;
204      }
205      LOG.debug("There are " + servers.size() + " servers and " + regionCount
206        + " regions. Load Average: " + avg + " low border: " + avgLoadMinusSlop
207        + ", up border: " + avgLoadPlusSlop + "; attempt: " + i);
208
209      for (HRegionServer server : servers) {
210        int serverLoad =
211          ProtobufUtil.getOnlineRegions(server.getRSRpcServices()).size();
212        LOG.debug(server.getServerName() + " Avg: " + avg + " actual: " + serverLoad);
213        if (!(avg > 2.0 && serverLoad <= avgLoadPlusSlop
214            && serverLoad >= avgLoadMinusSlop)) {
215          for (RegionInfo hri :
216              ProtobufUtil.getOnlineRegions(server.getRSRpcServices())) {
217            if (hri.isMetaRegion()) serverLoad--;
218            // LOG.debug(hri.getRegionNameAsString());
219          }
220          if (!(serverLoad <= avgLoadPlusSlop && serverLoad >= avgLoadMinusSlop)) {
221            LOG.debug(server.getServerName() + " Isn't balanced!!! Avg: " + avg +
222                " actual: " + serverLoad + " slop: " + slop);
223            success = false;
224            break;
225          }
226        }
227      }
228
229      if (!success) {
230        // one or more servers are not balanced. sleep a little to give it a
231        // chance to catch up. then, go back to the retry loop.
232        try {
233          Thread.sleep(10000);
234        } catch (InterruptedException e) {}
235
236        UTIL.getHBaseCluster().getMaster().balance();
237        continue;
238      }
239
240      // if we get here, all servers were balanced, so we should just return.
241      return;
242    }
243    // if we get here, we tried 5 times and never got to short circuit out of
244    // the retry loop, so this is a failure.
245    fail("After 5 attempts, region assignments were not balanced.");
246  }
247
248  private List<HRegionServer> getOnlineRegionServers() {
249    List<HRegionServer> list = new ArrayList<>();
250    for (JVMClusterUtil.RegionServerThread rst :
251        UTIL.getHBaseCluster().getRegionServerThreads()) {
252      if (rst.getRegionServer().isOnline()) {
253        list.add(rst.getRegionServer());
254      }
255    }
256    return list;
257  }
258
259  /**
260   * Wait until all the regions are assigned.
261   */
262  private void waitForAllRegionsAssigned() throws IOException {
263    int totalRegions = HBaseTestingUtility.KEYS.length;
264    try {
265        Thread.sleep(200);
266    } catch (InterruptedException e) {
267      throw new InterruptedIOException();
268    }
269    while (UTIL.getMiniHBaseCluster().countServedRegions() < totalRegions) {
270    // while (!cluster.getMaster().allRegionsAssigned()) {
271      LOG.debug("Waiting for there to be "+ totalRegions +" regions, but there are "
272        + UTIL.getMiniHBaseCluster().countServedRegions() + " right now.");
273      try {
274        Thread.sleep(200);
275      } catch (InterruptedException e) {
276        throw new InterruptedIOException();
277      }
278    }
279    UTIL.waitUntilNoRegionsInTransition();
280  }
281
282}
283