001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase; 019 020import static org.junit.Assert.assertEquals; 021import static org.junit.Assert.assertTrue; 022import static org.junit.Assert.fail; 023 024import java.io.IOException; 025import java.io.InterruptedIOException; 026import java.util.ArrayList; 027import java.util.Arrays; 028import java.util.Collection; 029import java.util.List; 030import org.apache.hadoop.hbase.client.Admin; 031import org.apache.hadoop.hbase.client.BalanceResponse; 032import org.apache.hadoop.hbase.client.Connection; 033import org.apache.hadoop.hbase.client.ConnectionFactory; 034import org.apache.hadoop.hbase.client.RegionInfo; 035import org.apache.hadoop.hbase.client.RegionLocator; 036import org.apache.hadoop.hbase.regionserver.HRegionServer; 037import org.apache.hadoop.hbase.testclassification.FlakeyTests; 038import org.apache.hadoop.hbase.testclassification.LargeTests; 039import org.apache.hadoop.hbase.util.Bytes; 040import org.apache.hadoop.hbase.util.JVMClusterUtil; 041import org.apache.hadoop.hbase.util.Threads; 042import org.junit.After; 043import org.junit.Before; 044import org.junit.ClassRule; 045import org.junit.Test; 046import org.junit.experimental.categories.Category; 047import org.junit.runner.RunWith; 048import org.junit.runners.Parameterized; 049import org.junit.runners.Parameterized.Parameters; 050import org.slf4j.Logger; 051import org.slf4j.LoggerFactory; 052 053import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; 054 055/** 056 * Test whether region re-balancing works. (HBASE-71) The test only works for cluster wide 057 * balancing, not per table wide. Increase the margin a little to make StochasticLoadBalancer result 058 * acceptable. 059 */ 060@Category({ FlakeyTests.class, LargeTests.class }) 061@RunWith(value = Parameterized.class) 062public class TestRegionRebalancing { 063 064 @ClassRule 065 public static final HBaseClassTestRule CLASS_RULE = 066 HBaseClassTestRule.forClass(TestRegionRebalancing.class); 067 068 @Parameters 069 public static Collection<Object[]> data() { 070 Object[][] balancers = 071 new String[][] { { "org.apache.hadoop.hbase.master.balancer.SimpleLoadBalancer" }, 072 { "org.apache.hadoop.hbase.master.balancer.StochasticLoadBalancer" } }; 073 return Arrays.asList(balancers); 074 } 075 076 private static final byte[] FAMILY_NAME = Bytes.toBytes("col"); 077 private static final Logger LOG = LoggerFactory.getLogger(TestRegionRebalancing.class); 078 private final HBaseTestingUtility UTIL = new HBaseTestingUtility(); 079 private RegionLocator regionLocator; 080 private HTableDescriptor desc; 081 private String balancerName; 082 083 public TestRegionRebalancing(String balancerName) { 084 this.balancerName = balancerName; 085 086 } 087 088 @After 089 public void after() throws Exception { 090 UTIL.shutdownMiniCluster(); 091 } 092 093 @Before 094 public void before() throws Exception { 095 UTIL.getConfiguration().set("hbase.master.loadbalancer.class", this.balancerName); 096 // set minCostNeedBalance to 0, make sure balancer run 097 UTIL.startMiniCluster(1); 098 this.desc = new HTableDescriptor(TableName.valueOf("test")); 099 this.desc.addFamily(new HColumnDescriptor(FAMILY_NAME)); 100 } 101 102 /** 103 * For HBASE-71. Try a few different configurations of starting and stopping region servers to see 104 * if the assignment or regions is pretty balanced. nn 105 */ 106 @Test 107 public void testRebalanceOnRegionServerNumberChange() throws IOException, InterruptedException { 108 try (Connection connection = ConnectionFactory.createConnection(UTIL.getConfiguration()); 109 Admin admin = connection.getAdmin()) { 110 admin.createTable(this.desc, 111 Arrays.copyOfRange(HBaseTestingUtility.KEYS, 1, HBaseTestingUtility.KEYS.length)); 112 this.regionLocator = connection.getRegionLocator(this.desc.getTableName()); 113 114 MetaTableAccessor.fullScanMetaAndPrint(admin.getConnection()); 115 116 assertEquals("Test table should have right number of regions", 117 HBaseTestingUtility.KEYS.length, this.regionLocator.getStartKeys().length); 118 119 // verify that the region assignments are balanced to start out 120 assertRegionsAreBalanced(); 121 122 // add a region server - total of 2 123 LOG.info("Started second server=" 124 + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName()); 125 UTIL.getHBaseCluster().getMaster().balance(); 126 assertRegionsAreBalanced(); 127 128 // On a balanced cluster, calling balance() should return true 129 BalanceResponse response = UTIL.getHBaseCluster().getMaster().balance(); 130 assertTrue(response.isBalancerRan()); 131 assertEquals(0, response.getMovesCalculated()); 132 assertEquals(0, response.getMovesExecuted()); 133 134 // if we add a server, then the balance() call should return true 135 // add a region server - total of 3 136 LOG.info("Started third server=" 137 + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName()); 138 waitForAllRegionsAssigned(); 139 140 response = UTIL.getHBaseCluster().getMaster().balance(); 141 assertTrue(response.isBalancerRan()); 142 assertTrue(response.getMovesCalculated() > 0); 143 assertEquals(response.getMovesCalculated(), response.getMovesExecuted()); 144 assertRegionsAreBalanced(); 145 146 // kill a region server - total of 2 147 LOG.info("Stopped third server=" + UTIL.getHBaseCluster().stopRegionServer(2, false)); 148 UTIL.getHBaseCluster().waitOnRegionServer(2); 149 waitOnCrashProcessing(); 150 UTIL.getHBaseCluster().getMaster().balance(); 151 assertRegionsAreBalanced(); 152 153 // start two more region servers - total of 4 154 LOG.info("Readding third server=" 155 + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName()); 156 LOG.info("Added fourth server=" 157 + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName()); 158 waitOnCrashProcessing(); 159 waitForAllRegionsAssigned(); 160 161 response = UTIL.getHBaseCluster().getMaster().balance(); 162 assertTrue(response.isBalancerRan()); 163 assertTrue(response.getMovesCalculated() > 0); 164 assertEquals(response.getMovesCalculated(), response.getMovesExecuted()); 165 166 assertRegionsAreBalanced(); 167 for (int i = 0; i < 6; i++) { 168 LOG.info("Adding " + (i + 5) + "th region server"); 169 UTIL.getHBaseCluster().startRegionServer(); 170 } 171 waitForAllRegionsAssigned(); 172 173 response = UTIL.getHBaseCluster().getMaster().balance(); 174 assertTrue(response.isBalancerRan()); 175 assertTrue(response.getMovesCalculated() > 0); 176 assertEquals(response.getMovesCalculated(), response.getMovesExecuted()); 177 178 assertRegionsAreBalanced(); 179 regionLocator.close(); 180 } 181 } 182 183 /** 184 * Wait on crash processing. Balancer won't run if processing a crashed server. 185 */ 186 private void waitOnCrashProcessing() { 187 while (UTIL.getHBaseCluster().getMaster().getServerManager().areDeadServersInProgress()) { 188 LOG.info("Waiting on processing of crashed server before proceeding..."); 189 Threads.sleep(1000); 190 } 191 } 192 193 /** 194 * Determine if regions are balanced. Figure out the total, divide by the number of online 195 * servers, then test if each server is +/- 1 of average rounded up. 196 */ 197 private void assertRegionsAreBalanced() throws IOException { 198 // TODO: Fix this test. Old balancer used to run with 'slop'. New 199 // balancer does not. 200 boolean success = false; 201 float slop = (float) UTIL.getConfiguration().getFloat("hbase.regions.slop", 0.1f); 202 if (slop <= 0) slop = 1; 203 204 for (int i = 0; i < 5; i++) { 205 success = true; 206 // make sure all the regions are reassigned before we test balance 207 waitForAllRegionsAssigned(); 208 209 long regionCount = UTIL.getMiniHBaseCluster().countServedRegions(); 210 List<HRegionServer> servers = getOnlineRegionServers(); 211 double avg = (double) regionCount / (double) servers.size(); 212 int avgLoadPlusSlop = (int) Math.ceil(avg * (1 + slop)); 213 int avgLoadMinusSlop = (int) Math.floor(avg * (1 - slop)) - 1; 214 // Increase the margin a little to accommodate StochasticLoadBalancer 215 if (this.balancerName.contains("StochasticLoadBalancer")) { 216 avgLoadPlusSlop++; 217 avgLoadMinusSlop--; 218 } 219 LOG.debug("There are " + servers.size() + " servers and " + regionCount 220 + " regions. Load Average: " + avg + " low border: " + avgLoadMinusSlop + ", up border: " 221 + avgLoadPlusSlop + "; attempt: " + i); 222 223 for (HRegionServer server : servers) { 224 int serverLoad = ProtobufUtil.getOnlineRegions(server.getRSRpcServices()).size(); 225 LOG.debug(server.getServerName() + " Avg: " + avg + " actual: " + serverLoad); 226 if (!(avg > 2.0 && serverLoad <= avgLoadPlusSlop && serverLoad >= avgLoadMinusSlop)) { 227 for (RegionInfo hri : ProtobufUtil.getOnlineRegions(server.getRSRpcServices())) { 228 if (hri.isMetaRegion()) serverLoad--; 229 // LOG.debug(hri.getRegionNameAsString()); 230 } 231 if (!(serverLoad <= avgLoadPlusSlop && serverLoad >= avgLoadMinusSlop)) { 232 LOG.debug(server.getServerName() + " Isn't balanced!!! Avg: " + avg + " actual: " 233 + serverLoad + " slop: " + slop); 234 success = false; 235 break; 236 } 237 } 238 } 239 240 if (!success) { 241 // one or more servers are not balanced. sleep a little to give it a 242 // chance to catch up. then, go back to the retry loop. 243 try { 244 Thread.sleep(10000); 245 } catch (InterruptedException e) { 246 } 247 248 UTIL.getHBaseCluster().getMaster().balance(); 249 continue; 250 } 251 252 // if we get here, all servers were balanced, so we should just return. 253 return; 254 } 255 // if we get here, we tried 5 times and never got to short circuit out of 256 // the retry loop, so this is a failure. 257 fail("After 5 attempts, region assignments were not balanced."); 258 } 259 260 private List<HRegionServer> getOnlineRegionServers() { 261 List<HRegionServer> list = new ArrayList<>(); 262 for (JVMClusterUtil.RegionServerThread rst : UTIL.getHBaseCluster().getRegionServerThreads()) { 263 if (rst.getRegionServer().isOnline()) { 264 list.add(rst.getRegionServer()); 265 } 266 } 267 return list; 268 } 269 270 /** 271 * Wait until all the regions are assigned. 272 */ 273 private void waitForAllRegionsAssigned() throws IOException { 274 int totalRegions = HBaseTestingUtility.KEYS.length; 275 try { 276 Thread.sleep(200); 277 } catch (InterruptedException e) { 278 throw new InterruptedIOException(); 279 } 280 while (UTIL.getMiniHBaseCluster().countServedRegions() < totalRegions) { 281 // while (!cluster.getMaster().allRegionsAssigned()) { 282 LOG.debug("Waiting for there to be " + totalRegions + " regions, but there are " 283 + UTIL.getMiniHBaseCluster().countServedRegions() + " right now."); 284 try { 285 Thread.sleep(200); 286 } catch (InterruptedException e) { 287 throw new InterruptedIOException(); 288 } 289 } 290 UTIL.waitUntilNoRegionsInTransition(); 291 } 292 293}