/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.IOException;
import java.io.InterruptedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.testclassification.FlakeyTests;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.JVMClusterUtil;
import org.apache.hadoop.hbase.util.Threads;
import org.junit.After;
import org.junit.Before;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;

/**
 * Test whether region re-balancing works. (HBASE-71)
 * The test only works for cluster wide balancing, not per table wide.
 * Increase the margin a little to make StochasticLoadBalancer result acceptable.
 *
 * <p>The test is parameterized over the load balancer implementation class name; each run
 * starts its own mini cluster in {@link #before()} and shuts it down in {@link #after()}.
 */
@Category({FlakeyTests.class, LargeTests.class})
@RunWith(value = Parameterized.class)
public class TestRegionRebalancing {

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
      HBaseClassTestRule.forClass(TestRegionRebalancing.class);

  /**
   * @return one single-element parameter array per load balancer class name under test
   */
  @Parameters
  public static Collection<Object[]> data() {
    Object[][] balancers =
        new Object[][] { { "org.apache.hadoop.hbase.master.balancer.SimpleLoadBalancer" },
          { "org.apache.hadoop.hbase.master.balancer.StochasticLoadBalancer" } };
    return Arrays.asList(balancers);
  }

  private static final byte[] FAMILY_NAME = Bytes.toBytes("col");
  private static final Logger LOG = LoggerFactory.getLogger(TestRegionRebalancing.class);

  /** How many times the balance check is retried before the test is failed. */
  private static final int MAX_BALANCE_ATTEMPTS = 5;
  /** Pause between balance checks, giving region moves a chance to complete. */
  private static final long BALANCE_RETRY_SLEEP_MS = 10000;
  /** Poll interval while waiting for the served-region count to reach the expected total. */
  private static final long ASSIGNMENT_POLL_MS = 200;

  private final HBaseTestingUtility UTIL = new HBaseTestingUtility();
  private final String balancerName;
  private HTableDescriptor desc;

  /**
   * @param balancerName fully qualified class name of the load balancer to configure
   */
  public TestRegionRebalancing(String balancerName) {
    this.balancerName = balancerName;
  }

  /** Shuts down the mini cluster started by {@link #before()}. */
  @After
  public void after() throws Exception {
    UTIL.shutdownMiniCluster();
  }

  /**
   * Configures the balancer class under test, starts a mini cluster (argument 1) and
   * prepares the descriptor of the test table.
   */
  @Before
  public void before() throws Exception {
    UTIL.getConfiguration().set("hbase.master.loadbalancer.class", this.balancerName);
    // NOTE(review): a previous comment here said "set minCostNeedBalance to 0, make sure
    // balancer run", but nothing in this method sets it -- confirm whether that setting
    // is still wanted.
    UTIL.startMiniCluster(1);
    this.desc = new HTableDescriptor(TableName.valueOf("test"));
    this.desc.addFamily(new HColumnDescriptor(FAMILY_NAME));
  }

  /**
   * For HBASE-71. Try a few different configurations of starting and stopping
   * region servers to see if the assignment of regions is pretty balanced.
   *
   * @throws IOException if a cluster, admin or balancer operation fails
   * @throws InterruptedException declared for callers; kept from the original signature
   */
  @Test
  public void testRebalanceOnRegionServerNumberChange()
      throws IOException, InterruptedException {
    try (Connection connection = ConnectionFactory.createConnection(UTIL.getConfiguration());
        Admin admin = connection.getAdmin()) {
      admin.createTable(this.desc, Arrays.copyOfRange(HBaseTestingUtility.KEYS,
          1, HBaseTestingUtility.KEYS.length));

      MetaTableAccessor.fullScanMetaAndPrint(admin.getConnection());

      // The locator is only needed for this one check; try-with-resources guarantees it is
      // closed even when the assertion fails.
      try (RegionLocator locator = connection.getRegionLocator(this.desc.getTableName())) {
        assertEquals("Test table should have right number of regions",
            HBaseTestingUtility.KEYS.length,
            locator.getStartKeys().length);
      }

      // verify that the region assignments are balanced to start out
      assertRegionsAreBalanced();

      // add a region server - total of 2
      LOG.info("Started second server={}",
          UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
      UTIL.getHBaseCluster().getMaster().balance();
      assertRegionsAreBalanced();

      // On a balanced cluster, calling balance() should return true.
      // JUnit assertTrue is used instead of the `assert` keyword so that the check (and the
      // balance() call itself) always runs, whether or not the JVM was started with -ea.
      assertTrue("balance() should return true on an already balanced cluster",
          UTIL.getHBaseCluster().getMaster().balance());

      // if we add a server, then the balance() call should return true
      // add a region server - total of 3
      LOG.info("Started third server={}",
          UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
      waitForAllRegionsAssigned();
      assertTrue("balance() should return true after adding a third server",
          UTIL.getHBaseCluster().getMaster().balance());
      assertRegionsAreBalanced();

      // kill a region server - total of 2
      // The stop call is its own statement so the side effect is not hidden inside logging.
      Object stoppedServer = UTIL.getHBaseCluster().stopRegionServer(2, false);
      LOG.info("Stopped third server={}", stoppedServer);
      UTIL.getHBaseCluster().waitOnRegionServer(2);
      waitOnCrashProcessing();
      UTIL.getHBaseCluster().getMaster().balance();
      assertRegionsAreBalanced();

      // start two more region servers - total of 4
      LOG.info("Readding third server={}",
          UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
      LOG.info("Added fourth server={}",
          UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
      waitOnCrashProcessing();
      waitForAllRegionsAssigned();
      assertTrue("balance() should return true after growing to four servers",
          UTIL.getHBaseCluster().getMaster().balance());
      assertRegionsAreBalanced();

      // add six more region servers
      for (int i = 0; i < 6; i++) {
        LOG.info("Adding {}th region server", i + 5);
        UTIL.getHBaseCluster().startRegionServer();
      }
      waitForAllRegionsAssigned();
      assertTrue("balance() should return true after adding six more servers",
          UTIL.getHBaseCluster().getMaster().balance());
      assertRegionsAreBalanced();
    }
  }

  /**
   * Wait on crash processing. Balancer won't run if processing a crashed server.
   */
  private void waitOnCrashProcessing() {
    while (UTIL.getHBaseCluster().getMaster().getServerManager().areDeadServersInProgress()) {
      LOG.info("Waiting on processing of crashed server before proceeding...");
      Threads.sleep(1000);
    }
  }

  /**
   * Determine if regions are balanced. Figure out the total, divide by the
   * number of online servers, then test if each server's load lies within the
   * slop-derived borders around that average. The check is retried up to
   * {@link #MAX_BALANCE_ATTEMPTS} times, sleeping and re-invoking the balancer between
   * attempts; if no attempt succeeds the test is failed.
   *
   * @throws IOException if querying the cluster fails, or (as an
   *           {@link InterruptedIOException}) if the thread is interrupted while waiting
   */
  private void assertRegionsAreBalanced() throws IOException {
    // TODO: Fix this test.  Old balancer used to run with 'slop'.  New
    // balancer does not.
    float slop = UTIL.getConfiguration().getFloat("hbase.regions.slop", 0.1f);
    if (slop <= 0) {
      slop = 1;
    }

    for (int attempt = 0; attempt < MAX_BALANCE_ATTEMPTS; attempt++) {
      // make sure all the regions are reassigned before we test balance
      waitForAllRegionsAssigned();

      long regionCount = UTIL.getMiniHBaseCluster().countServedRegions();
      List<HRegionServer> servers = getOnlineRegionServers();
      double avg = (double) regionCount / (double) servers.size();
      int avgLoadPlusSlop = (int) Math.ceil(avg * (1 + slop));
      int avgLoadMinusSlop = (int) Math.floor(avg * (1 - slop)) - 1;
      // Increase the margin a little to accommodate StochasticLoadBalancer
      if (this.balancerName.contains("StochasticLoadBalancer")) {
        avgLoadPlusSlop++;
        avgLoadMinusSlop--;
      }
      LOG.debug("There are {} servers and {} regions. Load Average: {} low border: {}"
          + ", up border: {}; attempt: {}", servers.size(), regionCount, avg,
          avgLoadMinusSlop, avgLoadPlusSlop, attempt);

      boolean balanced = true;
      for (HRegionServer server : servers) {
        // Fetch the online regions once so the load count and the meta-region adjustment
        // below are computed from the same snapshot.
        List<RegionInfo> onlineRegions =
            ProtobufUtil.getOnlineRegions(server.getRSRpcServices());
        int serverLoad = onlineRegions.size();
        LOG.debug("{} Avg: {} actual: {}", server.getServerName(), avg, serverLoad);
        if (!(avg > 2.0 && serverLoad <= avgLoadPlusSlop
            && serverLoad >= avgLoadMinusSlop)) {
          // Re-check with meta regions excluded from this server's load.
          for (RegionInfo hri : onlineRegions) {
            if (hri.isMetaRegion()) {
              serverLoad--;
            }
          }
          if (!(serverLoad <= avgLoadPlusSlop && serverLoad >= avgLoadMinusSlop)) {
            LOG.debug("{} Isn't balanced!!! Avg: {} actual: {} slop: {}",
                server.getServerName(), avg, serverLoad, slop);
            balanced = false;
            break;
          }
        }
      }

      if (balanced) {
        // all servers were balanced, so we are done.
        return;
      }

      // one or more servers are not balanced. sleep a little to give it a
      // chance to catch up. then, go back to the retry loop.
      sleepInterruptibly(BALANCE_RETRY_SLEEP_MS);
      UTIL.getHBaseCluster().getMaster().balance();
    }
    // if we get here, we used up every attempt and never got to short circuit out of
    // the retry loop, so this is a failure.
    fail("After " + MAX_BALANCE_ATTEMPTS + " attempts, region assignments were not balanced.");
  }

  /**
   * @return the region servers of the cluster's region server threads that report
   *         themselves as online
   */
  private List<HRegionServer> getOnlineRegionServers() {
    List<HRegionServer> list = new ArrayList<>();
    for (JVMClusterUtil.RegionServerThread rst :
        UTIL.getHBaseCluster().getRegionServerThreads()) {
      if (rst.getRegionServer().isOnline()) {
        list.add(rst.getRegionServer());
      }
    }
    return list;
  }

  /**
   * Wait until all the regions are assigned: polls until the number of served regions
   * reaches {@code HBaseTestingUtility.KEYS.length}, then waits until no regions are in
   * transition.
   *
   * @throws IOException if querying the cluster fails, or (as an
   *           {@link InterruptedIOException}) if the thread is interrupted while waiting
   */
  private void waitForAllRegionsAssigned() throws IOException {
    int totalRegions = HBaseTestingUtility.KEYS.length;
    sleepInterruptibly(ASSIGNMENT_POLL_MS);
    while (UTIL.getMiniHBaseCluster().countServedRegions() < totalRegions) {
      LOG.debug("Waiting for there to be {} regions, but there are {} right now.",
          totalRegions, UTIL.getMiniHBaseCluster().countServedRegions());
      sleepInterruptibly(ASSIGNMENT_POLL_MS);
    }
    UTIL.waitUntilNoRegionsInTransition();
  }

  /**
   * Sleeps for the given time, converting an interrupt into an
   * {@link InterruptedIOException} that keeps the original exception as its cause.
   *
   * <p>The interrupt flag is intentionally not re-set: the thrown exception carries the
   * interruption to the caller, as the waiting code in this class has always done.
   *
   * @param millis how long to sleep, in milliseconds
   * @throws InterruptedIOException if the thread is interrupted while sleeping
   */
  private static void sleepInterruptibly(long millis) throws InterruptedIOException {
    try {
      Thread.sleep(millis);
    } catch (InterruptedException e) {
      InterruptedIOException iioe = new InterruptedIOException("Interrupted while waiting");
      iioe.initCause(e);
      throw iioe;
    }
  }

}