001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase; 019 020import static org.junit.jupiter.api.Assertions.assertEquals; 021import static org.junit.jupiter.api.Assertions.assertTrue; 022import static org.junit.jupiter.api.Assertions.fail; 023 024import java.io.IOException; 025import java.io.InterruptedIOException; 026import java.util.ArrayList; 027import java.util.Arrays; 028import java.util.List; 029import java.util.stream.Stream; 030import org.apache.hadoop.hbase.client.Admin; 031import org.apache.hadoop.hbase.client.BalanceResponse; 032import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder; 033import org.apache.hadoop.hbase.client.Connection; 034import org.apache.hadoop.hbase.client.ConnectionFactory; 035import org.apache.hadoop.hbase.client.RegionInfo; 036import org.apache.hadoop.hbase.client.RegionLocator; 037import org.apache.hadoop.hbase.client.TableDescriptor; 038import org.apache.hadoop.hbase.client.TableDescriptorBuilder; 039import org.apache.hadoop.hbase.regionserver.HRegionServer; 040import org.apache.hadoop.hbase.testclassification.FlakeyTests; 041import org.apache.hadoop.hbase.testclassification.LargeTests; 042import org.apache.hadoop.hbase.util.Bytes; 043import org.apache.hadoop.hbase.util.JVMClusterUtil; 044import org.apache.hadoop.hbase.util.Threads; 045import org.junit.jupiter.api.AfterEach; 046import org.junit.jupiter.api.BeforeEach; 047import org.junit.jupiter.api.Tag; 048import org.junit.jupiter.api.TestTemplate; 049import org.junit.jupiter.params.provider.Arguments; 050import org.slf4j.Logger; 051import org.slf4j.LoggerFactory; 052 053import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; 054 055/** 056 * Test whether region re-balancing works. (HBASE-71) The test only works for cluster wide 057 * balancing, not per table wide. Increase the margin a little to make StochasticLoadBalancer result 058 * acceptable. 059 */ 060@Tag(FlakeyTests.TAG) 061@Tag(LargeTests.TAG) 062@HBaseParameterizedTestTemplate(name = "{index}: balancer = {0}") 063public class TestRegionRebalancing { 064 065 public static Stream<Arguments> parameters() { 066 return Stream.of(Arguments.of("org.apache.hadoop.hbase.master.balancer.SimpleLoadBalancer"), 067 Arguments.of("org.apache.hadoop.hbase.master.balancer.StochasticLoadBalancer")); 068 } 069 070 private static final byte[] FAMILY_NAME = Bytes.toBytes("col"); 071 private static final Logger LOG = LoggerFactory.getLogger(TestRegionRebalancing.class); 072 private final HBaseTestingUtil UTIL = new HBaseTestingUtil(); 073 private RegionLocator regionLocator; 074 private TableDescriptor tableDescriptor; 075 private String balancerName; 076 077 public TestRegionRebalancing(String balancerName) { 078 this.balancerName = balancerName; 079 080 } 081 082 @AfterEach 083 public void after() throws Exception { 084 UTIL.shutdownMiniCluster(); 085 } 086 087 @BeforeEach 088 public void before() throws Exception { 089 UTIL.getConfiguration().set("hbase.master.loadbalancer.class", this.balancerName); 090 // set minCostNeedBalance to 0, make sure balancer run 091 UTIL.startMiniCluster(1); 092 093 this.tableDescriptor = TableDescriptorBuilder.newBuilder(TableName.valueOf("test")) 094 .setColumnFamily(ColumnFamilyDescriptorBuilder.of(FAMILY_NAME)).build(); 095 } 096 097 /** 098 * For HBASE-71. Try a few different configurations of starting and stopping region servers to see 099 * if the assignment or regions is pretty balanced. 100 */ 101 @TestTemplate 102 public void testRebalanceOnRegionServerNumberChange() throws IOException, InterruptedException { 103 try (Connection connection = ConnectionFactory.createConnection(UTIL.getConfiguration()); 104 Admin admin = connection.getAdmin()) { 105 admin.createTable(this.tableDescriptor, 106 Arrays.copyOfRange(HBaseTestingUtil.KEYS, 1, HBaseTestingUtil.KEYS.length)); 107 this.regionLocator = connection.getRegionLocator(this.tableDescriptor.getTableName()); 108 109 MetaTableAccessor.fullScanMetaAndPrint(admin.getConnection()); 110 111 assertEquals(HBaseTestingUtil.KEYS.length, this.regionLocator.getStartKeys().length, 112 "Test table should have right number of regions"); 113 114 // verify that the region assignments are balanced to start out 115 assertRegionsAreBalanced(); 116 117 // add a region server - total of 2 118 LOG.info("Started second server=" 119 + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName()); 120 UTIL.getHBaseCluster().getMaster().balance(); 121 assertRegionsAreBalanced(); 122 123 // On a balanced cluster, calling balance() should return true 124 BalanceResponse response = UTIL.getHBaseCluster().getMaster().balance(); 125 assertTrue(response.isBalancerRan()); 126 assertEquals(0, response.getMovesCalculated()); 127 assertEquals(0, response.getMovesExecuted()); 128 129 // if we add a server, then the balance() call should return true 130 // add a region server - total of 3 131 LOG.info("Started third server=" 132 + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName()); 133 waitForAllRegionsAssigned(); 134 135 response = UTIL.getHBaseCluster().getMaster().balance(); 136 assertTrue(response.isBalancerRan()); 137 assertTrue(response.getMovesCalculated() > 0); 138 assertEquals(response.getMovesCalculated(), response.getMovesExecuted()); 139 assertRegionsAreBalanced(); 140 141 // kill a region server - total of 2 142 LOG.info("Stopped third server=" + UTIL.getHBaseCluster().stopRegionServer(2, false)); 143 UTIL.getHBaseCluster().waitOnRegionServer(2); 144 waitOnCrashProcessing(); 145 UTIL.getHBaseCluster().getMaster().balance(); 146 assertRegionsAreBalanced(); 147 148 // start two more region servers - total of 4 149 LOG.info("Readding third server=" 150 + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName()); 151 LOG.info("Added fourth server=" 152 + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName()); 153 waitOnCrashProcessing(); 154 waitForAllRegionsAssigned(); 155 156 response = UTIL.getHBaseCluster().getMaster().balance(); 157 assertTrue(response.isBalancerRan()); 158 assertTrue(response.getMovesCalculated() > 0); 159 assertEquals(response.getMovesCalculated(), response.getMovesExecuted()); 160 161 assertRegionsAreBalanced(); 162 for (int i = 0; i < 6; i++) { 163 LOG.info("Adding " + (i + 5) + "th region server"); 164 UTIL.getHBaseCluster().startRegionServer(); 165 } 166 waitForAllRegionsAssigned(); 167 168 response = UTIL.getHBaseCluster().getMaster().balance(); 169 assertTrue(response.isBalancerRan()); 170 assertTrue(response.getMovesCalculated() > 0); 171 assertEquals(response.getMovesCalculated(), response.getMovesExecuted()); 172 173 assertRegionsAreBalanced(); 174 regionLocator.close(); 175 } 176 } 177 178 /** 179 * Wait on crash processing. Balancer won't run if processing a crashed server. 180 */ 181 private void waitOnCrashProcessing() throws IOException { 182 while (UTIL.getHBaseCluster().getMaster().getServerManager().areDeadServersInProgress()) { 183 LOG.info("Waiting on processing of crashed server before proceeding..."); 184 Threads.sleep(1000); 185 } 186 } 187 188 /** 189 * Determine if regions are balanced. Figure out the total, divide by the number of online 190 * servers, then test if each server is +/- 1 of average rounded up. 191 */ 192 private void assertRegionsAreBalanced() throws IOException { 193 // TODO: Fix this test. Old balancer used to run with 'slop'. New 194 // balancer does not. 195 boolean success = false; 196 float slop = (float) UTIL.getConfiguration().getFloat("hbase.regions.slop", 0.1f); 197 if (slop <= 0) slop = 1; 198 199 for (int i = 0; i < 5; i++) { 200 success = true; 201 // make sure all the regions are reassigned before we test balance 202 waitForAllRegionsAssigned(); 203 204 long regionCount = UTIL.getMiniHBaseCluster().countServedRegions(); 205 List<HRegionServer> servers = getOnlineRegionServers(); 206 double avg = (double) regionCount / (double) servers.size(); 207 int avgLoadPlusSlop = (int) Math.ceil(avg * (1 + slop)); 208 int avgLoadMinusSlop = (int) Math.floor(avg * (1 - slop)) - 1; 209 // Increase the margin a little to accommodate StochasticLoadBalancer 210 if (this.balancerName.contains("StochasticLoadBalancer")) { 211 avgLoadPlusSlop++; 212 avgLoadMinusSlop--; 213 } 214 LOG.debug("There are " + servers.size() + " servers and " + regionCount 215 + " regions. Load Average: " + avg + " low border: " + avgLoadMinusSlop + ", up border: " 216 + avgLoadPlusSlop + "; attempt: " + i); 217 218 for (HRegionServer server : servers) { 219 int serverLoad = ProtobufUtil.getOnlineRegions(server.getRSRpcServices()).size(); 220 LOG.debug(server.getServerName() + " Avg: " + avg + " actual: " + serverLoad); 221 if (!(avg > 2.0 && serverLoad <= avgLoadPlusSlop && serverLoad >= avgLoadMinusSlop)) { 222 for (RegionInfo hri : ProtobufUtil.getOnlineRegions(server.getRSRpcServices())) { 223 if (hri.isMetaRegion()) serverLoad--; 224 // LOG.debug(hri.getRegionNameAsString()); 225 } 226 if (!(serverLoad <= avgLoadPlusSlop && serverLoad >= avgLoadMinusSlop)) { 227 LOG.debug(server.getServerName() + " Isn't balanced!!! Avg: " + avg + " actual: " 228 + serverLoad + " slop: " + slop); 229 success = false; 230 break; 231 } 232 } 233 } 234 235 if (!success) { 236 // one or more servers are not balanced. sleep a little to give it a 237 // chance to catch up. then, go back to the retry loop. 238 try { 239 Thread.sleep(10000); 240 } catch (InterruptedException e) { 241 } 242 243 UTIL.getHBaseCluster().getMaster().balance(); 244 continue; 245 } 246 247 // if we get here, all servers were balanced, so we should just return. 248 return; 249 } 250 // if we get here, we tried 5 times and never got to short circuit out of 251 // the retry loop, so this is a failure. 252 fail("After 5 attempts, region assignments were not balanced."); 253 } 254 255 private List<HRegionServer> getOnlineRegionServers() { 256 List<HRegionServer> list = new ArrayList<>(); 257 for (JVMClusterUtil.RegionServerThread rst : UTIL.getHBaseCluster().getRegionServerThreads()) { 258 if (rst.getRegionServer().isOnline()) { 259 list.add(rst.getRegionServer()); 260 } 261 } 262 return list; 263 } 264 265 /** 266 * Wait until all the regions are assigned. 267 */ 268 private void waitForAllRegionsAssigned() throws IOException { 269 int totalRegions = HBaseTestingUtil.KEYS.length; 270 try { 271 Thread.sleep(200); 272 } catch (InterruptedException e) { 273 throw new InterruptedIOException(); 274 } 275 while (UTIL.getMiniHBaseCluster().countServedRegions() < totalRegions) { 276 // while (!cluster.getMaster().allRegionsAssigned()) { 277 LOG.debug("Waiting for there to be " + totalRegions + " regions, but there are " 278 + UTIL.getMiniHBaseCluster().countServedRegions() + " right now."); 279 try { 280 Thread.sleep(200); 281 } catch (InterruptedException e) { 282 throw new InterruptedIOException(); 283 } 284 } 285 UTIL.waitUntilNoRegionsInTransition(); 286 } 287 288}