001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.rsgroup; 019 020import static org.apache.hadoop.hbase.util.Threads.sleep; 021import static org.junit.jupiter.api.Assertions.assertEquals; 022import static org.junit.jupiter.api.Assertions.assertFalse; 023import static org.junit.jupiter.api.Assertions.assertTrue; 024import static org.junit.jupiter.api.Assertions.fail; 025 026import java.io.IOException; 027import java.util.ArrayList; 028import java.util.EnumSet; 029import java.util.Iterator; 030import java.util.List; 031import java.util.Map; 032import java.util.Random; 033import java.util.Set; 034import java.util.concurrent.ThreadLocalRandom; 035import java.util.concurrent.atomic.AtomicBoolean; 036import java.util.function.Function; 037import org.apache.hadoop.hbase.ClusterMetrics.Option; 038import org.apache.hadoop.hbase.ServerName; 039import org.apache.hadoop.hbase.TableName; 040import org.apache.hadoop.hbase.Waiter; 041import org.apache.hadoop.hbase.client.RegionInfo; 042import org.apache.hadoop.hbase.constraint.ConstraintException; 043import org.apache.hadoop.hbase.master.RegionState; 044import org.apache.hadoop.hbase.master.assignment.RegionStateNode; 045import org.apache.hadoop.hbase.net.Address; 046import org.apache.hadoop.hbase.testclassification.LargeTests; 047import org.apache.hadoop.hbase.testclassification.RSGroupTests; 048import org.apache.hadoop.hbase.util.Bytes; 049import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; 050import org.apache.hadoop.hbase.util.Pair; 051import org.junit.jupiter.api.AfterAll; 052import org.junit.jupiter.api.AfterEach; 053import org.junit.jupiter.api.BeforeAll; 054import org.junit.jupiter.api.BeforeEach; 055import org.junit.jupiter.api.Tag; 056import org.junit.jupiter.api.Test; 057import org.junit.jupiter.api.TestInfo; 058import org.slf4j.Logger; 059import org.slf4j.LoggerFactory; 060 061import org.apache.hbase.thirdparty.com.google.common.base.Preconditions; 062import org.apache.hbase.thirdparty.com.google.common.collect.Sets; 063 064@Tag(RSGroupTests.TAG) 065@Tag(LargeTests.TAG) 066public class TestRSGroupsAdmin2 extends TestRSGroupsBase { 067 068 private static final Logger LOG = LoggerFactory.getLogger(TestRSGroupsAdmin2.class); 069 070 @BeforeAll 071 public static void setUp() throws Exception { 072 setUpTestBeforeClass(); 073 } 074 075 @AfterAll 076 public static void tearDown() throws Exception { 077 tearDownAfterClass(); 078 } 079 080 @BeforeEach 081 public void beforeMethod(TestInfo testInfo) throws Exception { 082 setUpBeforeMethod(testInfo); 083 } 084 085 @AfterEach 086 public void afterMethod() throws Exception { 087 tearDownAfterMethod(); 088 } 089 090 @Test 091 public void testRegionMove() throws Exception { 092 final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1); 093 final byte[] familyNameBytes = Bytes.toBytes("f"); 094 // All the regions created below will be assigned to the default group. 095 TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, 6); 096 TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { 097 @Override 098 public boolean evaluate() throws Exception { 099 List<String> regions = getTableRegionMap().get(tableName); 100 if (regions == null) { 101 return false; 102 } 103 104 return getTableRegionMap().get(tableName).size() >= 6; 105 } 106 }); 107 108 // get target region to move 109 Map<ServerName, List<String>> assignMap = getTableServerRegionMap().get(tableName); 110 String targetRegion = null; 111 for (ServerName server : assignMap.keySet()) { 112 targetRegion = assignMap.get(server).size() > 0 ? assignMap.get(server).get(0) : null; 113 if (targetRegion != null) { 114 break; 115 } 116 } 117 // get server which is not a member of new group 118 ServerName tmpTargetServer = null; 119 for (ServerName server : ADMIN.getClusterMetrics(EnumSet.of(Option.LIVE_SERVERS)) 120 .getLiveServerMetrics().keySet()) { 121 if (!newGroup.containsServer(server.getAddress())) { 122 tmpTargetServer = server; 123 break; 124 } 125 } 126 final ServerName targetServer = tmpTargetServer; 127 // move target server to group 128 ADMIN.moveServersToRSGroup(Sets.newHashSet(targetServer.getAddress()), newGroup.getName()); 129 TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { 130 @Override 131 public boolean evaluate() throws Exception { 132 return ADMIN.getRegions(targetServer).size() <= 0; 133 } 134 }); 135 136 // Lets move this region to the new group. 137 TEST_UTIL.getAdmin() 138 .move(Bytes.toBytes(RegionInfo.encodeRegionName(Bytes.toBytes(targetRegion))), targetServer); 139 TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { 140 @Override 141 public boolean evaluate() throws Exception { 142 return getTableRegionMap().get(tableName) != null 143 && getTableRegionMap().get(tableName).size() == 6 144 && ADMIN.getClusterMetrics(EnumSet.of(Option.REGIONS_IN_TRANSITION)) 145 .getRegionStatesInTransition().size() < 1; 146 } 147 }); 148 149 // verify that targetServer didn't open it 150 for (RegionInfo region : ADMIN.getRegions(targetServer)) { 151 if (targetRegion.equals(region.getRegionNameAsString())) { 152 fail("Target server opened region"); 153 } 154 } 155 } 156 157 @Test 158 public void testRegionServerMove() throws IOException, InterruptedException { 159 int initNumGroups = ADMIN.listRSGroups().size(); 160 RSGroupInfo appInfo = addGroup(getGroupName(name.getMethodName()), 1); 161 RSGroupInfo adminInfo = addGroup(getGroupName(name.getMethodName()), 1); 162 RSGroupInfo dInfo = ADMIN.getRSGroup(RSGroupInfo.DEFAULT_GROUP); 163 assertEquals(initNumGroups + 2, ADMIN.listRSGroups().size()); 164 assertEquals(1, adminInfo.getServers().size()); 165 assertEquals(1, appInfo.getServers().size()); 166 assertEquals(getNumServers() - 2, dInfo.getServers().size()); 167 ADMIN.moveServersToRSGroup(appInfo.getServers(), RSGroupInfo.DEFAULT_GROUP); 168 ADMIN.removeRSGroup(appInfo.getName()); 169 ADMIN.moveServersToRSGroup(adminInfo.getServers(), RSGroupInfo.DEFAULT_GROUP); 170 ADMIN.removeRSGroup(adminInfo.getName()); 171 assertEquals(ADMIN.listRSGroups().size(), initNumGroups); 172 } 173 174 @Test 175 public void testMoveServers() throws Exception { 176 // create groups and assign servers 177 addGroup("bar", 3); 178 ADMIN.addRSGroup("foo"); 179 180 RSGroupInfo barGroup = ADMIN.getRSGroup("bar"); 181 RSGroupInfo fooGroup = ADMIN.getRSGroup("foo"); 182 assertEquals(3, barGroup.getServers().size()); 183 assertEquals(0, fooGroup.getServers().size()); 184 185 // test fail bogus server move 186 try { 187 ADMIN.moveServersToRSGroup(Sets.newHashSet(Address.fromString("foo:9999")), "foo"); 188 fail("Bogus servers shouldn't have been successfully moved."); 189 } catch (IOException ex) { 190 String exp = "Server foo:9999 is either offline or it does not exist."; 191 String msg = "Expected '" + exp + "' in exception message: "; 192 assertTrue(ex.getMessage().contains(exp), msg + " " + ex.getMessage()); 193 } 194 195 // test success case 196 LOG.info("moving servers " + barGroup.getServers() + " to group foo"); 197 ADMIN.moveServersToRSGroup(barGroup.getServers(), fooGroup.getName()); 198 199 barGroup = ADMIN.getRSGroup("bar"); 200 fooGroup = ADMIN.getRSGroup("foo"); 201 assertEquals(0, barGroup.getServers().size()); 202 assertEquals(3, fooGroup.getServers().size()); 203 204 LOG.info("moving servers " + fooGroup.getServers() + " to group default"); 205 ADMIN.moveServersToRSGroup(fooGroup.getServers(), RSGroupInfo.DEFAULT_GROUP); 206 207 TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { 208 @Override 209 public boolean evaluate() throws Exception { 210 return getNumServers() == ADMIN.getRSGroup(RSGroupInfo.DEFAULT_GROUP).getServers().size(); 211 } 212 }); 213 214 fooGroup = ADMIN.getRSGroup("foo"); 215 assertEquals(0, fooGroup.getServers().size()); 216 217 // test group removal 218 LOG.info("Remove group " + barGroup.getName()); 219 ADMIN.removeRSGroup(barGroup.getName()); 220 assertEquals(null, ADMIN.getRSGroup(barGroup.getName())); 221 LOG.info("Remove group " + fooGroup.getName()); 222 ADMIN.removeRSGroup(fooGroup.getName()); 223 assertEquals(null, ADMIN.getRSGroup(fooGroup.getName())); 224 } 225 226 @Test 227 public void testRemoveServers() throws Exception { 228 LOG.info("testRemoveServers"); 229 final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 3); 230 Iterator<Address> iterator = newGroup.getServers().iterator(); 231 ServerName targetServer = getServerName(iterator.next()); 232 233 // remove online servers 234 try { 235 ADMIN.removeServersFromRSGroup(Sets.newHashSet(targetServer.getAddress())); 236 fail("Online servers shouldn't have been successfully removed."); 237 } catch (IOException ex) { 238 String exp = 239 "Server " + targetServer.getAddress() + " is an online server, not allowed to remove."; 240 String msg = "Expected '" + exp + "' in exception message: "; 241 assertTrue(ex.getMessage().contains(exp), msg + " " + ex.getMessage()); 242 } 243 assertTrue(newGroup.getServers().contains(targetServer.getAddress())); 244 245 // remove dead servers 246 NUM_DEAD_SERVERS = CLUSTER.getClusterMetrics().getDeadServerNames().size(); 247 try { 248 // stopping may cause an exception 249 // due to the connection loss 250 LOG.info("stopping server " + targetServer.getServerName()); 251 ADMIN.stopRegionServer(targetServer.getAddress().toString()); 252 NUM_DEAD_SERVERS++; 253 } catch (Exception e) { 254 } 255 256 // wait for stopped regionserver to dead server list 257 TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { 258 @Override 259 public boolean evaluate() throws Exception { 260 return !MASTER.getServerManager().areDeadServersInProgress() 261 && CLUSTER.getClusterMetrics().getDeadServerNames().size() == NUM_DEAD_SERVERS; 262 } 263 }); 264 265 try { 266 ADMIN.removeServersFromRSGroup(Sets.newHashSet(targetServer.getAddress())); 267 fail("Dead servers shouldn't have been successfully removed."); 268 } catch (IOException ex) { 269 String exp = "Server " + targetServer.getAddress() + " is on the dead servers list," 270 + " Maybe it will come back again, not allowed to remove."; 271 String msg = "Expected '" + exp + "' in exception message: "; 272 assertTrue(ex.getMessage().contains(exp), msg + " " + ex.getMessage()); 273 } 274 assertTrue(newGroup.getServers().contains(targetServer.getAddress())); 275 276 // remove decommissioned servers 277 List<ServerName> serversToDecommission = new ArrayList<>(); 278 targetServer = getServerName(iterator.next()); 279 assertTrue(MASTER.getServerManager().getOnlineServers().containsKey(targetServer)); 280 serversToDecommission.add(targetServer); 281 282 ADMIN.decommissionRegionServers(serversToDecommission, true); 283 assertEquals(1, ADMIN.listDecommissionedRegionServers().size()); 284 285 assertTrue(newGroup.getServers().contains(targetServer.getAddress())); 286 ADMIN.removeServersFromRSGroup(Sets.newHashSet(targetServer.getAddress())); 287 Set<Address> newGroupServers = ADMIN.getRSGroup(newGroup.getName()).getServers(); 288 assertFalse(newGroupServers.contains(targetServer.getAddress())); 289 assertEquals(2, newGroupServers.size()); 290 291 assertTrue(OBSERVER.preRemoveServersCalled); 292 assertTrue(OBSERVER.postRemoveServersCalled); 293 } 294 295 @Test 296 public void testMoveServersAndTables() throws Exception { 297 LOG.info("testMoveServersAndTables"); 298 final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1); 299 // create table 300 final byte[] familyNameBytes = Bytes.toBytes("f"); 301 TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, 5); 302 TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { 303 @Override 304 public boolean evaluate() throws Exception { 305 List<String> regions = getTableRegionMap().get(tableName); 306 if (regions == null) { 307 return false; 308 } 309 310 return getTableRegionMap().get(tableName).size() >= 5; 311 } 312 }); 313 314 // get server which is not a member of new group 315 ServerName targetServer = null; 316 for (ServerName server : ADMIN.getClusterMetrics(EnumSet.of(Option.LIVE_SERVERS)) 317 .getLiveServerMetrics().keySet()) { 318 if ( 319 !newGroup.containsServer(server.getAddress()) 320 && !ADMIN.getRSGroup("master").containsServer(server.getAddress()) 321 ) { 322 targetServer = server; 323 break; 324 } 325 } 326 327 LOG.debug("Print group info : " + ADMIN.listRSGroups()); 328 int oldDefaultGroupServerSize = ADMIN.getRSGroup(RSGroupInfo.DEFAULT_GROUP).getServers().size(); 329 int oldDefaultGroupTableSize = ADMIN.listTablesInRSGroup(RSGroupInfo.DEFAULT_GROUP).size(); 330 assertTrue(OBSERVER.preListTablesInRSGroupCalled); 331 assertTrue(OBSERVER.postListTablesInRSGroupCalled); 332 333 // test fail bogus server move 334 try { 335 ADMIN.moveServersToRSGroup(Sets.newHashSet(Address.fromString("foo:9999")), 336 newGroup.getName()); 337 ADMIN.setRSGroup(Sets.newHashSet(tableName), newGroup.getName()); 338 fail("Bogus servers shouldn't have been successfully moved."); 339 } catch (IOException ex) { 340 String exp = "Server foo:9999 is either offline or it does not exist."; 341 String msg = "Expected '" + exp + "' in exception message: "; 342 assertTrue(ex.getMessage().contains(exp), msg + " " + ex.getMessage()); 343 } 344 345 // test move when src = dst 346 ADMIN.moveServersToRSGroup(Sets.newHashSet(targetServer.getAddress()), 347 RSGroupInfo.DEFAULT_GROUP); 348 ADMIN.setRSGroup(Sets.newHashSet(tableName), RSGroupInfo.DEFAULT_GROUP); 349 350 // verify default group info 351 assertEquals(oldDefaultGroupServerSize, 352 ADMIN.getRSGroup(RSGroupInfo.DEFAULT_GROUP).getServers().size()); 353 assertEquals(oldDefaultGroupTableSize, 354 ADMIN.listTablesInRSGroup(RSGroupInfo.DEFAULT_GROUP).size()); 355 356 // verify new group info 357 assertEquals(1, ADMIN.getRSGroup(newGroup.getName()).getServers().size()); 358 assertEquals(0, 359 ADMIN.getConfiguredNamespacesAndTablesInRSGroup(newGroup.getName()).getSecond().size()); 360 assertTrue(OBSERVER.preGetConfiguredNamespacesAndTablesInRSGroupCalled); 361 assertTrue(OBSERVER.postGetConfiguredNamespacesAndTablesInRSGroupCalled); 362 363 // get all region to move targetServer 364 List<String> regionList = getTableRegionMap().get(tableName); 365 for (String region : regionList) { 366 // Lets move this region to the targetServer 367 TEST_UTIL.getAdmin().move(Bytes.toBytes(RegionInfo.encodeRegionName(Bytes.toBytes(region))), 368 targetServer); 369 } 370 371 TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { 372 @Override 373 public boolean evaluate() throws Exception { 374 return getTableRegionMap().get(tableName) != null 375 && getTableRegionMap().get(tableName).size() == 5 376 && getTableServerRegionMap().get(tableName).size() == 1 377 && ADMIN.getClusterMetrics(EnumSet.of(Option.REGIONS_IN_TRANSITION)) 378 .getRegionStatesInTransition().size() < 1; 379 } 380 }); 381 382 // verify that all region move to targetServer 383 assertEquals(5, getTableServerRegionMap().get(tableName).get(targetServer).size()); 384 385 // move targetServer and table to newGroup 386 LOG.info("moving server and table to newGroup"); 387 ADMIN.moveServersToRSGroup(Sets.newHashSet(targetServer.getAddress()), newGroup.getName()); 388 ADMIN.setRSGroup(Sets.newHashSet(tableName), newGroup.getName()); 389 390 // verify group change 391 assertEquals(newGroup.getName(), ADMIN.getRSGroup(tableName).getName()); 392 393 // verify servers' not exist in old group 394 Set<Address> defaultServers = ADMIN.getRSGroup(RSGroupInfo.DEFAULT_GROUP).getServers(); 395 assertFalse(defaultServers.contains(targetServer.getAddress())); 396 397 // verify servers' exist in new group 398 Set<Address> newGroupServers = ADMIN.getRSGroup(newGroup.getName()).getServers(); 399 assertTrue(newGroupServers.contains(targetServer.getAddress())); 400 401 // verify tables' not exist in old group 402 Set<TableName> defaultTables = 403 Sets.newHashSet(ADMIN.listTablesInRSGroup(RSGroupInfo.DEFAULT_GROUP)); 404 assertFalse(defaultTables.contains(tableName)); 405 406 // verify tables' exist in new group 407 Set<TableName> newGroupTables = Sets 408 .newHashSet(ADMIN.getConfiguredNamespacesAndTablesInRSGroup(newGroup.getName()).getSecond()); 409 assertTrue(newGroupTables.contains(tableName)); 410 411 // verify that all region still assign on targetServer 412 // TODO: uncomment after we reimplement moveServersAndTables, now the implementation is 413 // moveToRSGroup first and then moveTables, so the region will be moved to other region servers. 414 // assertEquals(5, getTableServerRegionMap().get(tableName).get(targetServer).size()); 415 416 assertTrue(OBSERVER.preMoveServersCalled); 417 assertTrue(OBSERVER.postMoveServersCalled); 418 } 419 420 @Test 421 public void testMoveServersFromDefaultGroup() throws Exception { 422 // create groups and assign servers 423 ADMIN.addRSGroup("foo"); 424 425 RSGroupInfo fooGroup = ADMIN.getRSGroup("foo"); 426 assertEquals(0, fooGroup.getServers().size()); 427 RSGroupInfo defaultGroup = ADMIN.getRSGroup(RSGroupInfo.DEFAULT_GROUP); 428 429 // test remove all servers from default 430 try { 431 ADMIN.moveServersToRSGroup(defaultGroup.getServers(), fooGroup.getName()); 432 fail(RSGroupInfoManagerImpl.KEEP_ONE_SERVER_IN_DEFAULT_ERROR_MESSAGE); 433 } catch (ConstraintException ex) { 434 assertTrue( 435 ex.getMessage().contains(RSGroupInfoManagerImpl.KEEP_ONE_SERVER_IN_DEFAULT_ERROR_MESSAGE)); 436 } 437 438 // test success case, remove one server from default ,keep at least one server 439 if (defaultGroup.getServers().size() > 1) { 440 Address serverInDefaultGroup = defaultGroup.getServers().iterator().next(); 441 LOG.info("moving server " + serverInDefaultGroup + " from group default to group " 442 + fooGroup.getName()); 443 ADMIN.moveServersToRSGroup(Sets.newHashSet(serverInDefaultGroup), fooGroup.getName()); 444 } 445 446 fooGroup = ADMIN.getRSGroup("foo"); 447 LOG.info("moving servers " + fooGroup.getServers() + " to group default"); 448 ADMIN.moveServersToRSGroup(fooGroup.getServers(), RSGroupInfo.DEFAULT_GROUP); 449 450 TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { 451 @Override 452 public boolean evaluate() throws Exception { 453 return getNumServers() == ADMIN.getRSGroup(RSGroupInfo.DEFAULT_GROUP).getServers().size(); 454 } 455 }); 456 457 fooGroup = ADMIN.getRSGroup("foo"); 458 assertEquals(0, fooGroup.getServers().size()); 459 460 // test group removal 461 LOG.info("Remove group " + fooGroup.getName()); 462 ADMIN.removeRSGroup(fooGroup.getName()); 463 assertEquals(null, ADMIN.getRSGroup(fooGroup.getName())); 464 } 465 466 @Test 467 public void testFailedMoveBeforeRetryExhaustedWhenMoveServer() throws Exception { 468 String groupName = getGroupName(name.getMethodName()); 469 ADMIN.addRSGroup(groupName); 470 final RSGroupInfo newGroup = ADMIN.getRSGroup(groupName); 471 Pair<ServerName, RegionStateNode> gotPair = createTableWithRegionSplitting(newGroup, 10); 472 473 // start thread to recover region state 474 final ServerName movedServer = gotPair.getFirst(); 475 final RegionStateNode rsn = gotPair.getSecond(); 476 AtomicBoolean changed = new AtomicBoolean(false); 477 Thread t1 = recoverRegionStateThread(movedServer, 478 server -> MASTER.getAssignmentManager().getRegionsOnServer(movedServer), rsn, changed); 479 t1.start(); 480 481 // move target server to group 482 Thread t2 = new Thread(() -> { 483 LOG.info("thread2 start running, to move regions"); 484 try { 485 ADMIN.moveServersToRSGroup(Sets.newHashSet(movedServer.getAddress()), newGroup.getName()); 486 } catch (IOException e) { 487 LOG.error("move server error", e); 488 } 489 }); 490 t2.start(); 491 492 t1.join(); 493 t2.join(); 494 495 TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { 496 @Override 497 public boolean evaluate() { 498 if (changed.get()) { 499 return MASTER.getAssignmentManager().getRegionsOnServer(movedServer).size() == 0 500 && !rsn.getRegionLocation().equals(movedServer); 501 } 502 return false; 503 } 504 }); 505 } 506 507 private <T> Thread recoverRegionStateThread(T owner, Function<T, List<RegionInfo>> getRegions, 508 RegionStateNode rsn, AtomicBoolean changed) { 509 return new Thread(() -> { 510 LOG.info("thread1 start running, will recover region state"); 511 long current = EnvironmentEdgeManager.currentTime(); 512 // wait until there is only left the region we changed state and recover its state. 513 // wait time is set according to the number of max retries, all except failed regions will be 514 // moved in one retry, and will sleep 1s until next retry. 515 while ( 516 EnvironmentEdgeManager.currentTime() - current 517 <= RSGroupInfoManagerImpl.DEFAULT_MAX_RETRY_VALUE * 1000 518 ) { 519 List<RegionInfo> regions = getRegions.apply(owner); 520 LOG.debug("server table region size is:{}", regions.size()); 521 assert regions.size() >= 1; 522 // when there is exactly one region left, we can determine the move operation encountered 523 // exception caused by the strange region state. 524 if (regions.size() == 1) { 525 assertEquals(regions.get(0).getRegionNameAsString(), 526 rsn.getRegionInfo().getRegionNameAsString()); 527 rsn.setState(RegionState.State.OPEN); 528 LOG.info("set region {} state OPEN", rsn.getRegionInfo().getRegionNameAsString()); 529 changed.set(true); 530 break; 531 } 532 sleep(5000); 533 } 534 }); 535 } 536 537 private Pair<ServerName, RegionStateNode> createTableWithRegionSplitting(RSGroupInfo rsGroupInfo, 538 int tableRegionCount) throws Exception { 539 final byte[] familyNameBytes = Bytes.toBytes("f"); 540 // All the regions created below will be assigned to the default group. 541 TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, tableRegionCount); 542 TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { 543 @Override 544 public boolean evaluate() throws Exception { 545 List<String> regions = getTableRegionMap().get(tableName); 546 if (regions == null) { 547 return false; 548 } 549 return getTableRegionMap().get(tableName).size() >= tableRegionCount; 550 } 551 }); 552 553 return randomlySetOneRegionStateToSplitting(rsGroupInfo); 554 } 555 556 /** 557 * Randomly choose a region to set state. 558 * @param newGroup target group 559 * @return source server of region, and region state 560 * @throws IOException if methods called throw 561 */ 562 private Pair<ServerName, RegionStateNode> 563 randomlySetOneRegionStateToSplitting(RSGroupInfo newGroup) throws IOException { 564 // get target server to move, which should has more than one regions 565 // randomly set a region state to SPLITTING to make move fail 566 return randomlySetRegionState(newGroup, RegionState.State.SPLITTING, tableName); 567 } 568 569 private Pair<ServerName, RegionStateNode> randomlySetRegionState(RSGroupInfo groupInfo, 570 RegionState.State state, TableName... tableNames) throws IOException { 571 Preconditions.checkArgument(tableNames.length == 1 || tableNames.length == 2, 572 "only support one or two tables"); 573 Map<TableName, Map<ServerName, List<String>>> tableServerRegionMap = getTableServerRegionMap(); 574 Map<ServerName, List<String>> assignMap = tableServerRegionMap.get(tableNames[0]); 575 if (tableNames.length == 2) { 576 Map<ServerName, List<String>> assignMap2 = tableServerRegionMap.get(tableNames[1]); 577 assignMap2.forEach((k, v) -> { 578 if (!assignMap.containsKey(k)) { 579 assignMap.remove(k); 580 } 581 }); 582 } 583 String toCorrectRegionName = null; 584 ServerName srcServer = null; 585 for (ServerName server : assignMap.keySet()) { 586 toCorrectRegionName = 587 assignMap.get(server).size() >= 1 && !groupInfo.containsServer(server.getAddress()) 588 ? assignMap.get(server).get(0) 589 : null; 590 if (toCorrectRegionName != null) { 591 srcServer = server; 592 break; 593 } 594 } 595 assert srcServer != null; 596 RegionInfo toCorrectRegionInfo = TEST_UTIL.getMiniHBaseCluster().getMaster() 597 .getAssignmentManager().getRegionInfo(Bytes.toBytesBinary(toCorrectRegionName)); 598 RegionStateNode rsn = TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager() 599 .getRegionStates().getRegionStateNode(toCorrectRegionInfo); 600 rsn.setState(state); 601 return new Pair<>(srcServer, rsn); 602 } 603 604 @Test 605 public void testFailedMoveServersAndRepair() throws Exception { 606 // This UT calls moveToRSGroup() twice to test the idempotency of it. 607 // The first time, movement fails because a region is made in SPLITTING state 608 // which will not be moved. 609 // The second time, the region state is OPEN and check if all 610 // regions on target group servers after the call. 611 final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1); 612 613 // create table 614 // randomly set a region state to SPLITTING to make move abort 615 Pair<ServerName, RegionStateNode> gotPair = 616 createTableWithRegionSplitting(newGroup, ThreadLocalRandom.current().nextInt(8) + 4); 617 RegionStateNode rsn = gotPair.getSecond(); 618 ServerName srcServer = rsn.getRegionLocation(); 619 620 // move server to newGroup and check regions 621 try { 622 ADMIN.moveServersToRSGroup(Sets.newHashSet(srcServer.getAddress()), newGroup.getName()); 623 fail("should get IOException when retry exhausted but there still exists failed moved " 624 + "regions"); 625 } catch (Exception e) { 626 assertTrue( 627 e.getMessage().contains(gotPair.getSecond().getRegionInfo().getRegionNameAsString())); 628 } 629 for (RegionInfo regionInfo : MASTER.getAssignmentManager().getAssignedRegions()) { 630 if (regionInfo.getTable().equals(tableName) && regionInfo.equals(rsn.getRegionInfo())) { 631 assertEquals( 632 MASTER.getAssignmentManager().getRegionStates().getRegionServerOfRegion(regionInfo), 633 srcServer); 634 } 635 } 636 637 // retry move server to newGroup and check if all regions on srcServer was moved 638 rsn.setState(RegionState.State.OPEN); 639 ADMIN.moveServersToRSGroup(Sets.newHashSet(srcServer.getAddress()), newGroup.getName()); 640 assertEquals(MASTER.getAssignmentManager().getRegionsOnServer(srcServer).size(), 0); 641 } 642 643 @Test 644 public void testFailedMoveServersTablesAndRepair() throws Exception { 645 // This UT calls moveTablesAndServers() twice to test the idempotency of it. 646 // The first time, movement fails because a region is made in SPLITTING state 647 // which will not be moved. 648 // The second time, the region state is OPEN and check if all 649 // regions on target group servers after the call. 650 final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1); 651 // create table 652 final byte[] familyNameBytes = Bytes.toBytes("f"); 653 TableName table1 = TableName.valueOf(tableName.getNameAsString() + "_1"); 654 TableName table2 = TableName.valueOf(tableName.getNameAsString() + "_2"); 655 Random rand = ThreadLocalRandom.current(); 656 TEST_UTIL.createMultiRegionTable(table1, familyNameBytes, rand.nextInt(12) + 4); 657 TEST_UTIL.createMultiRegionTable(table2, familyNameBytes, rand.nextInt(12) + 4); 658 659 // randomly set a region state to SPLITTING to make move abort 660 Pair<ServerName, RegionStateNode> gotPair = 661 randomlySetRegionState(newGroup, RegionState.State.SPLITTING, table1, table2); 662 RegionStateNode rsn = gotPair.getSecond(); 663 ServerName srcServer = rsn.getRegionLocation(); 664 665 // move server and table to newGroup and check regions 666 try { 667 ADMIN.moveServersToRSGroup(Sets.newHashSet(srcServer.getAddress()), newGroup.getName()); 668 ADMIN.setRSGroup(Sets.newHashSet(table2), newGroup.getName()); 669 fail("should get IOException when retry exhausted but there still exists failed moved " 670 + "regions"); 671 } catch (Exception e) { 672 assertTrue( 673 e.getMessage().contains(gotPair.getSecond().getRegionInfo().getRegionNameAsString())); 674 } 675 for (RegionInfo regionInfo : MASTER.getAssignmentManager().getAssignedRegions()) { 676 if (regionInfo.getTable().equals(table1) && regionInfo.equals(rsn.getRegionInfo())) { 677 assertEquals( 678 MASTER.getAssignmentManager().getRegionStates().getRegionServerOfRegion(regionInfo), 679 srcServer); 680 } 681 } 682 683 // retry moveServersAndTables to newGroup and check if all regions on srcServer belongs to 684 // table2 685 rsn.setState(RegionState.State.OPEN); 686 ADMIN.moveServersToRSGroup(Sets.newHashSet(srcServer.getAddress()), newGroup.getName()); 687 ADMIN.setRSGroup(Sets.newHashSet(table2), newGroup.getName()); 688 for (RegionInfo regionsInfo : MASTER.getAssignmentManager().getRegionsOnServer(srcServer)) { 689 assertEquals(regionsInfo.getTable(), table2); 690 } 691 } 692 693 @Test 694 public void testMoveServersToRSGroupPerformance() throws Exception { 695 final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 2); 696 final byte[] familyNameBytes = Bytes.toBytes("f"); 697 // there will be 100 regions are both the serves 698 final int tableRegionCount = 200; 699 // All the regions created below will be assigned to the default group. 700 TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, tableRegionCount); 701 TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { 702 @Override 703 public boolean evaluate() throws Exception { 704 List<String> regions = getTableRegionMap().get(tableName); 705 if (regions == null) { 706 return false; 707 } 708 return getTableRegionMap().get(tableName).size() >= tableRegionCount; 709 } 710 }); 711 ADMIN.setRSGroup(Sets.newHashSet(tableName), newGroup.getName()); 712 TEST_UTIL.waitUntilAllRegionsAssigned(tableName); 713 String rsGroup2 = "rsGroup2"; 714 ADMIN.addRSGroup(rsGroup2); 715 716 long startTime = EnvironmentEdgeManager.currentTime(); 717 ADMIN.moveServersToRSGroup(Sets.newHashSet(newGroup.getServers().iterator().next()), rsGroup2); 718 long timeTaken = EnvironmentEdgeManager.currentTime() - startTime; 719 String msg = 720 "Should not take mote than 15000 ms to move a table with 100 regions. Time taken =" 721 + timeTaken + " ms"; 722 // This test case is meant to be used for verifying the performance quickly by a developer. 723 // Moving 100 regions takes much less than 15000 ms. Given 15000 ms so test cases passes 724 // on all environment. 725 assertTrue(timeTaken < 15000, msg); 726 LOG.info("Time taken to move a table with 100 region is {} ms", timeTaken); 727 } 728}