001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.rsgroup; 019 020import static org.junit.jupiter.api.Assertions.assertEquals; 021import static org.junit.jupiter.api.Assertions.assertFalse; 022 023import org.apache.hadoop.hbase.HBaseClusterInterface; 024import org.apache.hadoop.hbase.HBaseTestingUtil; 025import org.apache.hadoop.hbase.SingleProcessHBaseCluster; 026import org.apache.hadoop.hbase.StartTestingClusterOption; 027import org.apache.hadoop.hbase.TableName; 028import org.apache.hadoop.hbase.Waiter; 029import org.apache.hadoop.hbase.client.Admin; 030import org.apache.hadoop.hbase.client.RegionInfo; 031import org.apache.hadoop.hbase.master.HMaster; 032import org.apache.hadoop.hbase.master.ServerManager; 033import org.apache.hadoop.hbase.regionserver.HRegionServer; 034import org.apache.hadoop.hbase.testclassification.MediumTests; 035import org.apache.hadoop.hbase.testclassification.RSGroupTests; 036import org.apache.hadoop.hbase.util.Bytes; 037import org.junit.jupiter.api.AfterAll; 038import org.junit.jupiter.api.BeforeAll; 039import org.junit.jupiter.api.Tag; 040import org.junit.jupiter.api.Test; 041import org.junit.jupiter.api.TestInfo; 042import org.slf4j.Logger; 043import org.slf4j.LoggerFactory; 044 045import org.apache.hbase.thirdparty.com.google.common.collect.Sets; 046 047/** 048 * This tests that GroupBasedBalancer will use data in zk to do balancing during master startup. 049 * This does not test retain assignment. 050 * <p/> 051 * The tests brings up 3 RS, creates a new RS group 'my_group', moves 1 RS to 'my_group', assigns 052 * 'hbase:rsgroup' to 'my_group', and kill the only server in that group so that 'hbase:rsgroup' 053 * table isn't available. It then kills the active master and waits for backup master to come 054 * online. In new master, RSGroupInfoManagerImpl gets the data from zk and waits for the expected 055 * assignment with a timeout. 056 */ 057@Tag(RSGroupTests.TAG) 058@Tag(MediumTests.TAG) 059public class TestRSGroupsOfflineMode extends TestRSGroupsBase { 060 061 private static final Logger LOG = LoggerFactory.getLogger(TestRSGroupsOfflineMode.class); 062 private static HMaster master; 063 private static Admin hbaseAdmin; 064 private static HBaseTestingUtil TEST_UTIL; 065 private static HBaseClusterInterface cluster; 066 private final static long WAIT_TIMEOUT = 60000 * 5; 067 068 @BeforeAll 069 public static void setUp() throws Exception { 070 TEST_UTIL = new HBaseTestingUtil(); 071 RSGroupUtil.enableRSGroup(TEST_UTIL.getConfiguration()); 072 TEST_UTIL.getConfiguration().set(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, "1"); 073 StartTestingClusterOption option = 074 StartTestingClusterOption.builder().numMasters(2).numRegionServers(3).numDataNodes(3).build(); 075 TEST_UTIL.startMiniCluster(option); 076 cluster = TEST_UTIL.getHBaseCluster(); 077 master = ((SingleProcessHBaseCluster) cluster).getMaster(); 078 master.balanceSwitch(false); 079 hbaseAdmin = TEST_UTIL.getAdmin(); 080 // wait till the balancer is in online mode 081 TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { 082 @Override 083 public boolean evaluate() throws Exception { 084 return master.isInitialized() 085 && ((RSGroupBasedLoadBalancer) master.getLoadBalancer()).isOnline() 086 && master.getServerManager().getOnlineServersList().size() >= 3; 087 } 088 }); 089 } 090 091 @AfterAll 092 public static void tearDown() throws Exception { 093 TEST_UTIL.shutdownMiniCluster(); 094 } 095 096 @Test 097 public void testOffline(TestInfo testInfo) throws Exception, InterruptedException { 098 // Table should be after group table name so it gets assigned later. 099 final TableName failoverTable = 100 TableName.valueOf(getNameWithoutIndex(testInfo.getTestMethod().get().getName())); 101 TEST_UTIL.createTable(failoverTable, Bytes.toBytes("f")); 102 final HRegionServer killRS = ((SingleProcessHBaseCluster) cluster).getRegionServer(0); 103 final HRegionServer groupRS = ((SingleProcessHBaseCluster) cluster).getRegionServer(1); 104 final HRegionServer failoverRS = ((SingleProcessHBaseCluster) cluster).getRegionServer(2); 105 String newGroup = "my_group"; 106 Admin admin = TEST_UTIL.getAdmin(); 107 admin.addRSGroup(newGroup); 108 if ( 109 master.getAssignmentManager().getRegionStates().getRegionAssignments() 110 .containsValue(failoverRS.getServerName()) 111 ) { 112 for (RegionInfo regionInfo : hbaseAdmin.getRegions(failoverRS.getServerName())) { 113 hbaseAdmin.move(regionInfo.getEncodedNameAsBytes(), failoverRS.getServerName()); 114 } 115 LOG.info("Waiting for region unassignments on failover RS..."); 116 TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { 117 @Override 118 public boolean evaluate() throws Exception { 119 return !master.getServerManager().getLoad(failoverRS.getServerName()).getRegionMetrics() 120 .isEmpty(); 121 } 122 }); 123 } 124 125 // Move server to group and make sure all tables are assigned. 126 admin.moveServersToRSGroup(Sets.newHashSet(groupRS.getServerName().getAddress()), newGroup); 127 TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { 128 @Override 129 public boolean evaluate() throws Exception { 130 return groupRS.getNumberOfOnlineRegions() < 1 131 && master.getAssignmentManager().getRegionsInTransitionCount() < 1; 132 } 133 }); 134 // Move table to group and wait. 135 admin.setRSGroup(Sets.newHashSet(RSGroupInfoManagerImpl.RSGROUP_TABLE_NAME), newGroup); 136 LOG.info("Waiting for move table..."); 137 TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { 138 @Override 139 public boolean evaluate() throws Exception { 140 return groupRS.getNumberOfOnlineRegions() == 1; 141 } 142 }); 143 144 groupRS.stop("die"); 145 // Race condition here. 146 TEST_UTIL.getHBaseCluster().getMaster().stopMaster(); 147 LOG.info("Waiting for offline mode..."); 148 TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { 149 @Override 150 public boolean evaluate() throws Exception { 151 return TEST_UTIL.getHBaseCluster().getMaster() != null 152 && TEST_UTIL.getHBaseCluster().getMaster().isActiveMaster() 153 && TEST_UTIL.getHBaseCluster().getMaster().isInitialized() 154 && TEST_UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServers().size() 155 <= 3; 156 } 157 }); 158 159 // Get groupInfoManager from the new active master. 160 RSGroupInfoManager groupMgr = 161 ((SingleProcessHBaseCluster) cluster).getMaster().getRSGroupInfoManager(); 162 // Make sure balancer is in offline mode, since this is what we're testing. 163 assertFalse(groupMgr.isOnline()); 164 // Kill final regionserver to see the failover happens for all tables except GROUP table since 165 // it's group does not have any online RS. 166 killRS.stop("die"); 167 master = TEST_UTIL.getHBaseCluster().getMaster(); 168 LOG.info("Waiting for new table assignment..."); 169 TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { 170 @Override 171 public boolean evaluate() throws Exception { 172 return failoverRS.getRegions(failoverTable).size() >= 1; 173 } 174 }); 175 assertEquals(0, failoverRS.getRegions(RSGroupInfoManagerImpl.RSGROUP_TABLE_NAME).size()); 176 177 // Need this for minicluster to shutdown cleanly. 178 master.stopMaster(); 179 } 180}