001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master; 019 020import static org.junit.Assert.assertEquals; 021 022import java.io.IOException; 023import java.util.List; 024import java.util.NavigableSet; 025import java.util.Set; 026import java.util.TreeSet; 027import org.apache.hadoop.conf.Configuration; 028import org.apache.hadoop.hbase.HBaseClassTestRule; 029import org.apache.hadoop.hbase.HBaseConfiguration; 030import org.apache.hadoop.hbase.HBaseTestingUtility; 031import org.apache.hadoop.hbase.MiniHBaseCluster; 032import org.apache.hadoop.hbase.ServerName; 033import org.apache.hadoop.hbase.TableName; 034import org.apache.hadoop.hbase.client.RegionInfo; 035import org.apache.hadoop.hbase.client.RegionLocator; 036import org.apache.hadoop.hbase.client.Table; 037import org.apache.hadoop.hbase.testclassification.LargeTests; 038import org.apache.hadoop.hbase.testclassification.MasterTests; 039import org.apache.hadoop.hbase.util.Bytes; 040import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; 041import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread; 042import org.junit.ClassRule; 043import org.junit.Rule; 044import org.junit.Test; 045import org.junit.experimental.categories.Category; 046import org.junit.rules.TestName; 047import org.slf4j.Logger; 048import org.slf4j.LoggerFactory; 049 050import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; 051 052/** 053 * Tests the restarting of everything as done during rolling restarts. 054 */ 055@Category({MasterTests.class, LargeTests.class}) 056public class TestRollingRestart { 057 058 @ClassRule 059 public static final HBaseClassTestRule CLASS_RULE = 060 HBaseClassTestRule.forClass(TestRollingRestart.class); 061 062 private static final Logger LOG = LoggerFactory.getLogger(TestRollingRestart.class); 063 064 @Rule 065 public TestName name = new TestName(); 066 067 @Test 068 public void testBasicRollingRestart() throws Exception { 069 070 // Start a cluster with 2 masters and 4 regionservers 071 final int NUM_MASTERS = 2; 072 final int NUM_RS = 3; 073 final int NUM_REGIONS_TO_CREATE = 20; 074 075 int expectedNumRS = 3; 076 077 // Start the cluster 078 log("Starting cluster"); 079 Configuration conf = HBaseConfiguration.create(); 080 HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf); 081 TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS); 082 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); 083 log("Waiting for active/ready master"); 084 cluster.waitForActiveAndReadyMaster(); 085 086 // Create a table with regions 087 final TableName tableName = TableName.valueOf(name.getMethodName()); 088 byte [] family = Bytes.toBytes("family"); 089 log("Creating table with " + NUM_REGIONS_TO_CREATE + " regions"); 090 Table ht = TEST_UTIL.createMultiRegionTable(tableName, family, NUM_REGIONS_TO_CREATE); 091 int numRegions = -1; 092 try (RegionLocator r = TEST_UTIL.getConnection().getRegionLocator(tableName)) { 093 numRegions = r.getStartKeys().length; 094 } 095 numRegions += 1; // catalogs 096 log("Waiting for no more RIT\n"); 097 TEST_UTIL.waitUntilNoRegionsInTransition(60000); 098 log("Disabling table\n"); 099 TEST_UTIL.getAdmin().disableTable(tableName); 100 log("Waiting for no more RIT\n"); 101 TEST_UTIL.waitUntilNoRegionsInTransition(60000); 102 NavigableSet<String> regions = HBaseTestingUtility.getAllOnlineRegions(cluster); 103 log("Verifying only catalog and namespace regions are assigned\n"); 104 if (regions.size() != 2) { 105 for (String oregion : regions) log("Region still online: " + oregion); 106 } 107 assertEquals(2, regions.size()); 108 log("Enabling table\n"); 109 TEST_UTIL.getAdmin().enableTable(tableName); 110 log("Waiting for no more RIT\n"); 111 TEST_UTIL.waitUntilNoRegionsInTransition(60000); 112 log("Verifying there are " + numRegions + " assigned on cluster\n"); 113 regions = HBaseTestingUtility.getAllOnlineRegions(cluster); 114 assertRegionsAssigned(cluster, regions); 115 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size()); 116 117 // Add a new regionserver 118 log("Adding a fourth RS"); 119 RegionServerThread restarted = cluster.startRegionServer(); 120 expectedNumRS++; 121 restarted.waitForServerOnline(); 122 log("Additional RS is online"); 123 log("Waiting for no more RIT"); 124 TEST_UTIL.waitUntilNoRegionsInTransition(60000); 125 log("Verifying there are " + numRegions + " assigned on cluster"); 126 assertRegionsAssigned(cluster, regions); 127 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size()); 128 129 // Master Restarts 130 List<MasterThread> masterThreads = cluster.getMasterThreads(); 131 MasterThread activeMaster = null; 132 MasterThread backupMaster = null; 133 assertEquals(2, masterThreads.size()); 134 if (masterThreads.get(0).getMaster().isActiveMaster()) { 135 activeMaster = masterThreads.get(0); 136 backupMaster = masterThreads.get(1); 137 } else { 138 activeMaster = masterThreads.get(1); 139 backupMaster = masterThreads.get(0); 140 } 141 142 // Bring down the backup master 143 log("Stopping backup master\n\n"); 144 backupMaster.getMaster().stop("Stop of backup during rolling restart"); 145 cluster.hbaseCluster.waitOnMaster(backupMaster); 146 147 // Bring down the primary master 148 log("Stopping primary master\n\n"); 149 activeMaster.getMaster().stop("Stop of active during rolling restart"); 150 cluster.hbaseCluster.waitOnMaster(activeMaster); 151 152 // Start primary master 153 log("Restarting primary master\n\n"); 154 activeMaster = cluster.startMaster(); 155 cluster.waitForActiveAndReadyMaster(); 156 157 // Start backup master 158 log("Restarting backup master\n\n"); 159 backupMaster = cluster.startMaster(); 160 161 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size()); 162 163 // RegionServer Restarts 164 165 // Bring them down, one at a time, waiting between each to complete 166 List<RegionServerThread> regionServers = 167 cluster.getLiveRegionServerThreads(); 168 int num = 1; 169 int total = regionServers.size(); 170 for (RegionServerThread rst : regionServers) { 171 ServerName serverName = rst.getRegionServer().getServerName(); 172 log("Stopping region server " + num + " of " + total + " [ " + 173 serverName + "]"); 174 rst.getRegionServer().stop("Stopping RS during rolling restart"); 175 cluster.hbaseCluster.waitOnRegionServer(rst); 176 log("Waiting for RS shutdown to be handled by master"); 177 waitForRSShutdownToStartAndFinish(activeMaster, serverName); 178 log("RS shutdown done, waiting for no more RIT"); 179 TEST_UTIL.waitUntilNoRegionsInTransition(60000); 180 log("Verifying there are " + numRegions + " assigned on cluster"); 181 assertRegionsAssigned(cluster, regions); 182 expectedNumRS--; 183 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size()); 184 log("Restarting region server " + num + " of " + total); 185 restarted = cluster.startRegionServer(); 186 restarted.waitForServerOnline(); 187 expectedNumRS++; 188 log("Region server " + num + " is back online"); 189 log("Waiting for no more RIT"); 190 TEST_UTIL.waitUntilNoRegionsInTransition(60000); 191 log("Verifying there are " + numRegions + " assigned on cluster"); 192 assertRegionsAssigned(cluster, regions); 193 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size()); 194 num++; 195 } 196 Thread.sleep(1000); 197 assertRegionsAssigned(cluster, regions); 198 199 // TODO: Bring random 3 of 4 RS down at the same time 200 201 ht.close(); 202 // Stop the cluster 203 TEST_UTIL.shutdownMiniCluster(); 204 } 205 206 private void waitForRSShutdownToStartAndFinish(MasterThread activeMaster, 207 ServerName serverName) throws InterruptedException { 208 ServerManager sm = activeMaster.getMaster().getServerManager(); 209 // First wait for it to be in dead list 210 while (!sm.getDeadServers().isDeadServer(serverName)) { 211 log("Waiting for [" + serverName + "] to be listed as dead in master"); 212 Thread.sleep(1); 213 } 214 log("Server [" + serverName + "] marked as dead, waiting for it to " + 215 "finish dead processing"); 216 while (sm.areDeadServersInProgress()) { 217 log("Server [" + serverName + "] still being processed, waiting"); 218 Thread.sleep(100); 219 } 220 log("Server [" + serverName + "] done with server shutdown processing"); 221 } 222 223 private void log(String msg) { 224 LOG.debug("\n\nTRR: " + msg + "\n"); 225 } 226 227 private int getNumberOfOnlineRegions(MiniHBaseCluster cluster) { 228 int numFound = 0; 229 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) { 230 numFound += rst.getRegionServer().getNumberOfOnlineRegions(); 231 } 232 for (MasterThread mt : cluster.getMasterThreads()) { 233 numFound += mt.getMaster().getNumberOfOnlineRegions(); 234 } 235 return numFound; 236 } 237 238 private void assertRegionsAssigned(MiniHBaseCluster cluster, 239 Set<String> expectedRegions) throws IOException { 240 int numFound = getNumberOfOnlineRegions(cluster); 241 if (expectedRegions.size() > numFound) { 242 log("Expected to find " + expectedRegions.size() + " but only found" 243 + " " + numFound); 244 NavigableSet<String> foundRegions = 245 HBaseTestingUtility.getAllOnlineRegions(cluster); 246 for (String region : expectedRegions) { 247 if (!foundRegions.contains(region)) { 248 log("Missing region: " + region); 249 } 250 } 251 assertEquals(expectedRegions.size(), numFound); 252 } else if (expectedRegions.size() < numFound) { 253 int doubled = numFound - expectedRegions.size(); 254 log("Expected to find " + expectedRegions.size() + " but found" 255 + " " + numFound + " (" + doubled + " double assignments?)"); 256 NavigableSet<String> doubleRegions = getDoubleAssignedRegions(cluster); 257 for (String region : doubleRegions) { 258 log("Region is double assigned: " + region); 259 } 260 assertEquals(expectedRegions.size(), numFound); 261 } else { 262 log("Success! Found expected number of " + numFound + " regions"); 263 } 264 } 265 266 private NavigableSet<String> getDoubleAssignedRegions( 267 MiniHBaseCluster cluster) throws IOException { 268 NavigableSet<String> online = new TreeSet<>(); 269 NavigableSet<String> doubled = new TreeSet<>(); 270 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) { 271 for (RegionInfo region : ProtobufUtil.getOnlineRegions( 272 rst.getRegionServer().getRSRpcServices())) { 273 if(!online.add(region.getRegionNameAsString())) { 274 doubled.add(region.getRegionNameAsString()); 275 } 276 } 277 } 278 return doubled; 279 } 280 281 282} 283