001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.regionserver; 019 020import static org.junit.Assert.assertEquals; 021import static org.junit.Assert.assertTrue; 022 023import java.io.IOException; 024import java.util.List; 025import java.util.Map; 026import java.util.concurrent.atomic.AtomicBoolean; 027import java.util.concurrent.atomic.AtomicReference; 028import org.apache.hadoop.conf.Configuration; 029import org.apache.hadoop.hbase.HBaseClassTestRule; 030import org.apache.hadoop.hbase.HBaseConfiguration; 031import org.apache.hadoop.hbase.HBaseTestingUtility; 032import org.apache.hadoop.hbase.LocalHBaseCluster; 033import org.apache.hadoop.hbase.MiniHBaseCluster; 034import org.apache.hadoop.hbase.ServerName; 035import org.apache.hadoop.hbase.client.RegionInfo; 036import org.apache.hadoop.hbase.master.HMaster; 037import org.apache.hadoop.hbase.master.LoadBalancer; 038import org.apache.hadoop.hbase.master.ServerListener; 039import org.apache.hadoop.hbase.master.ServerManager; 040import org.apache.hadoop.hbase.testclassification.MediumTests; 041import org.apache.hadoop.hbase.testclassification.RegionServerTests; 042import org.apache.hadoop.hbase.util.Bytes; 043import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; 044import org.apache.hadoop.hbase.util.Threads; 045import org.junit.ClassRule; 046import org.junit.Ignore; 047import org.junit.Rule; 048import org.junit.Test; 049import org.junit.experimental.categories.Category; 050import org.junit.rules.TestName; 051import org.slf4j.Logger; 052import org.slf4j.LoggerFactory; 053 054import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionServerStartupResponse; 055 056/** 057 * Tests that a regionserver that dies after reporting for duty gets removed 058 * from list of online regions. See HBASE-9593. 059 */ 060@Category({RegionServerTests.class, MediumTests.class}) 061@Ignore("See HBASE-19515") 062public class TestRSKilledWhenInitializing { 063 064 @ClassRule 065 public static final HBaseClassTestRule CLASS_RULE = 066 HBaseClassTestRule.forClass(TestRSKilledWhenInitializing.class); 067 068 private static final Logger LOG = LoggerFactory.getLogger(TestRSKilledWhenInitializing.class); 069 070 @Rule 071 public TestName testName = new TestName(); 072 073 // This boolean needs to be globally available. It is used below in our 074 // mocked up regionserver so it knows when to die. 075 private static AtomicBoolean masterActive = new AtomicBoolean(false); 076 // Ditto for this variable. It also is used in the mocked regionserver class. 077 private static final AtomicReference<ServerName> killedRS = new AtomicReference<ServerName>(); 078 079 private static final int NUM_MASTERS = 1; 080 private static final int NUM_RS = 2; 081 082 /** 083 * Test verifies whether a region server is removed from online servers list in master if it went 084 * down after registering with master. Test will TIMEOUT if an error!!!! 085 * @throws Exception 086 */ 087 @Test 088 public void testRSTerminationAfterRegisteringToMasterBeforeCreatingEphemeralNode() 089 throws Exception { 090 // Create config to use for this cluster 091 Configuration conf = HBaseConfiguration.create(); 092 conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1); 093 // Start the cluster 094 final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf); 095 TEST_UTIL.startMiniDFSCluster(3); 096 TEST_UTIL.startMiniZKCluster(); 097 TEST_UTIL.createRootDir(); 098 final LocalHBaseCluster cluster = 099 new LocalHBaseCluster(conf, NUM_MASTERS, NUM_RS, HMaster.class, 100 RegisterAndDieRegionServer.class); 101 final MasterThread master = startMaster(cluster.getMasters().get(0)); 102 try { 103 // Master is up waiting on RegionServers to check in. Now start RegionServers. 104 for (int i = 0; i < NUM_RS; i++) { 105 cluster.getRegionServers().get(i).start(); 106 } 107 // Expected total regionservers depends on whether Master can host regions or not. 108 int expectedTotalRegionServers = NUM_RS + (LoadBalancer.isTablesOnMaster(conf)? 1: 0); 109 List<ServerName> onlineServersList = null; 110 do { 111 onlineServersList = master.getMaster().getServerManager().getOnlineServersList(); 112 } while (onlineServersList.size() < expectedTotalRegionServers); 113 // Wait until killedRS is set. Means RegionServer is starting to go down. 114 while (killedRS.get() == null) { 115 Threads.sleep(1); 116 } 117 // Wait on the RegionServer to fully die. 118 while (cluster.getLiveRegionServers().size() >= expectedTotalRegionServers) { 119 Threads.sleep(1); 120 } 121 // Make sure Master is fully up before progressing. Could take a while if regions 122 // being reassigned. 123 while (!master.getMaster().isInitialized()) { 124 Threads.sleep(1); 125 } 126 127 // Now in steady state. How many regions open? Master should have too many regionservers 128 // showing still. The downed RegionServer should still be showing as registered. 129 assertTrue(master.getMaster().getServerManager().isServerOnline(killedRS.get())); 130 // Find non-meta region (namespace?) and assign to the killed server. That'll trigger cleanup. 131 Map<RegionInfo, ServerName> assignments = null; 132 do { 133 assignments = master.getMaster().getAssignmentManager().getRegionStates().getRegionAssignments(); 134 } while (assignments == null || assignments.size() < 2); 135 RegionInfo hri = null; 136 for (Map.Entry<RegionInfo, ServerName> e: assignments.entrySet()) { 137 if (e.getKey().isMetaRegion()) continue; 138 hri = e.getKey(); 139 break; 140 } 141 // Try moving region to the killed server. It will fail. As by-product, we will 142 // remove the RS from Master online list because no corresponding znode. 143 assertEquals(expectedTotalRegionServers, 144 master.getMaster().getServerManager().getOnlineServersList().size()); 145 LOG.info("Move " + hri.getEncodedName() + " to " + killedRS.get()); 146 master.getMaster().move(hri.getEncodedNameAsBytes(), 147 Bytes.toBytes(killedRS.get().toString())); 148 149 // TODO: This test could do more to verify fix. It could create a table 150 // and do round-robin assign. It should fail if zombie RS. HBASE-19515. 151 152 // Wait until the RS no longer shows as registered in Master. 153 while (onlineServersList.size() > (NUM_RS + 1)) { 154 Thread.sleep(100); 155 onlineServersList = master.getMaster().getServerManager().getOnlineServersList(); 156 } 157 } finally { 158 // Shutdown is messy with complaints about fs being closed. Why? TODO. 159 cluster.shutdown(); 160 cluster.join(); 161 TEST_UTIL.shutdownMiniDFSCluster(); 162 TEST_UTIL.shutdownMiniZKCluster(); 163 TEST_UTIL.cleanupTestDir(); 164 } 165 } 166 167 /** 168 * Start Master. Get as far as the state where Master is waiting on 169 * RegionServers to check in, then return. 170 */ 171 private MasterThread startMaster(MasterThread master) { 172 master.start(); 173 // It takes a while until ServerManager creation to happen inside Master startup. 174 while (master.getMaster().getServerManager() == null) { 175 continue; 176 } 177 // Set a listener for the waiting-on-RegionServers state. We want to wait 178 // until this condition before we leave this method and start regionservers. 179 final AtomicBoolean waiting = new AtomicBoolean(false); 180 if (master.getMaster().getServerManager() == null) throw new NullPointerException("SM"); 181 master.getMaster().getServerManager().registerListener(new ServerListener() { 182 @Override 183 public void waiting() { 184 waiting.set(true); 185 } 186 }); 187 // Wait until the Master gets to place where it is waiting on RegionServers to check in. 188 while (!waiting.get()) { 189 continue; 190 } 191 // Set the global master-is-active; gets picked up by regionservers later. 192 masterActive.set(true); 193 return master; 194 } 195 196 /** 197 * A RegionServer that reports for duty and then immediately dies if it is the first to receive 198 * the response to a reportForDuty. When it dies, it clears its ephemeral znode which the master 199 * notices and so removes the region from its set of online regionservers. 200 */ 201 static class RegisterAndDieRegionServer extends MiniHBaseCluster.MiniHBaseClusterRegionServer { 202 public RegisterAndDieRegionServer(Configuration conf) 203 throws IOException, InterruptedException { 204 super(conf); 205 } 206 207 @Override 208 protected void handleReportForDutyResponse(RegionServerStartupResponse c) 209 throws IOException { 210 if (killedRS.compareAndSet(null, getServerName())) { 211 // Make sure Master is up so it will see the removal of the ephemeral znode for this RS. 212 while (!masterActive.get()) { 213 Threads.sleep(100); 214 } 215 super.kill(); 216 } else { 217 super.handleReportForDutyResponse(c); 218 } 219 } 220 } 221}