001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master.procedure; 019 020import static junit.framework.TestCase.assertFalse; 021import static junit.framework.TestCase.assertNotNull; 022import static org.junit.Assert.assertEquals; 023import static org.junit.Assert.assertNotEquals; 024import static org.junit.Assert.assertTrue; 025 026import java.io.IOException; 027import java.util.List; 028import java.util.NoSuchElementException; 029import java.util.Objects; 030import java.util.concurrent.TimeUnit; 031import org.apache.hadoop.hbase.CatalogFamilyFormat; 032import org.apache.hadoop.hbase.HBaseClassTestRule; 033import org.apache.hadoop.hbase.HBaseServerBase; 034import org.apache.hadoop.hbase.HBaseTestingUtil; 035import org.apache.hadoop.hbase.HConstants; 036import org.apache.hadoop.hbase.MetaTableAccessor; 037import org.apache.hadoop.hbase.ServerName; 038import org.apache.hadoop.hbase.SingleProcessHBaseCluster; 039import org.apache.hadoop.hbase.TableName; 040import org.apache.hadoop.hbase.TableNameTestRule; 041import org.apache.hadoop.hbase.client.RegionInfo; 042import org.apache.hadoop.hbase.client.Result; 043import org.apache.hadoop.hbase.client.Table; 044import org.apache.hadoop.hbase.master.HMaster; 045import org.apache.hadoop.hbase.master.RegionState; 046import org.apache.hadoop.hbase.procedure2.Procedure; 047import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility; 048import org.apache.hadoop.hbase.regionserver.HRegionServer; 049import org.apache.hadoop.hbase.testclassification.LargeTests; 050import org.apache.hadoop.hbase.testclassification.MasterTests; 051import org.apache.hadoop.hbase.util.Bytes; 052import org.apache.hadoop.hbase.util.JVMClusterUtil; 053import org.apache.hadoop.hbase.util.Pair; 054import org.junit.ClassRule; 055import org.junit.Rule; 056import org.junit.Test; 057import org.junit.experimental.categories.Category; 058import org.junit.runner.RunWith; 059import org.junit.runners.Parameterized; 060import org.slf4j.Logger; 061import org.slf4j.LoggerFactory; 062 063import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException; 064 065import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; 066import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos; 067 068/** 069 * Test of the HBCK-version of SCP. The HBCKSCP is an SCP only it reads hbase:meta for list of 070 * Regions that were on the server-to-process rather than consult Master in-memory-state. 071 */ 072@Category({ MasterTests.class, LargeTests.class }) 073@RunWith(Parameterized.class) 074public class TestHBCKSCP extends TestSCPBase { 075 private static final Logger LOG = LoggerFactory.getLogger(TestHBCKSCP.class); 076 077 @ClassRule 078 public static final HBaseClassTestRule CLASS_RULE = 079 HBaseClassTestRule.forClass(TestHBCKSCP.class); 080 @Rule 081 public TableNameTestRule tableNameTestRule = new TableNameTestRule(); 082 083 private final int replicas; 084 private final HBCKSCPScheduler hbckscpScheduler; 085 private final RegionSelector regionSelector; 086 087 public TestHBCKSCP(final int replicas, final HBCKSCPScheduler hbckscpScheduler, 088 final RegionSelector regionSelector) { 089 this.replicas = replicas; 090 this.hbckscpScheduler = hbckscpScheduler; 091 this.regionSelector = regionSelector; 092 } 093 094 @Parameterized.Parameters(name = "replicas:{0} scheduler:{1} selector:{2}") 095 public static Object[][] params() { 096 return new Object[][] { 097 { 1, new ScheduleServerCrashProcedure(), new PrimaryNotMetaRegionSelector() }, 098 { 3, new ScheduleServerCrashProcedure(), new ReplicaNonMetaRegionSelector() }, 099 { 1, new ScheduleSCPsForUnknownServers(), new PrimaryNotMetaRegionSelector() }, 100 { 3, new ScheduleSCPsForUnknownServers(), new ReplicaNonMetaRegionSelector() } }; 101 } 102 103 @Test 104 public void test() throws Exception { 105 // we are about to do one for it? 106 SingleProcessHBaseCluster cluster = this.util.getHBaseCluster(); 107 108 // Assert that we have three RegionServers. Test depends on there being multiple. 109 assertEquals(RS_COUNT, cluster.getLiveRegionServerThreads().size()); 110 111 int count; 112 try (Table table = createTable(tableNameTestRule.getTableName())) { 113 // Load the table with a bit of data so some logs to split and some edits in each region. 114 this.util.loadTable(table, HBaseTestingUtil.COLUMNS[0]); 115 count = HBaseTestingUtil.countRows(table); 116 } 117 assertTrue("expected some rows", count > 0); 118 119 // Make the test easier by not working on server hosting meta... 120 // Find another RS. Purge it from Master memory w/o running SCP (if 121 // SCP runs, it will clear entries from hbase:meta which frustrates 122 // our attempt at manufacturing 'Unknown Servers' condition). 123 final ServerName metaServer = util.getMiniHBaseCluster().getServerHoldingMeta(); 124 final ServerName rsServerName = cluster.getRegionServerThreads().stream() 125 .map(JVMClusterUtil.RegionServerThread::getRegionServer).map(HBaseServerBase::getServerName) 126 .filter(sn -> !sn.equals(metaServer)).findAny().orElseThrow(() -> new NoSuchElementException( 127 "Cannot locate a region server that is not hosting meta.")); 128 HMaster master = cluster.getMaster(); 129 // Get a Region that is on the server. 130 final List<RegionInfo> regions = master.getAssignmentManager().getRegionsOnServer(rsServerName); 131 LOG.debug("{} is holding {} regions.", rsServerName, regions.size()); 132 final RegionInfo rsRI = 133 regions.stream().peek(info -> LOG.debug("{}", info)).filter(regionSelector::regionFilter) 134 .findAny().orElseThrow(regionSelector::regionFilterFailure); 135 final int replicaId = rsRI.getReplicaId(); 136 Result r = MetaTableAccessor.getRegionResult(master.getConnection(), rsRI); 137 // Assert region is OPEN. 138 assertEquals(RegionState.State.OPEN.toString(), Bytes.toString( 139 r.getValue(HConstants.CATALOG_FAMILY, CatalogFamilyFormat.getRegionStateColumn(replicaId)))); 140 ServerName serverName = CatalogFamilyFormat.getServerName(r, replicaId); 141 assertEquals(rsServerName, serverName); 142 // moveFrom adds to dead servers and adds it to processing list only we will 143 // not be processing this server 'normally'. Remove it from processing by 144 // calling 'finish' and then remove it from dead servers so rsServerName 145 // becomes an 'Unknown Server' even though it is still around. 146 LOG.info("Killing {}", rsServerName); 147 cluster.killRegionServer(rsServerName); 148 149 master.getServerManager().moveFromOnlineToDeadServers(rsServerName); 150 master.getServerManager().getDeadServers().removeDeadServer(rsServerName); 151 master.getAssignmentManager().getRegionStates().removeServer(rsServerName); 152 // Kill the server. Nothing should happen since an 'Unknown Server' as far 153 // as the Master is concerned; i.e. no SCP. 154 HRegionServer hrs = cluster.getRegionServer(rsServerName); 155 util.waitFor(TimeUnit.MINUTES.toMillis(1), hrs::isStopped); 156 LOG.info("Dead {}", rsServerName); 157 // Now assert still references in hbase:meta to the 'dead' server -- they haven't been 158 // cleaned up by an SCP or by anything else. 159 assertTrue(searchMeta(master, rsServerName)); 160 // Assert region is OPEN on dead server still. 161 r = MetaTableAccessor.getRegionResult(master.getConnection(), rsRI); 162 assertEquals(RegionState.State.OPEN.toString(), Bytes.toString( 163 r.getValue(HConstants.CATALOG_FAMILY, CatalogFamilyFormat.getRegionStateColumn(replicaId)))); 164 serverName = CatalogFamilyFormat.getServerName(r, replicaId); 165 assertNotNull(cluster.getRegionServer(serverName)); 166 assertEquals(rsServerName, serverName); 167 168 // I now have 'Unknown Server' references in hbase:meta; i.e. Server references 169 // with no corresponding SCP. Queue one. 170 long pid = scheduleHBCKSCP(rsServerName, master); 171 assertNotEquals(Procedure.NO_PROC_ID, pid); 172 ProcedureTestingUtility.waitProcedure(master.getMasterProcedureExecutor(), pid); 173 // After SCP, assert region is OPEN on new server. 174 r = MetaTableAccessor.getRegionResult(master.getConnection(), rsRI); 175 assertEquals(RegionState.State.OPEN.toString(), Bytes.toString( 176 r.getValue(HConstants.CATALOG_FAMILY, CatalogFamilyFormat.getRegionStateColumn(replicaId)))); 177 serverName = CatalogFamilyFormat.getServerName(r, 0); 178 assertNotNull(cluster.getRegionServer(serverName)); 179 assertNotEquals(rsServerName, serverName); 180 // Make sure no mention of old server post SCP. 181 assertFalse(searchMeta(master, rsServerName)); 182 } 183 184 protected long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException { 185 return hbckscpScheduler.scheduleHBCKSCP(rsServerName, master); 186 } 187 188 @Override 189 protected int getRegionReplication() { 190 return replicas; 191 } 192 193 /** Returns True if we find reference to <code>sn</code> in meta table. */ 194 private boolean searchMeta(HMaster master, ServerName sn) throws IOException { 195 List<Pair<RegionInfo, ServerName>> ps = 196 MetaTableAccessor.getTableRegionsAndLocations(master.getConnection(), null); 197 for (Pair<RegionInfo, ServerName> p : ps) { 198 if (p.getSecond().equals(sn)) { 199 return true; 200 } 201 } 202 return false; 203 } 204 205 /** 206 * Encapsulates the choice of which HBCK2 method to call. 207 */ 208 private abstract static class HBCKSCPScheduler { 209 abstract long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException; 210 211 @Override 212 public String toString() { 213 return this.getClass().getSimpleName(); 214 } 215 } 216 217 /** 218 * Invokes {@code MasterRpcServices#scheduleServerCrashProcedure}. 219 */ 220 private static class ScheduleServerCrashProcedure extends HBCKSCPScheduler { 221 @Override 222 public long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException { 223 MasterProtos.ScheduleServerCrashProcedureResponse response = master.getMasterRpcServices() 224 .scheduleServerCrashProcedure(null, MasterProtos.ScheduleServerCrashProcedureRequest 225 .newBuilder().addServerName(ProtobufUtil.toServerName(rsServerName)).build()); 226 assertEquals(1, response.getPidCount()); 227 return response.getPid(0); 228 } 229 } 230 231 /** 232 * Invokes {@code MasterRpcServices#scheduleSCPsForUnknownServers}. 233 */ 234 private static class ScheduleSCPsForUnknownServers extends HBCKSCPScheduler { 235 @Override 236 long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException { 237 MasterProtos.ScheduleSCPsForUnknownServersResponse response = 238 master.getMasterRpcServices().scheduleSCPsForUnknownServers(null, 239 MasterProtos.ScheduleSCPsForUnknownServersRequest.newBuilder().build()); 240 assertEquals(1, response.getPidCount()); 241 return response.getPid(0); 242 } 243 } 244 245 /** 246 * Encapsulates how the target region is selected. 247 */ 248 private static abstract class RegionSelector { 249 abstract boolean regionFilter(RegionInfo info); 250 251 abstract Exception regionFilterFailure(); 252 253 @Override 254 public String toString() { 255 return this.getClass().getSimpleName(); 256 } 257 } 258 259 /** 260 * Selects a non-meta region that is also a primary region. 261 */ 262 private static class PrimaryNotMetaRegionSelector extends RegionSelector { 263 @Override 264 boolean regionFilter(final RegionInfo info) { 265 return !Objects.equals(TableName.META_TABLE_NAME, info.getTable()) 266 && Objects.equals(RegionInfo.DEFAULT_REPLICA_ID, info.getReplicaId()); 267 } 268 269 @Override 270 Exception regionFilterFailure() { 271 return new NoSuchElementException("Cannot locate a primary, non-meta region."); 272 } 273 } 274 275 /** 276 * Selects a non-meta region that is also a replica region. 277 */ 278 private static class ReplicaNonMetaRegionSelector extends RegionSelector { 279 @Override 280 boolean regionFilter(RegionInfo info) { 281 return !Objects.equals(TableName.META_TABLE_NAME, info.getTable()) 282 && !Objects.equals(RegionInfo.DEFAULT_REPLICA_ID, info.getReplicaId()); 283 } 284 285 @Override 286 Exception regionFilterFailure() { 287 return new NoSuchElementException("Cannot locate a replica, non-meta region."); 288 } 289 } 290}