001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master.procedure; 019 020import static org.junit.jupiter.api.Assertions.assertEquals; 021import static org.junit.jupiter.api.Assertions.assertFalse; 022import static org.junit.jupiter.api.Assertions.assertNotEquals; 023import static org.junit.jupiter.api.Assertions.assertNotNull; 024import static org.junit.jupiter.api.Assertions.assertTrue; 025 026import java.io.IOException; 027import java.util.List; 028import java.util.NoSuchElementException; 029import java.util.Objects; 030import java.util.concurrent.TimeUnit; 031import java.util.stream.Stream; 032import org.apache.hadoop.hbase.CatalogFamilyFormat; 033import org.apache.hadoop.hbase.HBaseParameterizedTestTemplate; 034import org.apache.hadoop.hbase.HBaseServerBase; 035import org.apache.hadoop.hbase.HBaseTestingUtil; 036import org.apache.hadoop.hbase.HConstants; 037import org.apache.hadoop.hbase.MetaTableAccessor; 038import org.apache.hadoop.hbase.ServerName; 039import org.apache.hadoop.hbase.SingleProcessHBaseCluster; 040import org.apache.hadoop.hbase.TableName; 041import org.apache.hadoop.hbase.TableNameTestExtension; 042import org.apache.hadoop.hbase.client.RegionInfo; 043import org.apache.hadoop.hbase.client.Result; 044import org.apache.hadoop.hbase.client.Table; 045import org.apache.hadoop.hbase.master.HMaster; 046import org.apache.hadoop.hbase.master.RegionState; 047import org.apache.hadoop.hbase.procedure2.Procedure; 048import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility; 049import org.apache.hadoop.hbase.regionserver.HRegionServer; 050import org.apache.hadoop.hbase.testclassification.LargeTests; 051import org.apache.hadoop.hbase.testclassification.MasterTests; 052import org.apache.hadoop.hbase.util.Bytes; 053import org.apache.hadoop.hbase.util.JVMClusterUtil; 054import org.apache.hadoop.hbase.util.Pair; 055import org.junit.jupiter.api.Tag; 056import org.junit.jupiter.api.TestTemplate; 057import org.junit.jupiter.api.extension.RegisterExtension; 058import org.junit.jupiter.params.provider.Arguments; 059import org.slf4j.Logger; 060import org.slf4j.LoggerFactory; 061 062import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException; 063 064import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; 065import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos; 066 067/** 068 * Test of the HBCK-version of SCP. The HBCKSCP is an SCP only it reads hbase:meta for list of 069 * Regions that were on the server-to-process rather than consult Master in-memory-state. 070 */ 071@Tag(MasterTests.TAG) 072@Tag(LargeTests.TAG) 073@HBaseParameterizedTestTemplate(name = "replicas:{0} scheduler:{1} selector:{2}") 074public class TestHBCKSCP extends TestSCPBase { 075 private static final Logger LOG = LoggerFactory.getLogger(TestHBCKSCP.class); 076 077 @RegisterExtension 078 public TableNameTestExtension tableNameTestExtension = new TableNameTestExtension(); 079 080 private final int replicas; 081 private final HBCKSCPScheduler hbckscpScheduler; 082 private final RegionSelector regionSelector; 083 084 public TestHBCKSCP(final int replicas, final HBCKSCPScheduler hbckscpScheduler, 085 final RegionSelector regionSelector) { 086 this.replicas = replicas; 087 this.hbckscpScheduler = hbckscpScheduler; 088 this.regionSelector = regionSelector; 089 } 090 091 public static Stream<Arguments> parameters() { 092 return Stream.of( 093 Arguments.of(1, new ScheduleServerCrashProcedure(), new PrimaryNotMetaRegionSelector()), 094 Arguments.of(3, new ScheduleServerCrashProcedure(), new ReplicaNonMetaRegionSelector()), 095 Arguments.of(1, new ScheduleSCPsForUnknownServers(), new PrimaryNotMetaRegionSelector()), 096 Arguments.of(3, new ScheduleSCPsForUnknownServers(), new ReplicaNonMetaRegionSelector())); 097 } 098 099 @TestTemplate 100 public void test() throws Exception { 101 // we are about to do one for it? 102 SingleProcessHBaseCluster cluster = this.util.getHBaseCluster(); 103 104 // Assert that we have three RegionServers. Test depends on there being multiple. 105 assertEquals(RS_COUNT, cluster.getLiveRegionServerThreads().size()); 106 107 int count; 108 try (Table table = createTable(tableNameTestExtension.getTableName())) { 109 // Load the table with a bit of data so some logs to split and some edits in each region. 110 this.util.loadTable(table, HBaseTestingUtil.COLUMNS[0]); 111 count = HBaseTestingUtil.countRows(table); 112 } 113 assertTrue(count > 0, "expected some rows"); 114 115 // Make the test easier by not working on server hosting meta... 116 // Find another RS. Purge it from Master memory w/o running SCP (if 117 // SCP runs, it will clear entries from hbase:meta which frustrates 118 // our attempt at manufacturing 'Unknown Servers' condition). 119 final ServerName metaServer = util.getMiniHBaseCluster().getServerHoldingMeta(); 120 final ServerName rsServerName = cluster.getRegionServerThreads().stream() 121 .map(JVMClusterUtil.RegionServerThread::getRegionServer).map(HBaseServerBase::getServerName) 122 .filter(sn -> !sn.equals(metaServer)).findAny().orElseThrow(() -> new NoSuchElementException( 123 "Cannot locate a region server that is not hosting meta.")); 124 HMaster master = cluster.getMaster(); 125 // Get a Region that is on the server. 126 final List<RegionInfo> regions = master.getAssignmentManager().getRegionsOnServer(rsServerName); 127 LOG.debug("{} is holding {} regions.", rsServerName, regions.size()); 128 final RegionInfo rsRI = 129 regions.stream().peek(info -> LOG.debug("{}", info)).filter(regionSelector::regionFilter) 130 .findAny().orElseThrow(regionSelector::regionFilterFailure); 131 final int replicaId = rsRI.getReplicaId(); 132 Result r = MetaTableAccessor.getRegionResult(master.getConnection(), rsRI); 133 // Assert region is OPEN. 134 assertEquals(RegionState.State.OPEN.toString(), Bytes.toString( 135 r.getValue(HConstants.CATALOG_FAMILY, CatalogFamilyFormat.getRegionStateColumn(replicaId)))); 136 ServerName serverName = CatalogFamilyFormat.getServerName(r, replicaId); 137 assertEquals(rsServerName, serverName); 138 // moveFrom adds to dead servers and adds it to processing list only we will 139 // not be processing this server 'normally'. Remove it from processing by 140 // calling 'finish' and then remove it from dead servers so rsServerName 141 // becomes an 'Unknown Server' even though it is still around. 142 LOG.info("Killing {}", rsServerName); 143 cluster.killRegionServer(rsServerName); 144 145 master.getServerManager().moveFromOnlineToDeadServers(rsServerName); 146 master.getServerManager().getDeadServers().removeDeadServer(rsServerName); 147 master.getAssignmentManager().getRegionStates().removeServer(rsServerName); 148 // Kill the server. Nothing should happen since an 'Unknown Server' as far 149 // as the Master is concerned; i.e. no SCP. 150 HRegionServer hrs = cluster.getRegionServer(rsServerName); 151 util.waitFor(TimeUnit.MINUTES.toMillis(1), hrs::isStopped); 152 LOG.info("Dead {}", rsServerName); 153 // Now assert still references in hbase:meta to the 'dead' server -- they haven't been 154 // cleaned up by an SCP or by anything else. 155 assertTrue(searchMeta(master, rsServerName)); 156 // Assert region is OPEN on dead server still. 157 r = MetaTableAccessor.getRegionResult(master.getConnection(), rsRI); 158 assertEquals(RegionState.State.OPEN.toString(), Bytes.toString( 159 r.getValue(HConstants.CATALOG_FAMILY, CatalogFamilyFormat.getRegionStateColumn(replicaId)))); 160 serverName = CatalogFamilyFormat.getServerName(r, replicaId); 161 assertNotNull(cluster.getRegionServer(serverName)); 162 assertEquals(rsServerName, serverName); 163 164 // I now have 'Unknown Server' references in hbase:meta; i.e. Server references 165 // with no corresponding SCP. Queue one. 166 long pid = scheduleHBCKSCP(rsServerName, master); 167 assertNotEquals(Procedure.NO_PROC_ID, pid); 168 ProcedureTestingUtility.waitProcedure(master.getMasterProcedureExecutor(), pid); 169 // After SCP, assert region is OPEN on new server. 170 r = MetaTableAccessor.getRegionResult(master.getConnection(), rsRI); 171 assertEquals(RegionState.State.OPEN.toString(), Bytes.toString( 172 r.getValue(HConstants.CATALOG_FAMILY, CatalogFamilyFormat.getRegionStateColumn(replicaId)))); 173 serverName = CatalogFamilyFormat.getServerName(r, 0); 174 assertNotNull(cluster.getRegionServer(serverName)); 175 assertNotEquals(rsServerName, serverName); 176 // Make sure no mention of old server post SCP. 177 assertFalse(searchMeta(master, rsServerName)); 178 } 179 180 protected long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException { 181 return hbckscpScheduler.scheduleHBCKSCP(rsServerName, master); 182 } 183 184 @Override 185 protected int getRegionReplication() { 186 return replicas; 187 } 188 189 /** Returns True if we find reference to <code>sn</code> in meta table. */ 190 private boolean searchMeta(HMaster master, ServerName sn) throws IOException { 191 List<Pair<RegionInfo, ServerName>> ps = 192 MetaTableAccessor.getTableRegionsAndLocations(master.getConnection(), null); 193 for (Pair<RegionInfo, ServerName> p : ps) { 194 if (p.getSecond().equals(sn)) { 195 return true; 196 } 197 } 198 return false; 199 } 200 201 /** 202 * Encapsulates the choice of which HBCK2 method to call. 203 */ 204 private abstract static class HBCKSCPScheduler { 205 abstract long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException; 206 207 @Override 208 public String toString() { 209 return this.getClass().getSimpleName(); 210 } 211 } 212 213 /** 214 * Invokes {@code MasterRpcServices#scheduleServerCrashProcedure}. 215 */ 216 private static class ScheduleServerCrashProcedure extends HBCKSCPScheduler { 217 @Override 218 public long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException { 219 MasterProtos.ScheduleServerCrashProcedureResponse response = master.getMasterRpcServices() 220 .scheduleServerCrashProcedure(null, MasterProtos.ScheduleServerCrashProcedureRequest 221 .newBuilder().addServerName(ProtobufUtil.toServerName(rsServerName)).build()); 222 assertEquals(1, response.getPidCount()); 223 return response.getPid(0); 224 } 225 } 226 227 /** 228 * Invokes {@code MasterRpcServices#scheduleSCPsForUnknownServers}. 229 */ 230 private static class ScheduleSCPsForUnknownServers extends HBCKSCPScheduler { 231 @Override 232 long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException { 233 MasterProtos.ScheduleSCPsForUnknownServersResponse response = 234 master.getMasterRpcServices().scheduleSCPsForUnknownServers(null, 235 MasterProtos.ScheduleSCPsForUnknownServersRequest.newBuilder().build()); 236 assertEquals(1, response.getPidCount()); 237 return response.getPid(0); 238 } 239 } 240 241 /** 242 * Encapsulates how the target region is selected. 243 */ 244 private static abstract class RegionSelector { 245 abstract boolean regionFilter(RegionInfo info); 246 247 abstract Exception regionFilterFailure(); 248 249 @Override 250 public String toString() { 251 return this.getClass().getSimpleName(); 252 } 253 } 254 255 /** 256 * Selects a non-meta region that is also a primary region. 257 */ 258 private static class PrimaryNotMetaRegionSelector extends RegionSelector { 259 @Override 260 boolean regionFilter(final RegionInfo info) { 261 return !Objects.equals(TableName.META_TABLE_NAME, info.getTable()) 262 && Objects.equals(RegionInfo.DEFAULT_REPLICA_ID, info.getReplicaId()); 263 } 264 265 @Override 266 Exception regionFilterFailure() { 267 return new NoSuchElementException("Cannot locate a primary, non-meta region."); 268 } 269 } 270 271 /** 272 * Selects a non-meta region that is also a replica region. 273 */ 274 private static class ReplicaNonMetaRegionSelector extends RegionSelector { 275 @Override 276 boolean regionFilter(RegionInfo info) { 277 return !Objects.equals(TableName.META_TABLE_NAME, info.getTable()) 278 && !Objects.equals(RegionInfo.DEFAULT_REPLICA_ID, info.getReplicaId()); 279 } 280 281 @Override 282 Exception regionFilterFailure() { 283 return new NoSuchElementException("Cannot locate a replica, non-meta region."); 284 } 285 } 286}