001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.procedure;
019
020import static org.junit.jupiter.api.Assertions.assertEquals;
021import static org.junit.jupiter.api.Assertions.assertFalse;
022import static org.junit.jupiter.api.Assertions.assertNotEquals;
023import static org.junit.jupiter.api.Assertions.assertNotNull;
024import static org.junit.jupiter.api.Assertions.assertTrue;
025
026import java.io.IOException;
027import java.util.List;
028import java.util.NoSuchElementException;
029import java.util.Objects;
030import java.util.concurrent.TimeUnit;
031import java.util.stream.Stream;
032import org.apache.hadoop.hbase.CatalogFamilyFormat;
033import org.apache.hadoop.hbase.HBaseParameterizedTestTemplate;
034import org.apache.hadoop.hbase.HBaseServerBase;
035import org.apache.hadoop.hbase.HBaseTestingUtil;
036import org.apache.hadoop.hbase.HConstants;
037import org.apache.hadoop.hbase.MetaTableAccessor;
038import org.apache.hadoop.hbase.ServerName;
039import org.apache.hadoop.hbase.SingleProcessHBaseCluster;
040import org.apache.hadoop.hbase.TableName;
041import org.apache.hadoop.hbase.TableNameTestExtension;
042import org.apache.hadoop.hbase.client.RegionInfo;
043import org.apache.hadoop.hbase.client.Result;
044import org.apache.hadoop.hbase.client.Table;
045import org.apache.hadoop.hbase.master.HMaster;
046import org.apache.hadoop.hbase.master.RegionState;
047import org.apache.hadoop.hbase.procedure2.Procedure;
048import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
049import org.apache.hadoop.hbase.regionserver.HRegionServer;
050import org.apache.hadoop.hbase.testclassification.LargeTests;
051import org.apache.hadoop.hbase.testclassification.MasterTests;
052import org.apache.hadoop.hbase.util.Bytes;
053import org.apache.hadoop.hbase.util.JVMClusterUtil;
054import org.apache.hadoop.hbase.util.Pair;
055import org.junit.jupiter.api.Tag;
056import org.junit.jupiter.api.TestTemplate;
057import org.junit.jupiter.api.extension.RegisterExtension;
058import org.junit.jupiter.params.provider.Arguments;
059import org.slf4j.Logger;
060import org.slf4j.LoggerFactory;
061
062import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
063
064import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
065import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
066
067/**
068 * Test of the HBCK-version of SCP. The HBCKSCP is an SCP only it reads hbase:meta for list of
069 * Regions that were on the server-to-process rather than consult Master in-memory-state.
070 */
071@Tag(MasterTests.TAG)
072@Tag(LargeTests.TAG)
073@HBaseParameterizedTestTemplate(name = "replicas:{0} scheduler:{1} selector:{2}")
074public class TestHBCKSCP extends TestSCPBase {
075  private static final Logger LOG = LoggerFactory.getLogger(TestHBCKSCP.class);
076
077  @RegisterExtension
078  public TableNameTestExtension tableNameTestExtension = new TableNameTestExtension();
079
080  private final int replicas;
081  private final HBCKSCPScheduler hbckscpScheduler;
082  private final RegionSelector regionSelector;
083
084  public TestHBCKSCP(final int replicas, final HBCKSCPScheduler hbckscpScheduler,
085    final RegionSelector regionSelector) {
086    this.replicas = replicas;
087    this.hbckscpScheduler = hbckscpScheduler;
088    this.regionSelector = regionSelector;
089  }
090
091  public static Stream<Arguments> parameters() {
092    return Stream.of(
093      Arguments.of(1, new ScheduleServerCrashProcedure(), new PrimaryNotMetaRegionSelector()),
094      Arguments.of(3, new ScheduleServerCrashProcedure(), new ReplicaNonMetaRegionSelector()),
095      Arguments.of(1, new ScheduleSCPsForUnknownServers(), new PrimaryNotMetaRegionSelector()),
096      Arguments.of(3, new ScheduleSCPsForUnknownServers(), new ReplicaNonMetaRegionSelector()));
097  }
098
099  @TestTemplate
100  public void test() throws Exception {
101    // we are about to do one for it?
102    SingleProcessHBaseCluster cluster = this.util.getHBaseCluster();
103
104    // Assert that we have three RegionServers. Test depends on there being multiple.
105    assertEquals(RS_COUNT, cluster.getLiveRegionServerThreads().size());
106
107    int count;
108    try (Table table = createTable(tableNameTestExtension.getTableName())) {
109      // Load the table with a bit of data so some logs to split and some edits in each region.
110      this.util.loadTable(table, HBaseTestingUtil.COLUMNS[0]);
111      count = HBaseTestingUtil.countRows(table);
112    }
113    assertTrue(count > 0, "expected some rows");
114
115    // Make the test easier by not working on server hosting meta...
116    // Find another RS. Purge it from Master memory w/o running SCP (if
117    // SCP runs, it will clear entries from hbase:meta which frustrates
118    // our attempt at manufacturing 'Unknown Servers' condition).
119    final ServerName metaServer = util.getMiniHBaseCluster().getServerHoldingMeta();
120    final ServerName rsServerName = cluster.getRegionServerThreads().stream()
121      .map(JVMClusterUtil.RegionServerThread::getRegionServer).map(HBaseServerBase::getServerName)
122      .filter(sn -> !sn.equals(metaServer)).findAny().orElseThrow(() -> new NoSuchElementException(
123        "Cannot locate a region server that is not hosting meta."));
124    HMaster master = cluster.getMaster();
125    // Get a Region that is on the server.
126    final List<RegionInfo> regions = master.getAssignmentManager().getRegionsOnServer(rsServerName);
127    LOG.debug("{} is holding {} regions.", rsServerName, regions.size());
128    final RegionInfo rsRI =
129      regions.stream().peek(info -> LOG.debug("{}", info)).filter(regionSelector::regionFilter)
130        .findAny().orElseThrow(regionSelector::regionFilterFailure);
131    final int replicaId = rsRI.getReplicaId();
132    Result r = MetaTableAccessor.getRegionResult(master.getConnection(), rsRI);
133    // Assert region is OPEN.
134    assertEquals(RegionState.State.OPEN.toString(), Bytes.toString(
135      r.getValue(HConstants.CATALOG_FAMILY, CatalogFamilyFormat.getRegionStateColumn(replicaId))));
136    ServerName serverName = CatalogFamilyFormat.getServerName(r, replicaId);
137    assertEquals(rsServerName, serverName);
138    // moveFrom adds to dead servers and adds it to processing list only we will
139    // not be processing this server 'normally'. Remove it from processing by
140    // calling 'finish' and then remove it from dead servers so rsServerName
141    // becomes an 'Unknown Server' even though it is still around.
142    LOG.info("Killing {}", rsServerName);
143    cluster.killRegionServer(rsServerName);
144
145    master.getServerManager().moveFromOnlineToDeadServers(rsServerName);
146    master.getServerManager().getDeadServers().removeDeadServer(rsServerName);
147    master.getAssignmentManager().getRegionStates().removeServer(rsServerName);
148    // Kill the server. Nothing should happen since an 'Unknown Server' as far
149    // as the Master is concerned; i.e. no SCP.
150    HRegionServer hrs = cluster.getRegionServer(rsServerName);
151    util.waitFor(TimeUnit.MINUTES.toMillis(1), hrs::isStopped);
152    LOG.info("Dead {}", rsServerName);
153    // Now assert still references in hbase:meta to the 'dead' server -- they haven't been
154    // cleaned up by an SCP or by anything else.
155    assertTrue(searchMeta(master, rsServerName));
156    // Assert region is OPEN on dead server still.
157    r = MetaTableAccessor.getRegionResult(master.getConnection(), rsRI);
158    assertEquals(RegionState.State.OPEN.toString(), Bytes.toString(
159      r.getValue(HConstants.CATALOG_FAMILY, CatalogFamilyFormat.getRegionStateColumn(replicaId))));
160    serverName = CatalogFamilyFormat.getServerName(r, replicaId);
161    assertNotNull(cluster.getRegionServer(serverName));
162    assertEquals(rsServerName, serverName);
163
164    // I now have 'Unknown Server' references in hbase:meta; i.e. Server references
165    // with no corresponding SCP. Queue one.
166    long pid = scheduleHBCKSCP(rsServerName, master);
167    assertNotEquals(Procedure.NO_PROC_ID, pid);
168    ProcedureTestingUtility.waitProcedure(master.getMasterProcedureExecutor(), pid);
169    // After SCP, assert region is OPEN on new server.
170    r = MetaTableAccessor.getRegionResult(master.getConnection(), rsRI);
171    assertEquals(RegionState.State.OPEN.toString(), Bytes.toString(
172      r.getValue(HConstants.CATALOG_FAMILY, CatalogFamilyFormat.getRegionStateColumn(replicaId))));
173    serverName = CatalogFamilyFormat.getServerName(r, 0);
174    assertNotNull(cluster.getRegionServer(serverName));
175    assertNotEquals(rsServerName, serverName);
176    // Make sure no mention of old server post SCP.
177    assertFalse(searchMeta(master, rsServerName));
178  }
179
180  protected long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException {
181    return hbckscpScheduler.scheduleHBCKSCP(rsServerName, master);
182  }
183
184  @Override
185  protected int getRegionReplication() {
186    return replicas;
187  }
188
189  /** Returns True if we find reference to <code>sn</code> in meta table. */
190  private boolean searchMeta(HMaster master, ServerName sn) throws IOException {
191    List<Pair<RegionInfo, ServerName>> ps =
192      MetaTableAccessor.getTableRegionsAndLocations(master.getConnection(), null);
193    for (Pair<RegionInfo, ServerName> p : ps) {
194      if (p.getSecond().equals(sn)) {
195        return true;
196      }
197    }
198    return false;
199  }
200
201  /**
202   * Encapsulates the choice of which HBCK2 method to call.
203   */
204  private abstract static class HBCKSCPScheduler {
205    abstract long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException;
206
207    @Override
208    public String toString() {
209      return this.getClass().getSimpleName();
210    }
211  }
212
213  /**
214   * Invokes {@code MasterRpcServices#scheduleServerCrashProcedure}.
215   */
216  private static class ScheduleServerCrashProcedure extends HBCKSCPScheduler {
217    @Override
218    public long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException {
219      MasterProtos.ScheduleServerCrashProcedureResponse response = master.getMasterRpcServices()
220        .scheduleServerCrashProcedure(null, MasterProtos.ScheduleServerCrashProcedureRequest
221          .newBuilder().addServerName(ProtobufUtil.toServerName(rsServerName)).build());
222      assertEquals(1, response.getPidCount());
223      return response.getPid(0);
224    }
225  }
226
227  /**
228   * Invokes {@code MasterRpcServices#scheduleSCPsForUnknownServers}.
229   */
230  private static class ScheduleSCPsForUnknownServers extends HBCKSCPScheduler {
231    @Override
232    long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException {
233      MasterProtos.ScheduleSCPsForUnknownServersResponse response =
234        master.getMasterRpcServices().scheduleSCPsForUnknownServers(null,
235          MasterProtos.ScheduleSCPsForUnknownServersRequest.newBuilder().build());
236      assertEquals(1, response.getPidCount());
237      return response.getPid(0);
238    }
239  }
240
241  /**
242   * Encapsulates how the target region is selected.
243   */
244  private static abstract class RegionSelector {
245    abstract boolean regionFilter(RegionInfo info);
246
247    abstract Exception regionFilterFailure();
248
249    @Override
250    public String toString() {
251      return this.getClass().getSimpleName();
252    }
253  }
254
255  /**
256   * Selects a non-meta region that is also a primary region.
257   */
258  private static class PrimaryNotMetaRegionSelector extends RegionSelector {
259    @Override
260    boolean regionFilter(final RegionInfo info) {
261      return !Objects.equals(TableName.META_TABLE_NAME, info.getTable())
262        && Objects.equals(RegionInfo.DEFAULT_REPLICA_ID, info.getReplicaId());
263    }
264
265    @Override
266    Exception regionFilterFailure() {
267      return new NoSuchElementException("Cannot locate a primary, non-meta region.");
268    }
269  }
270
271  /**
272   * Selects a non-meta region that is also a replica region.
273   */
274  private static class ReplicaNonMetaRegionSelector extends RegionSelector {
275    @Override
276    boolean regionFilter(RegionInfo info) {
277      return !Objects.equals(TableName.META_TABLE_NAME, info.getTable())
278        && !Objects.equals(RegionInfo.DEFAULT_REPLICA_ID, info.getReplicaId());
279    }
280
281    @Override
282    Exception regionFilterFailure() {
283      return new NoSuchElementException("Cannot locate a replica, non-meta region.");
284    }
285  }
286}