001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.procedure;
019
020import static junit.framework.TestCase.assertFalse;
021import static junit.framework.TestCase.assertNotNull;
022import static org.junit.Assert.assertEquals;
023import static org.junit.Assert.assertNotEquals;
024import static org.junit.Assert.assertTrue;
025
026import java.io.IOException;
027import java.util.List;
028import java.util.NoSuchElementException;
029import java.util.Objects;
030import java.util.concurrent.TimeUnit;
031import org.apache.hadoop.hbase.CatalogFamilyFormat;
032import org.apache.hadoop.hbase.HBaseClassTestRule;
033import org.apache.hadoop.hbase.HBaseServerBase;
034import org.apache.hadoop.hbase.HBaseTestingUtil;
035import org.apache.hadoop.hbase.HConstants;
036import org.apache.hadoop.hbase.MetaTableAccessor;
037import org.apache.hadoop.hbase.ServerName;
038import org.apache.hadoop.hbase.SingleProcessHBaseCluster;
039import org.apache.hadoop.hbase.TableName;
040import org.apache.hadoop.hbase.TableNameTestRule;
041import org.apache.hadoop.hbase.client.RegionInfo;
042import org.apache.hadoop.hbase.client.Result;
043import org.apache.hadoop.hbase.client.Table;
044import org.apache.hadoop.hbase.master.HMaster;
045import org.apache.hadoop.hbase.master.RegionState;
046import org.apache.hadoop.hbase.procedure2.Procedure;
047import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
048import org.apache.hadoop.hbase.regionserver.HRegionServer;
049import org.apache.hadoop.hbase.testclassification.LargeTests;
050import org.apache.hadoop.hbase.testclassification.MasterTests;
051import org.apache.hadoop.hbase.util.Bytes;
052import org.apache.hadoop.hbase.util.JVMClusterUtil;
053import org.apache.hadoop.hbase.util.Pair;
054import org.junit.ClassRule;
055import org.junit.Rule;
056import org.junit.Test;
057import org.junit.experimental.categories.Category;
058import org.junit.runner.RunWith;
059import org.junit.runners.Parameterized;
060import org.slf4j.Logger;
061import org.slf4j.LoggerFactory;
062
063import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
064
065import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
066import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
067
068/**
069 * Test of the HBCK-version of SCP. The HBCKSCP is an SCP only it reads hbase:meta for list of
070 * Regions that were on the server-to-process rather than consult Master in-memory-state.
071 */
072@Category({ MasterTests.class, LargeTests.class })
073@RunWith(Parameterized.class)
074public class TestHBCKSCP extends TestSCPBase {
075  private static final Logger LOG = LoggerFactory.getLogger(TestHBCKSCP.class);
076
077  @ClassRule
078  public static final HBaseClassTestRule CLASS_RULE =
079    HBaseClassTestRule.forClass(TestHBCKSCP.class);
080  @Rule
081  public TableNameTestRule tableNameTestRule = new TableNameTestRule();
082
083  private final int replicas;
084  private final HBCKSCPScheduler hbckscpScheduler;
085  private final RegionSelector regionSelector;
086
087  public TestHBCKSCP(final int replicas, final HBCKSCPScheduler hbckscpScheduler,
088    final RegionSelector regionSelector) {
089    this.replicas = replicas;
090    this.hbckscpScheduler = hbckscpScheduler;
091    this.regionSelector = regionSelector;
092  }
093
094  @Parameterized.Parameters(name = "replicas:{0} scheduler:{1} selector:{2}")
095  public static Object[][] params() {
096    return new Object[][] {
097      { 1, new ScheduleServerCrashProcedure(), new PrimaryNotMetaRegionSelector() },
098      { 3, new ScheduleServerCrashProcedure(), new ReplicaNonMetaRegionSelector() },
099      { 1, new ScheduleSCPsForUnknownServers(), new PrimaryNotMetaRegionSelector() },
100      { 3, new ScheduleSCPsForUnknownServers(), new ReplicaNonMetaRegionSelector() } };
101  }
102
103  @Test
104  public void test() throws Exception {
105    // we are about to do one for it?
106    SingleProcessHBaseCluster cluster = this.util.getHBaseCluster();
107
108    // Assert that we have three RegionServers. Test depends on there being multiple.
109    assertEquals(RS_COUNT, cluster.getLiveRegionServerThreads().size());
110
111    int count;
112    try (Table table = createTable(tableNameTestRule.getTableName())) {
113      // Load the table with a bit of data so some logs to split and some edits in each region.
114      this.util.loadTable(table, HBaseTestingUtil.COLUMNS[0]);
115      count = HBaseTestingUtil.countRows(table);
116    }
117    assertTrue("expected some rows", count > 0);
118
119    // Make the test easier by not working on server hosting meta...
120    // Find another RS. Purge it from Master memory w/o running SCP (if
121    // SCP runs, it will clear entries from hbase:meta which frustrates
122    // our attempt at manufacturing 'Unknown Servers' condition).
123    final ServerName metaServer = util.getMiniHBaseCluster().getServerHoldingMeta();
124    final ServerName rsServerName = cluster.getRegionServerThreads().stream()
125      .map(JVMClusterUtil.RegionServerThread::getRegionServer).map(HBaseServerBase::getServerName)
126      .filter(sn -> !sn.equals(metaServer)).findAny().orElseThrow(() -> new NoSuchElementException(
127        "Cannot locate a region server that is not hosting meta."));
128    HMaster master = cluster.getMaster();
129    // Get a Region that is on the server.
130    final List<RegionInfo> regions = master.getAssignmentManager().getRegionsOnServer(rsServerName);
131    LOG.debug("{} is holding {} regions.", rsServerName, regions.size());
132    final RegionInfo rsRI =
133      regions.stream().peek(info -> LOG.debug("{}", info)).filter(regionSelector::regionFilter)
134        .findAny().orElseThrow(regionSelector::regionFilterFailure);
135    final int replicaId = rsRI.getReplicaId();
136    Result r = MetaTableAccessor.getRegionResult(master.getConnection(), rsRI);
137    // Assert region is OPEN.
138    assertEquals(RegionState.State.OPEN.toString(), Bytes.toString(
139      r.getValue(HConstants.CATALOG_FAMILY, CatalogFamilyFormat.getRegionStateColumn(replicaId))));
140    ServerName serverName = CatalogFamilyFormat.getServerName(r, replicaId);
141    assertEquals(rsServerName, serverName);
142    // moveFrom adds to dead servers and adds it to processing list only we will
143    // not be processing this server 'normally'. Remove it from processing by
144    // calling 'finish' and then remove it from dead servers so rsServerName
145    // becomes an 'Unknown Server' even though it is still around.
146    LOG.info("Killing {}", rsServerName);
147    cluster.killRegionServer(rsServerName);
148
149    master.getServerManager().moveFromOnlineToDeadServers(rsServerName);
150    master.getServerManager().getDeadServers().removeDeadServer(rsServerName);
151    master.getAssignmentManager().getRegionStates().removeServer(rsServerName);
152    // Kill the server. Nothing should happen since an 'Unknown Server' as far
153    // as the Master is concerned; i.e. no SCP.
154    HRegionServer hrs = cluster.getRegionServer(rsServerName);
155    util.waitFor(TimeUnit.MINUTES.toMillis(1), hrs::isStopped);
156    LOG.info("Dead {}", rsServerName);
157    // Now assert still references in hbase:meta to the 'dead' server -- they haven't been
158    // cleaned up by an SCP or by anything else.
159    assertTrue(searchMeta(master, rsServerName));
160    // Assert region is OPEN on dead server still.
161    r = MetaTableAccessor.getRegionResult(master.getConnection(), rsRI);
162    assertEquals(RegionState.State.OPEN.toString(), Bytes.toString(
163      r.getValue(HConstants.CATALOG_FAMILY, CatalogFamilyFormat.getRegionStateColumn(replicaId))));
164    serverName = CatalogFamilyFormat.getServerName(r, replicaId);
165    assertNotNull(cluster.getRegionServer(serverName));
166    assertEquals(rsServerName, serverName);
167
168    // I now have 'Unknown Server' references in hbase:meta; i.e. Server references
169    // with no corresponding SCP. Queue one.
170    long pid = scheduleHBCKSCP(rsServerName, master);
171    assertNotEquals(Procedure.NO_PROC_ID, pid);
172    ProcedureTestingUtility.waitProcedure(master.getMasterProcedureExecutor(), pid);
173    // After SCP, assert region is OPEN on new server.
174    r = MetaTableAccessor.getRegionResult(master.getConnection(), rsRI);
175    assertEquals(RegionState.State.OPEN.toString(), Bytes.toString(
176      r.getValue(HConstants.CATALOG_FAMILY, CatalogFamilyFormat.getRegionStateColumn(replicaId))));
177    serverName = CatalogFamilyFormat.getServerName(r, 0);
178    assertNotNull(cluster.getRegionServer(serverName));
179    assertNotEquals(rsServerName, serverName);
180    // Make sure no mention of old server post SCP.
181    assertFalse(searchMeta(master, rsServerName));
182  }
183
184  protected long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException {
185    return hbckscpScheduler.scheduleHBCKSCP(rsServerName, master);
186  }
187
188  @Override
189  protected int getRegionReplication() {
190    return replicas;
191  }
192
193  /** Returns True if we find reference to <code>sn</code> in meta table. */
194  private boolean searchMeta(HMaster master, ServerName sn) throws IOException {
195    List<Pair<RegionInfo, ServerName>> ps =
196      MetaTableAccessor.getTableRegionsAndLocations(master.getConnection(), null);
197    for (Pair<RegionInfo, ServerName> p : ps) {
198      if (p.getSecond().equals(sn)) {
199        return true;
200      }
201    }
202    return false;
203  }
204
205  /**
206   * Encapsulates the choice of which HBCK2 method to call.
207   */
208  private abstract static class HBCKSCPScheduler {
209    abstract long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException;
210
211    @Override
212    public String toString() {
213      return this.getClass().getSimpleName();
214    }
215  }
216
217  /**
218   * Invokes {@code MasterRpcServices#scheduleServerCrashProcedure}.
219   */
220  private static class ScheduleServerCrashProcedure extends HBCKSCPScheduler {
221    @Override
222    public long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException {
223      MasterProtos.ScheduleServerCrashProcedureResponse response = master.getMasterRpcServices()
224        .scheduleServerCrashProcedure(null, MasterProtos.ScheduleServerCrashProcedureRequest
225          .newBuilder().addServerName(ProtobufUtil.toServerName(rsServerName)).build());
226      assertEquals(1, response.getPidCount());
227      return response.getPid(0);
228    }
229  }
230
231  /**
232   * Invokes {@code MasterRpcServices#scheduleSCPsForUnknownServers}.
233   */
234  private static class ScheduleSCPsForUnknownServers extends HBCKSCPScheduler {
235    @Override
236    long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException {
237      MasterProtos.ScheduleSCPsForUnknownServersResponse response =
238        master.getMasterRpcServices().scheduleSCPsForUnknownServers(null,
239          MasterProtos.ScheduleSCPsForUnknownServersRequest.newBuilder().build());
240      assertEquals(1, response.getPidCount());
241      return response.getPid(0);
242    }
243  }
244
245  /**
246   * Encapsulates how the target region is selected.
247   */
248  private static abstract class RegionSelector {
249    abstract boolean regionFilter(RegionInfo info);
250
251    abstract Exception regionFilterFailure();
252
253    @Override
254    public String toString() {
255      return this.getClass().getSimpleName();
256    }
257  }
258
259  /**
260   * Selects a non-meta region that is also a primary region.
261   */
262  private static class PrimaryNotMetaRegionSelector extends RegionSelector {
263    @Override
264    boolean regionFilter(final RegionInfo info) {
265      return !Objects.equals(TableName.META_TABLE_NAME, info.getTable())
266        && Objects.equals(RegionInfo.DEFAULT_REPLICA_ID, info.getReplicaId());
267    }
268
269    @Override
270    Exception regionFilterFailure() {
271      return new NoSuchElementException("Cannot locate a primary, non-meta region.");
272    }
273  }
274
275  /**
276   * Selects a non-meta region that is also a replica region.
277   */
278  private static class ReplicaNonMetaRegionSelector extends RegionSelector {
279    @Override
280    boolean regionFilter(RegionInfo info) {
281      return !Objects.equals(TableName.META_TABLE_NAME, info.getTable())
282        && !Objects.equals(RegionInfo.DEFAULT_REPLICA_ID, info.getReplicaId());
283    }
284
285    @Override
286    Exception regionFilterFailure() {
287      return new NoSuchElementException("Cannot locate a replica, non-meta region.");
288    }
289  }
290}