001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.procedure;
019
020import java.io.IOException;
021import java.util.ArrayList;
022import java.util.List;
023import java.util.stream.Collectors;
024import org.apache.hadoop.hbase.CatalogFamilyFormat;
025import org.apache.hadoop.hbase.ClientMetaTableAccessor;
026import org.apache.hadoop.hbase.HRegionLocation;
027import org.apache.hadoop.hbase.MetaTableAccessor;
028import org.apache.hadoop.hbase.RegionLocations;
029import org.apache.hadoop.hbase.ServerName;
030import org.apache.hadoop.hbase.TableName;
031import org.apache.hadoop.hbase.client.Connection;
032import org.apache.hadoop.hbase.client.RegionInfo;
033import org.apache.hadoop.hbase.client.Result;
034import org.apache.hadoop.hbase.master.RegionState;
035import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
036import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
037import org.apache.yetus.audience.InterfaceAudience;
038import org.slf4j.Logger;
039import org.slf4j.LoggerFactory;
040
041/**
042 * Acts like the super class in all cases except when no Regions found in the current Master
043 * in-memory context. In this latter case, when the call to super#getRegionsOnCrashedServer returns
044 * nothing, this SCP will scan hbase:meta for references to the passed ServerName. If any found,
045 * we'll clean them up.
046 * <p>
047 * This version of SCP is for external invocation as part of fix-up (e.g. HBCK2's
048 * scheduleRecoveries); the super class is used during normal recovery operations. It is for the
049 * case where meta has references to 'Unknown Servers', servers that are in hbase:meta but not in
050 * live-server or dead-server lists; i.e. Master and hbase:meta content have deviated. It should
051 * never happen in normal running cluster but if we do drop accounting of servers, we need a means
052 * of fix-up. Eventually, as part of normal CatalogJanitor task, rather than just identify these
053 * 'Unknown Servers', it would make repair, queuing something like this HBCKSCP to do cleanup,
054 * reassigning them so Master and hbase:meta are aligned again.
055 * <p>
056 * NOTE that this SCP is costly to run; does a full scan of hbase:meta.
057 * </p>
058 */
059@InterfaceAudience.Private
060public class HBCKServerCrashProcedure extends ServerCrashProcedure {
061  private static final Logger LOG = LoggerFactory.getLogger(HBCKServerCrashProcedure.class);
062
063  /**
064   * @param serverName     Name of the crashed server.
065   * @param shouldSplitWal True if we should split WALs as part of crashed server processing.
066   * @param carryingMeta   True if carrying hbase:meta table region.
067   */
068  public HBCKServerCrashProcedure(final MasterProcedureEnv env, final ServerName serverName,
069    final boolean shouldSplitWal, final boolean carryingMeta) {
070    super(env, serverName, shouldSplitWal, carryingMeta);
071  }
072
073  /**
074   * Used when deserializing from a procedure store; we'll construct one of these then call
075   * #deserializeStateData(InputStream). Do not use directly.
076   */
077  public HBCKServerCrashProcedure() {
078  }
079
080  /**
081   * If no Regions found in Master context, then we will search hbase:meta for references to the
082   * passed server. Operator may have passed ServerName because they have found references to
083   * 'Unknown Servers'. They are using HBCKSCP to clear them out.
084   */
085  @Override
086  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NP_NULL_ON_SOME_PATH_EXCEPTION",
087      justification = "FindBugs seems confused on ps in below.")
088  List<RegionInfo> getRegionsOnCrashedServer(MasterProcedureEnv env) {
089    // Super will return an immutable list (empty if nothing on this server).
090    List<RegionInfo> ris = super.getRegionsOnCrashedServer(env);
091    if (!ris.isEmpty()) {
092      return ris;
093    }
094    // Nothing in in-master context. Check for Unknown Server! in hbase:meta.
095    // If super list is empty, then allow that an operator scheduled an SCP because they are trying
096    // to purge 'Unknown Servers' -- servers that are neither online nor in dead servers
097    // list but that ARE in hbase:meta and so showing as unknown in places like 'HBCK Report'.
098    // This mis-accounting does not happen in normal circumstance but may arise in-extremis
099    // when cluster has been damaged in operation.
100    UnknownServerVisitor visitor =
101      new UnknownServerVisitor(env.getMasterServices().getConnection(), getServerName());
102    try {
103      MetaTableAccessor.scanMetaForTableRegions(env.getMasterServices().getConnection(), visitor,
104        null);
105    } catch (IOException ioe) {
106      LOG.warn("Failed scan of {} for 'Unknown Servers'", TableName.META_TABLE_NAME, ioe);
107      return ris;
108    }
109    // create the server state node too
110    env.getAssignmentManager().getRegionStates().createServer(getServerName());
111    LOG.info("Found {} mentions of {} in {} of OPEN/OPENING Regions: {}",
112      visitor.getReassigns().size(), getServerName(), TableName.META_TABLE_NAME, visitor
113        .getReassigns().stream().map(RegionInfo::getEncodedName).collect(Collectors.joining(",")));
114    return visitor.getReassigns();
115  }
116
117  /**
118   * Visitor for hbase:meta that 'fixes' Unknown Server issues. Collects a List of Regions to
119   * reassign as 'result'.
120   */
121  private static final class UnknownServerVisitor implements ClientMetaTableAccessor.Visitor {
122    private final List<RegionInfo> reassigns = new ArrayList<>();
123    private final ServerName unknownServerName;
124    private final Connection connection;
125
126    private UnknownServerVisitor(Connection connection, ServerName unknownServerName) {
127      this.connection = connection;
128      this.unknownServerName = unknownServerName;
129    }
130
131    @Override
132    public boolean visit(Result result) throws IOException {
133      RegionLocations rls = CatalogFamilyFormat.getRegionLocations(result);
134      if (rls == null) {
135        return true;
136      }
137      for (HRegionLocation hrl : rls.getRegionLocations()) {
138        if (hrl == null) {
139          continue;
140        }
141        if (hrl.getRegion() == null) {
142          continue;
143        }
144        if (hrl.getServerName() == null) {
145          continue;
146        }
147        if (!hrl.getServerName().equals(this.unknownServerName)) {
148          continue;
149        }
150        RegionState.State state = RegionStateStore.getRegionState(result, hrl.getRegion());
151        RegionState rs = new RegionState(hrl.getRegion(), state, hrl.getServerName());
152        if (rs.isClosing()) {
153          // Move region to CLOSED in hbase:meta.
154          LOG.info("Moving {} from CLOSING to CLOSED in {}",
155            hrl.getRegion().getRegionNameAsString(), TableName.META_TABLE_NAME);
156          try {
157            MetaTableAccessor.updateRegionState(this.connection, hrl.getRegion(),
158              RegionState.State.CLOSED);
159          } catch (IOException ioe) {
160            LOG.warn("Failed moving {} from CLOSING to CLOSED",
161              hrl.getRegion().getRegionNameAsString(), ioe);
162          }
163        } else if (rs.isOpening() || rs.isOpened()) {
164          this.reassigns.add(hrl.getRegion());
165        } else {
166          LOG.info("Passing {}", rs);
167        }
168      }
169      return true;
170    }
171
172    private List<RegionInfo> getReassigns() {
173      return this.reassigns;
174    }
175  }
176
177  /**
178   * The RegionStateNode will not have a location if a confirm of an OPEN fails. On fail, the
179   * RegionStateNode regionLocation is set to null. This is 'looser' than the test done in the
180   * superclass. The HBCKSCP has been scheduled by an operator via hbck2 probably at the behest of a
181   * report of an 'Unknown Server' in the 'HBCK Report'. Let the operators operation succeed even in
182   * case where the region location in the RegionStateNode is null.
183   */
184  @Override
185  protected boolean isMatchingRegionLocation(RegionStateNode rsn) {
186    return super.isMatchingRegionLocation(rsn) || rsn.getRegionLocation() == null;
187  }
188}