001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.procedure;
019
020import java.io.IOException;
021import java.util.ArrayList;
022import java.util.List;
023import java.util.stream.Collectors;
024
025import org.apache.hadoop.hbase.HRegionLocation;
026import org.apache.hadoop.hbase.MetaTableAccessor;
027import org.apache.hadoop.hbase.RegionLocations;
028import org.apache.hadoop.hbase.ServerName;
029import org.apache.hadoop.hbase.client.Connection;
030import org.apache.hadoop.hbase.client.RegionInfo;
031import org.apache.hadoop.hbase.client.Result;
032import org.apache.hadoop.hbase.master.RegionState;
033import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
034import org.apache.yetus.audience.InterfaceAudience;
035import org.slf4j.Logger;
036import org.slf4j.LoggerFactory;
037
038/**
039 * Acts like the super class in all cases except when no Regions found in the
040 * current Master in-memory context. In this latter case, when the call to
041 * super#getRegionsOnCrashedServer returns nothing, this SCP will scan
042 * hbase:meta for references to the passed ServerName. If any found, we'll
043 * clean them up.
044 *
045 * <p>This version of SCP is for external invocation as part of fix-up (e.g. HBCK2's
046 * scheduleRecoveries); the super class is used during normal recovery operations.
047 * It is for the case where meta has references to 'Unknown Servers',
048 * servers that are in hbase:meta but not in live-server or dead-server lists; i.e. Master
049 * and hbase:meta content have deviated. It should never happen in normal running
050 * cluster but if we do drop accounting of servers, we need a means of fix-up.
051 * Eventually, as part of normal CatalogJanitor task, rather than just identify
052 * these 'Unknown Servers', it would make repair, queuing something like this
053 * HBCKSCP to do cleanup, reassigning them so Master and hbase:meta are aligned again.
054 *
055 * <p>NOTE that this SCP is costly to run; does a full scan of hbase:meta.</p>
056 */
057@InterfaceAudience.Private
058public class HBCKServerCrashProcedure extends ServerCrashProcedure {
059  private static final Logger LOG = LoggerFactory.getLogger(HBCKServerCrashProcedure.class);
060
061  /**
062   * @param serverName Name of the crashed server.
063   * @param shouldSplitWal True if we should split WALs as part of crashed server processing.
064   * @param carryingMeta True if carrying hbase:meta table region.
065   */
066  public HBCKServerCrashProcedure(final MasterProcedureEnv env, final ServerName serverName,
067                              final boolean shouldSplitWal, final boolean carryingMeta) {
068    super(env, serverName, shouldSplitWal, carryingMeta);
069  }
070
071  /**
072   * Used when deserializing from a procedure store; we'll construct one of these then call
073   * #deserializeStateData(InputStream). Do not use directly.
074   */
075  public HBCKServerCrashProcedure() {}
076
077  /**
078   * If no Regions found in Master context, then we will search hbase:meta for references
079   * to the passed server. Operator may have passed ServerName because they have found
080   * references to 'Unknown Servers'. They are using HBCKSCP to clear them out.
081   */
082  @Override
083  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NP_NULL_ON_SOME_PATH_EXCEPTION",
084    justification="FindBugs seems confused on ps in below.")
085  List<RegionInfo> getRegionsOnCrashedServer(MasterProcedureEnv env) {
086    // Super will return an immutable list (empty if nothing on this server).
087    List<RegionInfo> ris = super.getRegionsOnCrashedServer(env);
088    if (!ris.isEmpty()) {
089      return ris;
090    }
091    // Nothing in in-master context. Check for Unknown Server! in hbase:meta.
092    // If super list is empty, then allow that an operator scheduled an SCP because they are trying
093    // to purge 'Unknown Servers' -- servers that are neither online nor in dead servers
094    // list but that ARE in hbase:meta and so showing as unknown in places like 'HBCK Report'.
095    // This mis-accounting does not happen in normal circumstance but may arise in-extremis
096    // when cluster has been damaged in operation.
097    UnknownServerVisitor visitor =
098        new UnknownServerVisitor(env.getMasterServices().getConnection(), getServerName());
099    try {
100      MetaTableAccessor.scanMetaForTableRegions(env.getMasterServices().getConnection(),
101          visitor, null);
102    } catch (IOException ioe) {
103      LOG.warn("Failed scan of hbase:meta for 'Unknown Servers'", ioe);
104      return ris;
105    }
106    LOG.info("Found {} mentions of {} in hbase:meta of OPEN/OPENING Regions: {}",
107        visitor.getReassigns().size(), getServerName(),
108        visitor.getReassigns().stream().map(RegionInfo::getEncodedName).
109            collect(Collectors.joining(",")));
110    return visitor.getReassigns();
111  }
112
113  /**
114   * Visitor for hbase:meta that 'fixes' Unknown Server issues. Collects
115   * a List of Regions to reassign as 'result'.
116   */
117  private static class UnknownServerVisitor implements MetaTableAccessor.Visitor {
118    private final List<RegionInfo> reassigns = new ArrayList<>();
119    private final ServerName unknownServerName;
120    private final Connection connection;
121
122    private UnknownServerVisitor(Connection connection, ServerName unknownServerName) {
123      this.connection = connection;
124      this.unknownServerName = unknownServerName;
125    }
126
127    @Override
128    public boolean visit(Result result) throws IOException {
129      RegionLocations rls = MetaTableAccessor.getRegionLocations(result);
130      if (rls == null) {
131        return true;
132      }
133      for (HRegionLocation hrl: rls.getRegionLocations()) {
134        if (hrl == null) {
135          continue;
136        }
137        if (hrl.getRegion() == null) {
138          continue;
139        }
140        if (hrl.getServerName() == null) {
141          continue;
142        }
143        if (!hrl.getServerName().equals(this.unknownServerName)) {
144          continue;
145        }
146        RegionState.State state = RegionStateStore.getRegionState(result, hrl.getRegion());
147        RegionState rs = new RegionState(hrl.getRegion(), state, hrl.getServerName());
148        if (rs.isClosing()) {
149          // Move region to CLOSED in hbase:meta.
150          LOG.info("Moving {} from CLOSING to CLOSED in hbase:meta",
151              hrl.getRegion().getRegionNameAsString());
152          try {
153            MetaTableAccessor.updateRegionState(this.connection, hrl.getRegion(),
154                RegionState.State.CLOSED);
155          } catch (IOException ioe) {
156            LOG.warn("Failed moving {} from CLOSING to CLOSED", ioe);
157          }
158        } else if (rs.isOpening() || rs.isOpened()) {
159          this.reassigns.add(hrl.getRegion());
160        } else {
161          LOG.info("Passing {}", rs);
162        }
163      }
164      return true;
165    }
166
167    private List<RegionInfo> getReassigns() {
168      return this.reassigns;
169    }
170  }
171}