001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master.procedure; 019 020import java.io.IOException; 021import java.util.ArrayList; 022import java.util.List; 023import java.util.stream.Collectors; 024import org.apache.hadoop.hbase.CatalogFamilyFormat; 025import org.apache.hadoop.hbase.ClientMetaTableAccessor; 026import org.apache.hadoop.hbase.HRegionLocation; 027import org.apache.hadoop.hbase.MetaTableAccessor; 028import org.apache.hadoop.hbase.RegionLocations; 029import org.apache.hadoop.hbase.ServerName; 030import org.apache.hadoop.hbase.TableName; 031import org.apache.hadoop.hbase.client.Connection; 032import org.apache.hadoop.hbase.client.RegionInfo; 033import org.apache.hadoop.hbase.client.Result; 034import org.apache.hadoop.hbase.master.RegionState; 035import org.apache.hadoop.hbase.master.assignment.RegionStateNode; 036import org.apache.hadoop.hbase.master.assignment.RegionStateStore; 037import org.apache.yetus.audience.InterfaceAudience; 038import org.slf4j.Logger; 039import org.slf4j.LoggerFactory; 040 041/** 042 * Acts like the super class in all cases except when no Regions found in the current Master 043 * in-memory context. In this latter case, when the call to super#getRegionsOnCrashedServer returns 044 * nothing, this SCP will scan hbase:meta for references to the passed ServerName. If any found, 045 * we'll clean them up. 046 * <p> 047 * This version of SCP is for external invocation as part of fix-up (e.g. HBCK2's 048 * scheduleRecoveries); the super class is used during normal recovery operations. It is for the 049 * case where meta has references to 'Unknown Servers', servers that are in hbase:meta but not in 050 * live-server or dead-server lists; i.e. Master and hbase:meta content have deviated. It should 051 * never happen in normal running cluster but if we do drop accounting of servers, we need a means 052 * of fix-up. Eventually, as part of normal CatalogJanitor task, rather than just identify these 053 * 'Unknown Servers', it would make repair, queuing something like this HBCKSCP to do cleanup, 054 * reassigning them so Master and hbase:meta are aligned again. 055 * <p> 056 * NOTE that this SCP is costly to run; does a full scan of hbase:meta. 057 * </p> 058 */ 059@InterfaceAudience.Private 060public class HBCKServerCrashProcedure extends ServerCrashProcedure { 061 private static final Logger LOG = LoggerFactory.getLogger(HBCKServerCrashProcedure.class); 062 063 /** 064 * @param serverName Name of the crashed server. 065 * @param shouldSplitWal True if we should split WALs as part of crashed server processing. 066 * @param carryingMeta True if carrying hbase:meta table region. 067 */ 068 public HBCKServerCrashProcedure(final MasterProcedureEnv env, final ServerName serverName, 069 final boolean shouldSplitWal, final boolean carryingMeta) { 070 super(env, serverName, shouldSplitWal, carryingMeta); 071 } 072 073 /** 074 * Used when deserializing from a procedure store; we'll construct one of these then call 075 * #deserializeStateData(InputStream). Do not use directly. 076 */ 077 public HBCKServerCrashProcedure() { 078 } 079 080 /** 081 * If no Regions found in Master context, then we will search hbase:meta for references to the 082 * passed server. Operator may have passed ServerName because they have found references to 083 * 'Unknown Servers'. They are using HBCKSCP to clear them out. 084 */ 085 @Override 086 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NP_NULL_ON_SOME_PATH_EXCEPTION", 087 justification = "FindBugs seems confused on ps in below.") 088 List<RegionInfo> getRegionsOnCrashedServer(MasterProcedureEnv env) { 089 // Super will return an immutable list (empty if nothing on this server). 090 List<RegionInfo> ris = super.getRegionsOnCrashedServer(env); 091 if (!ris.isEmpty()) { 092 return ris; 093 } 094 // Nothing in in-master context. Check for Unknown Server! in hbase:meta. 095 // If super list is empty, then allow that an operator scheduled an SCP because they are trying 096 // to purge 'Unknown Servers' -- servers that are neither online nor in dead servers 097 // list but that ARE in hbase:meta and so showing as unknown in places like 'HBCK Report'. 098 // This mis-accounting does not happen in normal circumstance but may arise in-extremis 099 // when cluster has been damaged in operation. 100 UnknownServerVisitor visitor = 101 new UnknownServerVisitor(env.getMasterServices().getConnection(), getServerName()); 102 try { 103 MetaTableAccessor.scanMetaForTableRegions(env.getMasterServices().getConnection(), visitor, 104 null); 105 } catch (IOException ioe) { 106 LOG.warn("Failed scan of {} for 'Unknown Servers'", TableName.META_TABLE_NAME, ioe); 107 return ris; 108 } 109 // create the server state node too 110 env.getAssignmentManager().getRegionStates().createServer(getServerName()); 111 LOG.info("Found {} mentions of {} in {} of OPEN/OPENING Regions: {}", 112 visitor.getReassigns().size(), getServerName(), TableName.META_TABLE_NAME, visitor 113 .getReassigns().stream().map(RegionInfo::getEncodedName).collect(Collectors.joining(","))); 114 return visitor.getReassigns(); 115 } 116 117 /** 118 * Visitor for hbase:meta that 'fixes' Unknown Server issues. Collects a List of Regions to 119 * reassign as 'result'. 120 */ 121 private static final class UnknownServerVisitor implements ClientMetaTableAccessor.Visitor { 122 private final List<RegionInfo> reassigns = new ArrayList<>(); 123 private final ServerName unknownServerName; 124 private final Connection connection; 125 126 private UnknownServerVisitor(Connection connection, ServerName unknownServerName) { 127 this.connection = connection; 128 this.unknownServerName = unknownServerName; 129 } 130 131 @Override 132 public boolean visit(Result result) throws IOException { 133 RegionLocations rls = CatalogFamilyFormat.getRegionLocations(result); 134 if (rls == null) { 135 return true; 136 } 137 for (HRegionLocation hrl : rls.getRegionLocations()) { 138 if (hrl == null) { 139 continue; 140 } 141 if (hrl.getRegion() == null) { 142 continue; 143 } 144 if (hrl.getServerName() == null) { 145 continue; 146 } 147 if (!hrl.getServerName().equals(this.unknownServerName)) { 148 continue; 149 } 150 RegionState.State state = RegionStateStore.getRegionState(result, hrl.getRegion()); 151 RegionState rs = new RegionState(hrl.getRegion(), state, hrl.getServerName()); 152 if (rs.isClosing()) { 153 // Move region to CLOSED in hbase:meta. 154 LOG.info("Moving {} from CLOSING to CLOSED in {}", 155 hrl.getRegion().getRegionNameAsString(), TableName.META_TABLE_NAME); 156 try { 157 MetaTableAccessor.updateRegionState(this.connection, hrl.getRegion(), 158 RegionState.State.CLOSED); 159 } catch (IOException ioe) { 160 LOG.warn("Failed moving {} from CLOSING to CLOSED", 161 hrl.getRegion().getRegionNameAsString(), ioe); 162 } 163 } else if (rs.isOpening() || rs.isOpened()) { 164 this.reassigns.add(hrl.getRegion()); 165 } else { 166 LOG.info("Passing {}", rs); 167 } 168 } 169 return true; 170 } 171 172 private List<RegionInfo> getReassigns() { 173 return this.reassigns; 174 } 175 } 176 177 /** 178 * The RegionStateNode will not have a location if a confirm of an OPEN fails. On fail, the 179 * RegionStateNode regionLocation is set to null. This is 'looser' than the test done in the 180 * superclass. The HBCKSCP has been scheduled by an operator via hbck2 probably at the behest of a 181 * report of an 'Unknown Server' in the 'HBCK Report'. Let the operators operation succeed even in 182 * case where the region location in the RegionStateNode is null. 183 */ 184 @Override 185 protected boolean isMatchingRegionLocation(RegionStateNode rsn) { 186 return super.isMatchingRegionLocation(rsn) || rsn.getRegionLocation() == null; 187 } 188}