001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master.procedure; 019 020import java.io.IOException; 021import java.util.ArrayList; 022import java.util.List; 023import java.util.stream.Collectors; 024 025import org.apache.hadoop.hbase.HRegionLocation; 026import org.apache.hadoop.hbase.MetaTableAccessor; 027import org.apache.hadoop.hbase.RegionLocations; 028import org.apache.hadoop.hbase.ServerName; 029import org.apache.hadoop.hbase.client.Connection; 030import org.apache.hadoop.hbase.client.RegionInfo; 031import org.apache.hadoop.hbase.client.Result; 032import org.apache.hadoop.hbase.master.RegionState; 033import org.apache.hadoop.hbase.master.assignment.RegionStateStore; 034import org.apache.yetus.audience.InterfaceAudience; 035import org.slf4j.Logger; 036import org.slf4j.LoggerFactory; 037 038/** 039 * Acts like the super class in all cases except when no Regions found in the 040 * current Master in-memory context. In this latter case, when the call to 041 * super#getRegionsOnCrashedServer returns nothing, this SCP will scan 042 * hbase:meta for references to the passed ServerName. If any found, we'll 043 * clean them up. 044 * 045 * <p>This version of SCP is for external invocation as part of fix-up (e.g. HBCK2's 046 * scheduleRecoveries); the super class is used during normal recovery operations. 047 * It is for the case where meta has references to 'Unknown Servers', 048 * servers that are in hbase:meta but not in live-server or dead-server lists; i.e. Master 049 * and hbase:meta content have deviated. It should never happen in normal running 050 * cluster but if we do drop accounting of servers, we need a means of fix-up. 051 * Eventually, as part of normal CatalogJanitor task, rather than just identify 052 * these 'Unknown Servers', it would make repair, queuing something like this 053 * HBCKSCP to do cleanup, reassigning them so Master and hbase:meta are aligned again. 054 * 055 * <p>NOTE that this SCP is costly to run; does a full scan of hbase:meta.</p> 056 */ 057@InterfaceAudience.Private 058public class HBCKServerCrashProcedure extends ServerCrashProcedure { 059 private static final Logger LOG = LoggerFactory.getLogger(HBCKServerCrashProcedure.class); 060 061 /** 062 * @param serverName Name of the crashed server. 063 * @param shouldSplitWal True if we should split WALs as part of crashed server processing. 064 * @param carryingMeta True if carrying hbase:meta table region. 065 */ 066 public HBCKServerCrashProcedure(final MasterProcedureEnv env, final ServerName serverName, 067 final boolean shouldSplitWal, final boolean carryingMeta) { 068 super(env, serverName, shouldSplitWal, carryingMeta); 069 } 070 071 /** 072 * Used when deserializing from a procedure store; we'll construct one of these then call 073 * #deserializeStateData(InputStream). Do not use directly. 074 */ 075 public HBCKServerCrashProcedure() {} 076 077 /** 078 * If no Regions found in Master context, then we will search hbase:meta for references 079 * to the passed server. Operator may have passed ServerName because they have found 080 * references to 'Unknown Servers'. They are using HBCKSCP to clear them out. 081 */ 082 @Override 083 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NP_NULL_ON_SOME_PATH_EXCEPTION", 084 justification="FindBugs seems confused on ps in below.") 085 List<RegionInfo> getRegionsOnCrashedServer(MasterProcedureEnv env) { 086 // Super will return an immutable list (empty if nothing on this server). 087 List<RegionInfo> ris = super.getRegionsOnCrashedServer(env); 088 if (!ris.isEmpty()) { 089 return ris; 090 } 091 // Nothing in in-master context. Check for Unknown Server! in hbase:meta. 092 // If super list is empty, then allow that an operator scheduled an SCP because they are trying 093 // to purge 'Unknown Servers' -- servers that are neither online nor in dead servers 094 // list but that ARE in hbase:meta and so showing as unknown in places like 'HBCK Report'. 095 // This mis-accounting does not happen in normal circumstance but may arise in-extremis 096 // when cluster has been damaged in operation. 097 UnknownServerVisitor visitor = 098 new UnknownServerVisitor(env.getMasterServices().getConnection(), getServerName()); 099 try { 100 MetaTableAccessor.scanMetaForTableRegions(env.getMasterServices().getConnection(), 101 visitor, null); 102 } catch (IOException ioe) { 103 LOG.warn("Failed scan of hbase:meta for 'Unknown Servers'", ioe); 104 return ris; 105 } 106 LOG.info("Found {} mentions of {} in hbase:meta of OPEN/OPENING Regions: {}", 107 visitor.getReassigns().size(), getServerName(), 108 visitor.getReassigns().stream().map(RegionInfo::getEncodedName). 109 collect(Collectors.joining(","))); 110 return visitor.getReassigns(); 111 } 112 113 /** 114 * Visitor for hbase:meta that 'fixes' Unknown Server issues. Collects 115 * a List of Regions to reassign as 'result'. 116 */ 117 private static class UnknownServerVisitor implements MetaTableAccessor.Visitor { 118 private final List<RegionInfo> reassigns = new ArrayList<>(); 119 private final ServerName unknownServerName; 120 private final Connection connection; 121 122 private UnknownServerVisitor(Connection connection, ServerName unknownServerName) { 123 this.connection = connection; 124 this.unknownServerName = unknownServerName; 125 } 126 127 @Override 128 public boolean visit(Result result) throws IOException { 129 RegionLocations rls = MetaTableAccessor.getRegionLocations(result); 130 if (rls == null) { 131 return true; 132 } 133 for (HRegionLocation hrl: rls.getRegionLocations()) { 134 if (hrl == null) { 135 continue; 136 } 137 if (hrl.getRegion() == null) { 138 continue; 139 } 140 if (hrl.getServerName() == null) { 141 continue; 142 } 143 if (!hrl.getServerName().equals(this.unknownServerName)) { 144 continue; 145 } 146 RegionState.State state = RegionStateStore.getRegionState(result, hrl.getRegion()); 147 RegionState rs = new RegionState(hrl.getRegion(), state, hrl.getServerName()); 148 if (rs.isClosing()) { 149 // Move region to CLOSED in hbase:meta. 150 LOG.info("Moving {} from CLOSING to CLOSED in hbase:meta", 151 hrl.getRegion().getRegionNameAsString()); 152 try { 153 MetaTableAccessor.updateRegionState(this.connection, hrl.getRegion(), 154 RegionState.State.CLOSED); 155 } catch (IOException ioe) { 156 LOG.warn("Failed moving {} from CLOSING to CLOSED", ioe); 157 } 158 } else if (rs.isOpening() || rs.isOpened()) { 159 this.reassigns.add(hrl.getRegion()); 160 } else { 161 LOG.info("Passing {}", rs); 162 } 163 } 164 return true; 165 } 166 167 private List<RegionInfo> getReassigns() { 168 return this.reassigns; 169 } 170 } 171}