/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master.procedure;

import java.io.IOException;
import java.util.Set;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.assignment.AssignProcedure;
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
import org.apache.hadoop.hbase.master.assignment.RegionTransitionProcedure;
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
import org.apache.hadoop.hbase.procedure2.StateMachineProcedure;
import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
import org.apache.yetus.audience.InterfaceAudience;
import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;

import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RecoverMetaState;

/**
 * This procedure recovers meta from a prior shutdown/crash of a server, and brings meta online by
 * assigning the meta region(s). Any place that accesses meta and requires it to be online should
 * submit this procedure instead of duplicating the steps to recover meta in code.
 * <p/>
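 * A minimal usage sketch (hypothetical caller: {@code procExec} stands in for the master's
 * {@code ProcedureExecutor} and the surrounding variables are assumed, not part of this class):
 * <pre>
 * ProcedurePrepareLatch latch = ProcedurePrepareLatch.createLatch();
 * procExec.submitProcedure(new RecoverMetaProcedure(failedMetaServer, true, latch));
 * latch.await(); // the latch is released in completionCleanup() when the procedure finishes
 * </pre>
 * <p/>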
 * @deprecated Do not use any more; kept here only for compatibility. The recovery work is now
 *             done directly in {@link ServerCrashProcedure}, and the initial work for the meta
 *             table is done by {@link InitMetaProcedure}.
 * @see ServerCrashProcedure
 * @see InitMetaProcedure
 */
@Deprecated
@InterfaceAudience.Private
public class RecoverMetaProcedure
    extends StateMachineProcedure<MasterProcedureEnv, MasterProcedureProtos.RecoverMetaState>
    implements MetaProcedureInterface {
  private static final Logger LOG = LoggerFactory.getLogger(RecoverMetaProcedure.class);

  private ServerName failedMetaServer;
  private boolean shouldSplitWal;
  private int replicaId;

  private final ProcedurePrepareLatch syncLatch;
  private MasterServices master;

  /**
   * Call this constructor to queue up a {@link RecoverMetaProcedure} in response to the crash of
   * a meta-carrying server.
   * @param failedMetaServer the failed/crashed region server that was carrying meta
   * @param shouldSplitLog whether to split the log (WAL) files of the meta region
   */
  public RecoverMetaProcedure(final ServerName failedMetaServer, final boolean shouldSplitLog) {
    this(failedMetaServer, shouldSplitLog, null);
  }

  /**
   * Constructor with latch, for blocking/sync usage.
   */
  public RecoverMetaProcedure(final ServerName failedMetaServer, final boolean shouldSplitLog,
      final ProcedurePrepareLatch latch) {
    this.failedMetaServer = failedMetaServer;
    this.shouldSplitWal = shouldSplitLog;
    this.replicaId = RegionInfo.DEFAULT_REPLICA_ID;
    this.syncLatch = latch;
  }

  /**
   * This constructor is also used when deserializing from a procedure store; we'll construct one
   * of these then call {@link #deserializeStateData(ProcedureStateSerializer)}. Do not use
   * directly.
   */
  public RecoverMetaProcedure() {
    this(null, false);
  }

  @Override
  protected Flow executeFromState(MasterProcedureEnv env,
      MasterProcedureProtos.RecoverMetaState state)
      throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
    prepare(env);

    if (!isRunRequired()) {
      LOG.info(this + "; Meta already initialized. Skipping run");
      return Flow.NO_MORE_STATE;
    }

    try {
      switch (state) {
        case RECOVER_META_PREPARE:
          // If the master is going down or the cluster is not up, skip this assign by returning
          // NO_MORE_STATE
          if (!master.isClusterUp()) {
            String msg = "Cluster not up! Skipping hbase:meta assign.";
            LOG.warn(msg);
            return Flow.NO_MORE_STATE;
          }
          if (master.isStopping() || master.isStopped()) {
            String msg = "Master stopping=" + master.isStopping() + ", stopped=" +
                master.isStopped() + "; skipping hbase:meta assign.";
            LOG.warn(msg);
            return Flow.NO_MORE_STATE;
          }
          setNextState(RecoverMetaState.RECOVER_META_SPLIT_LOGS);
          break;
        case RECOVER_META_SPLIT_LOGS:
          LOG.info("Start " + this);
          if (shouldSplitWal) {
            // TODO: Matteo. We BLOCK here, but this is the most important thing to be doing at
            // this moment.
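            // Mark meta as "splitting" in the region states, split just the meta WAL(s) of the
            // dead server (splitMetaLog handles only meta edits, not the server's entire WAL
            // set), then mark the split done so meta can be re-assigned with edits recovered.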
            AssignmentManager am = env.getMasterServices().getAssignmentManager();
            if (failedMetaServer != null) {
              am.getRegionStates().metaLogSplitting(failedMetaServer);
              master.getMasterWalManager().splitMetaLog(failedMetaServer);
              am.getRegionStates().metaLogSplit(failedMetaServer);
            } else {
              ServerName serverName =
                  master.getMetaTableLocator().getMetaRegionLocation(master.getZooKeeper());
              Set<ServerName> previouslyFailedServers =
                  master.getMasterWalManager().getFailedServersFromLogFolders();
              if (serverName != null && previouslyFailedServers.contains(serverName)) {
                am.getRegionStates().metaLogSplitting(serverName);
                master.getMasterWalManager().splitMetaLog(serverName);
                am.getRegionStates().metaLogSplit(serverName);
              }
            }
          }
          setNextState(RecoverMetaState.RECOVER_META_ASSIGN_REGIONS);
          break;
        case RECOVER_META_ASSIGN_REGIONS:
          RegionInfo hri = RegionReplicaUtil.getRegionInfoForReplica(
              RegionInfoBuilder.FIRST_META_REGIONINFO, this.replicaId);

          AssignProcedure metaAssignProcedure;
          AssignmentManager am = master.getAssignmentManager();
          if (failedMetaServer != null) {
            handleRIT(env, hri, this.failedMetaServer);
            LOG.info(this + "; Assigning meta with new plan; previous server=" + failedMetaServer);
            metaAssignProcedure = am.createAssignProcedure(hri);
          } else {
            // get server carrying meta from zk
            ServerName metaServer =
                MetaTableLocator.getMetaRegionState(master.getZooKeeper()).getServerName();
            LOG.info(this + "; Retaining meta assignment to server=" + metaServer);
            metaAssignProcedure = am.createAssignProcedure(hri, metaServer);
          }

          addChildProcedure(metaAssignProcedure);
          return Flow.NO_MORE_STATE;

        default:
          throw new UnsupportedOperationException("unhandled state=" + state);
      }
    } catch (IOException | KeeperException e) {
      LOG.warn(this + "; Failed state=" + state + ", retry " + this + "; cycles=" +
          getCycles(), e);
    }
    return Flow.HAS_MORE_STATE;
  }
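
  /*
   * Retry note: when an IOException or KeeperException is caught above, the catch only logs and
   * falls through to return Flow.HAS_MORE_STATE without advancing the state, so the procedure
   * framework re-executes the same state on the next cycle; "cycles=" in the log counts retries.
   */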

  /**
   * Is the region stuck assigning to this failedMetaServer? If so, cancel the call just as we do
   * over in ServerCrashProcedure#handleRIT, except there is less to do here; less context to
   * carry.
   */
  // NOTE: Make sure any fix or improvement done here is also done in SCP#handleRIT; the methods
  // have overlap.
  private void handleRIT(MasterProcedureEnv env, RegionInfo ri, ServerName crashedServerName) {
    AssignmentManager am = env.getAssignmentManager();
    RegionTransitionProcedure rtp = am.getRegionStates().getRegionTransitionProcedure(ri);
    if (rtp == null) {
      return; // Nothing to do. Not in RIT.
    }
    // Make sure the RIT is against this crashed server. In the case where there are many
    // processings of a crashed server -- backed up for whatever reason (slow WAL split) --
    // then a previous SCP may have already failed an assign, etc., and it may have a
    // new location target; DO NOT fail these else we make for assign flux.
    ServerName rtpServerName = rtp.getServer(env);
    if (rtpServerName == null) {
      LOG.warn("RIT with ServerName null! " + rtp);
    } else if (rtpServerName.equals(crashedServerName)) {
      LOG.info("pid=" + getProcId() + " found RIT " + rtp + "; " +
          rtp.getRegionState(env).toShortString());
      rtp.remoteCallFailed(env, crashedServerName,
          new ServerCrashException(getProcId(), crashedServerName));
    }
  }

  @Override
  protected void rollbackState(MasterProcedureEnv env,
      MasterProcedureProtos.RecoverMetaState recoverMetaState)
      throws IOException, InterruptedException {
    // Can't rollback
    throw new UnsupportedOperationException("unhandled state=" + recoverMetaState);
  }

  @Override
  protected MasterProcedureProtos.RecoverMetaState getState(int stateId) {
    return RecoverMetaState.forNumber(stateId);
  }

  @Override
  protected int getStateId(MasterProcedureProtos.RecoverMetaState recoverMetaState) {
    return recoverMetaState.getNumber();
  }

  @Override
  protected MasterProcedureProtos.RecoverMetaState getInitialState() {
    return RecoverMetaState.RECOVER_META_PREPARE;
  }

  @Override
  protected void toStringClassDetails(StringBuilder sb) {
    sb.append(getClass().getSimpleName());
    sb.append(" failedMetaServer=");
    sb.append(failedMetaServer);
    sb.append(", splitWal=");
    sb.append(shouldSplitWal);
  }

  @Override
  protected void serializeStateData(ProcedureStateSerializer serializer)
      throws IOException {
    super.serializeStateData(serializer);
    MasterProcedureProtos.RecoverMetaStateData.Builder state =
        MasterProcedureProtos.RecoverMetaStateData.newBuilder().setShouldSplitWal(shouldSplitWal);
    if (failedMetaServer != null) {
      state.setFailedMetaServer(ProtobufUtil.toServerName(failedMetaServer));
    }
    state.setReplicaId(replicaId);
    serializer.serialize(state.build());
  }

  @Override
  protected void deserializeStateData(ProcedureStateSerializer serializer)
      throws IOException {
    super.deserializeStateData(serializer);
    MasterProcedureProtos.RecoverMetaStateData state =
        serializer.deserialize(MasterProcedureProtos.RecoverMetaStateData.class);
    this.shouldSplitWal = state.hasShouldSplitWal() && state.getShouldSplitWal();
    this.failedMetaServer = state.hasFailedMetaServer() ?
        ProtobufUtil.toServerName(state.getFailedMetaServer()) : null;
    this.replicaId = state.hasReplicaId() ? state.getReplicaId() : RegionInfo.DEFAULT_REPLICA_ID;
  }

  @Override
  protected LockState acquireLock(MasterProcedureEnv env) {
    if (env.getProcedureScheduler().waitMetaExclusiveLock(this)) {
      return LockState.LOCK_EVENT_WAIT;
    }
    return LockState.LOCK_ACQUIRED;
  }

  @Override
  protected void releaseLock(MasterProcedureEnv env) {
    env.getProcedureScheduler().wakeMetaExclusiveLock(this);
  }

  @Override
  protected void completionCleanup(MasterProcedureEnv env) {
    ProcedurePrepareLatch.releaseLatch(syncLatch, this);
  }

  /**
   * @return true if failedMetaServer is not null (a meta-carrying server crashed) or meta is not
   *         yet assigned/initialized
   */
  private boolean isRunRequired() {
    return failedMetaServer != null || !master.getAssignmentManager().isMetaAssigned();
  }

  /**
   * Prepare for execution.
   */
  private void prepare(MasterProcedureEnv env) {
    if (master == null) {
      master = env.getMasterServices();
      Preconditions.checkArgument(master != null);
    }
  }
}