1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master.procedure;
19  
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.io.InterruptedIOException;
23  import java.io.OutputStream;
24  import java.util.ArrayList;
25  import java.util.Collection;
26  import java.util.HashMap;
27  import java.util.HashSet;
28  import java.util.List;
29  import java.util.Map;
30  import java.util.Set;
31  import java.util.concurrent.locks.Lock;
32  
33  import org.apache.commons.logging.Log;
34  import org.apache.commons.logging.LogFactory;
35  import org.apache.hadoop.hbase.HConstants;
36  import org.apache.hadoop.hbase.HRegionInfo;
37  import org.apache.hadoop.hbase.ServerName;
38  import org.apache.hadoop.hbase.client.ClusterConnection;
39  import org.apache.hadoop.hbase.client.RegionReplicaUtil;
40  import org.apache.hadoop.hbase.master.AssignmentManager;
41  import org.apache.hadoop.hbase.master.MasterFileSystem;
42  import org.apache.hadoop.hbase.master.MasterServices;
43  import org.apache.hadoop.hbase.master.RegionState;
44  import org.apache.hadoop.hbase.master.RegionStates;
45  import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
46  import org.apache.hadoop.hbase.procedure2.StateMachineProcedure;
47  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
48  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.RegionInfo;
49  import org.apache.hadoop.hbase.protobuf.generated.MasterProcedureProtos;
50  import org.apache.hadoop.hbase.protobuf.generated.MasterProcedureProtos.ServerCrashState;
51  import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
52  import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.SplitLogTask.RecoveryMode;
53  import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
54  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
55  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
56  import org.apache.hadoop.util.StringUtils;
57  import org.apache.zookeeper.KeeperException;
58  
59  /**
60   * Handle crashed server. This is a port to ProcedureV2 of what used to be euphemistically called
61   * ServerShutdownHandler.
62   *
63   * <p>The procedure flow varies dependent on whether meta is assigned, if we are
64   * doing distributed log replay versus distributed log splitting, and if we are to split logs at
65   * all.
66   *
67   * <p>This procedure asks that all crashed servers get processed equally; we yield after the
68   * completion of each successful flow step. We do this so that we do not 'deadlock' waiting on
69   * a region assignment before we can replay its edits, which could happen if a region has moved
70   * and there are edits awaiting replay on two different crashed servers.
71   *
72   * <p>TODO: ASSIGN and WAIT_ON_ASSIGN (at least) are not idempotent. Revisit when assign is pv2.
73   * TODO: We do not have special handling for system tables.
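     *
     * <p>Illustrative sketch only (not code in this class; the caller and method names here are
     * assumptions about the master's server-expiration path, which is what normally queues one of
     * these procedures):
     * <pre>
     *   ProcedureExecutor&lt;MasterProcedureEnv&gt; executor = master.getMasterProcedureExecutor();
     *   executor.submitProcedure(new ServerCrashProcedure(executor.getEnvironment(), crashedServer,
     *       shouldSplitWal, carryingMeta));
     * </pre>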
74   */
75  public class ServerCrashProcedure
76  extends StateMachineProcedure<MasterProcedureEnv, ServerCrashState>
77  implements ServerProcedureInterface {
78    private static final Log LOG = LogFactory.getLog(ServerCrashProcedure.class);
79  
80    /**
81     * Configuration key to set how long to wait in ms doing a quick check on meta state.
82     */
83    public static final String KEY_SHORT_WAIT_ON_META =
84        "hbase.master.servercrash.short.wait.on.meta.ms";
85  
86    public static final int DEFAULT_SHORT_WAIT_ON_META = 1000;
87  
88    /**
89     * Configuration key to set how many retries to cycle before we give up on meta.
90     * Each attempt will wait at least {@link #KEY_SHORT_WAIT_ON_META} milliseconds.
91     */
92    public static final String KEY_RETRIES_ON_META =
93        "hbase.master.servercrash.meta.retries";
94  
95    public static final int DEFAULT_RETRIES_ON_META = 10;
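      // With the defaults above, verifyAndAssignMetaWithRetries() sleeps at most roughly
      // DEFAULT_RETRIES_ON_META * DEFAULT_SHORT_WAIT_ON_META = 10 * 1000ms, i.e. ~10 seconds
      // (plus the time the attempts themselves take) before giving up and aborting.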
96  
97    /**
98     * Configuration key to set how long to wait in ms on regions in transition.
99     */
100   public static final String KEY_WAIT_ON_RIT =
101       "hbase.master.servercrash.wait.on.rit.ms";
102 
103   public static final int DEFAULT_WAIT_ON_RIT = 30000;
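      // All three keys are read from the master Configuration as each step runs, e.g.
      // env.getMasterConfiguration().getInt(KEY_WAIT_ON_RIT, DEFAULT_WAIT_ON_RIT) in waitOnAssign().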
104 
105   private static final Set<HRegionInfo> META_REGION_SET = new HashSet<HRegionInfo>();
106   static {
107     META_REGION_SET.add(HRegionInfo.FIRST_META_REGIONINFO);
108   }
109 
110   /**
111    * Name of the crashed server to process.
112    */
113   private ServerName serverName;
114 
115   /**
116    * Whether DeadServer knows that we are processing it.
117    */
118   private boolean notifiedDeadServer = false;
119 
120   /**
121    * Regions that were on the crashed server.
122    */
123   private Set<HRegionInfo> regionsOnCrashedServer;
124 
125   /**
126    * Regions assigned. Usually some subset of {@link #regionsOnCrashedServer}.
127    */
128   private List<HRegionInfo> regionsAssigned;
129 
130   private boolean distributedLogReplay = false;
131   private boolean carryingMeta = false;
132   private boolean shouldSplitWal;
133 
134   /**
135    * Cycles on same state. Good for figuring if we are stuck.
136    */
137   private int cycles = 0;
138 
139   /**
140    * Ordinal of the previous state. So we can tell if we are progressing or not. TODO: if useful,
141    * move this back up into StateMachineProcedure
142    */
143   private int previousState;
144 
145   /**
146    * Call this constructor when queuing up a Procedure.
147    * @param serverName Name of the crashed server.
148    * @param shouldSplitWal True if we should split WALs as part of crashed server processing.
149    * @param carryingMeta True if carrying hbase:meta table region.
150    */
151   public ServerCrashProcedure(
152       final MasterProcedureEnv env,
153       final ServerName serverName,
154       final boolean shouldSplitWal,
155       final boolean carryingMeta) {
156     this.serverName = serverName;
157     this.shouldSplitWal = shouldSplitWal;
158     this.carryingMeta = carryingMeta;
159     this.setOwner(env.getRequestUser().getShortName());
160   }
161 
162   /**
163    * Used when deserializing from a procedure store; we'll construct one of these then call
164    * {@link #deserializeStateData(InputStream)}. Do not use directly.
165    */
166   public ServerCrashProcedure() {
167     super();
168   }
169 
170   private void throwProcedureYieldException(final String msg) throws ProcedureYieldException {
171     String logMsg = msg + "; cycle=" + this.cycles + ", running for " +
172         StringUtils.formatTimeDiff(System.currentTimeMillis(), getStartTime());
173     // The procedure executor logs ProcedureYieldException at trace level. For now, log these
174     // yields for server crash processing at DEBUG. Revisit when stable.
175     if (LOG.isDebugEnabled()) LOG.debug(logMsg);
176     throw new ProcedureYieldException(logMsg);
177   }
178 
179   @Override
180   protected Flow executeFromState(MasterProcedureEnv env, ServerCrashState state)
181       throws ProcedureYieldException {
182     if (LOG.isTraceEnabled()) {
183       LOG.trace(state);
184     }
185     // Keep running count of cycles
186     if (state.ordinal() != this.previousState) {
187       this.previousState = state.ordinal();
188       this.cycles = 0;
189     } else {
190       this.cycles++;
191     }
192     MasterServices services = env.getMasterServices();
193     // Is master fully online? If not, yield. No processing of servers unless master is up
194     if (!services.getAssignmentManager().isFailoverCleanupDone()) {
195       throwProcedureYieldException("Waiting on master failover to complete");
196     }
197     // HBASE-14802
198     // If we have not yet notified that we are processing a dead server, we should do so now.
199     if (!notifiedDeadServer) {
200       services.getServerManager().getDeadServers().notifyServer(serverName);
201       notifiedDeadServer = true;
202     }
203 
204     try {
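          // Typical flows through the states below:
          //   log splitting (DLS): START -> [PROCESS_META ->] GET_REGIONS -> SPLIT_LOGS ->
          //                        ASSIGN -> FINISH
          //   log replay (DLR):    START -> [PROCESS_META ->] GET_REGIONS -> PREPARE_LOG_REPLAY ->
          //                        ASSIGN -> WAIT_ON_ASSIGN -> SPLIT_LOGS -> FINISH
          //   (if shouldSplitWal is false, GET_REGIONS goes straight to ASSIGN)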
205       switch (state) {
206       case SERVER_CRASH_START:
207         LOG.info("Start processing crashed " + this.serverName);
208         start(env);
209         // If carrying meta, process it first. Else, get list of regions on crashed server.
210         if (this.carryingMeta) setNextState(ServerCrashState.SERVER_CRASH_PROCESS_META);
211         else setNextState(ServerCrashState.SERVER_CRASH_GET_REGIONS);
212         break;
213 
214       case SERVER_CRASH_GET_REGIONS:
215         // If hbase:meta is not assigned, yield.
216         if (!isMetaAssignedQuickTest(env)) {
217           // isMetaAssignedQuickTest does not really wait. Let's delay a little before
218           // another round of execution.
219           long wait =
220               env.getMasterConfiguration().getLong(KEY_SHORT_WAIT_ON_META,
221                 DEFAULT_SHORT_WAIT_ON_META);
222           wait = wait / 10;
223           Thread.sleep(wait);
224           throwProcedureYieldException("Waiting on hbase:meta assignment");
225         }
226         this.regionsOnCrashedServer =
227             services.getAssignmentManager().getRegionStates().getServerRegions(this.serverName);
228         // Where to go next? Depends on whether we should split logs at all or if we should do
229         // distributed log splitting (DLS) vs distributed log replay (DLR).
230         if (!this.shouldSplitWal) {
231           setNextState(ServerCrashState.SERVER_CRASH_ASSIGN);
232         } else if (this.distributedLogReplay) {
233           setNextState(ServerCrashState.SERVER_CRASH_PREPARE_LOG_REPLAY);
234         } else {
235           setNextState(ServerCrashState.SERVER_CRASH_SPLIT_LOGS);
236         }
237         break;
238 
239       case SERVER_CRASH_PROCESS_META:
240         // If we fail processing hbase:meta, yield.
241         if (!processMeta(env)) {
242           throwProcedureYieldException("Waiting on regions-in-transition to clear");
243         }
244         setNextState(ServerCrashState.SERVER_CRASH_GET_REGIONS);
245         break;
246 
247       case SERVER_CRASH_PREPARE_LOG_REPLAY:
248         prepareLogReplay(env, this.regionsOnCrashedServer);
249         setNextState(ServerCrashState.SERVER_CRASH_ASSIGN);
250         break;
251 
252       case SERVER_CRASH_SPLIT_LOGS:
253         splitLogs(env);
254         // If DLR, go to FINISH. Otherwise, if DLS, go to SERVER_CRASH_ASSIGN
255         if (this.distributedLogReplay) setNextState(ServerCrashState.SERVER_CRASH_FINISH);
256         else setNextState(ServerCrashState.SERVER_CRASH_ASSIGN);
257         break;
258 
259       case SERVER_CRASH_ASSIGN:
260         List<HRegionInfo> regionsToAssign = calcRegionsToAssign(env);
261 
262         // Assign may not be idempotent. SSH used to requeue itself if we got an IOE while
263         // assigning, which is what we are mimicking here, but it looks prone to double
264         // assignment if assign fails midway. TODO: Test.
265 
266         // If no regions to assign, skip assign and skip to the finish.
267         boolean regions = regionsToAssign != null && !regionsToAssign.isEmpty();
268         if (regions) {
269           this.regionsAssigned = regionsToAssign;
270           if (!assign(env, regionsToAssign)) {
271             throwProcedureYieldException("Failed assign; will retry");
272           }
273         }
274         if (this.shouldSplitWal && distributedLogReplay) {
275           // Take this route even if there are apparently no regions assigned. This may be our
276           // second time through here; i.e. we assigned and crashed just about here. On second
277           // time through, there will be no regions because we assigned them in the previous step.
278           // Even though no regions, we need to go through here to clean up the DLR zk markers.
279           setNextState(ServerCrashState.SERVER_CRASH_WAIT_ON_ASSIGN);
280         } else {
281           setNextState(ServerCrashState.SERVER_CRASH_FINISH);
282         }
283         break;
284 
285       case SERVER_CRASH_WAIT_ON_ASSIGN:
286         // TODO: The list of regionsAssigned may be more than we actually assigned. See down in
287         // AM #1629 around 'if (regionStates.wasRegionOnDeadServer(encodedName)) {' where we
288         // will skip assigning a region because it is/was on a dead server. Should never happen!
289         // It was on this server. Worst comes to worst, we'll still wait here till the other
290         // server is processed.
291 
292         // If the wait on assign failed, yield -- if we have regions to assign.
293         if (this.regionsAssigned != null && !this.regionsAssigned.isEmpty()) {
294           if (!waitOnAssign(env, this.regionsAssigned)) {
295             throwProcedureYieldException("Waiting on region assign");
296           }
297         }
298         setNextState(ServerCrashState.SERVER_CRASH_SPLIT_LOGS);
299         break;
300 
301       case SERVER_CRASH_FINISH:
302         LOG.info("Finished processing of crashed " + serverName);
303         services.getServerManager().getDeadServers().finish(serverName);
304         return Flow.NO_MORE_STATE;
305 
306       default:
307         throw new UnsupportedOperationException("unhandled state=" + state);
308       }
309     } catch (ProcedureYieldException e) {
310       LOG.warn("Failed serverName=" + this.serverName + ", state=" + state + "; retry "
311           + e.getMessage());
312       throw e;
313     } catch (IOException e) {
314       LOG.warn("Failed serverName=" + this.serverName + ", state=" + state + "; retry", e);
315     } catch (InterruptedException e) {
316       // TODO: Make executor allow IEs coming up out of execute.
317       LOG.warn("Interrupted serverName=" + this.serverName + ", state=" + state + "; retry", e);
318       Thread.currentThread().interrupt();
319     }
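        // If we fell through one of the IOException/InterruptedException catches above, the state
        // was typically not advanced, so returning HAS_MORE_STATE makes the executor run the same
        // state again, i.e. a retry.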
320     return Flow.HAS_MORE_STATE;
321   }
322 
323   /**
324    * Start processing of the crashed server. In here we'll just set configs and return.
325    * @param env
326    * @throws IOException
327    */
328   private void start(final MasterProcedureEnv env) throws IOException {
329     MasterFileSystem mfs = env.getMasterServices().getMasterFileSystem();
330     // Set recovery mode late. This is what the old ServerShutdownHandler used to do.
331     mfs.setLogRecoveryMode();
332     this.distributedLogReplay = mfs.getLogRecoveryMode() == RecoveryMode.LOG_REPLAY;
333   }
334 
335   /**
336    * @param env
337    * @return False if we fail to assign and split logs on meta ('process').
338    * @throws IOException
339    * @throws InterruptedException
340    */
341   private boolean processMeta(final MasterProcedureEnv env)
342   throws IOException {
343     if (LOG.isDebugEnabled()) LOG.debug("Processing hbase:meta that was on " + this.serverName);
344     MasterServices services = env.getMasterServices();
345     MasterFileSystem mfs = services.getMasterFileSystem();
346     AssignmentManager am = services.getAssignmentManager();
347     HRegionInfo metaHRI = HRegionInfo.FIRST_META_REGIONINFO;
348     if (this.shouldSplitWal) {
349       if (this.distributedLogReplay) {
350         prepareLogReplay(env, META_REGION_SET);
351       } else {
352         // TODO: Matteo. We BLOCK here, but it is the most important thing to be doing at this moment.
353         mfs.splitMetaLog(serverName);
354         am.getRegionStates().logSplit(metaHRI);
355       }
356     }
357 
358     // Assign meta if still carrying it. Check again: region may be assigned because of RIT timeout
359     boolean processed = true;
360     boolean shouldAssignMeta = false;
361     AssignmentManager.ServerHostRegion rsCarryingMetaRegion = am.isCarryingMeta(serverName);
362     switch (rsCarryingMetaRegion) {
363       case HOSTING_REGION:
364         LOG.info("Server " + serverName + " was carrying META. Trying to assign.");
365         am.regionOffline(HRegionInfo.FIRST_META_REGIONINFO);
366         shouldAssignMeta = true;
367         break;
368       case UNKNOWN:
369         if (!services.getMetaTableLocator().isLocationAvailable(services.getZooKeeper())) {
370           // The meta location as per the master is null. This could happen when meta
371           // assignment in a previous run failed while the meta znode was updated to null.
372           // We should try to assign meta again.
373           shouldAssignMeta = true;
374           break;
375         }
376         // fall through
377       case NOT_HOSTING_REGION:
378         LOG.info("META has been assigned elsewhere; skip assigning.");
379         break;
380       default:
381         throw new IOException("Unsupported action in MetaServerShutdownHandler");
382     }
383     if (shouldAssignMeta) {
384       // TODO: May block here if hard time figuring state of meta.
385       verifyAndAssignMetaWithRetries(env);
386       if (this.shouldSplitWal && distributedLogReplay) {
387         int timeout = env.getMasterConfiguration().getInt(KEY_WAIT_ON_RIT, DEFAULT_WAIT_ON_RIT);
388         if (!waitOnRegionToClearRegionsInTransition(am, metaHRI, timeout)) {
389           processed = false;
390         } else {
391           // TODO: Matteo. We BLOCK here, but it is the most important thing to be doing at this moment.
392           mfs.splitMetaLog(serverName);
393         }
394       }
395     }
396     return processed;
397   }
398 
399   /**
400    * @return True if region cleared RIT, else false if we timed out waiting.
401    * @throws InterruptedIOException
402    */
403   private boolean waitOnRegionToClearRegionsInTransition(AssignmentManager am,
404       final HRegionInfo hri, final int timeout)
405   throws InterruptedIOException {
406     try {
407       if (!am.waitOnRegionToClearRegionsInTransition(hri, timeout)) {
408         // We wait here to avoid log replay hitting the current dead server and incurring an
409         // RPC timeout, which can happen when replay starts before region assignment completes.
410         LOG.warn("Region " + hri.getEncodedName() + " didn't complete assignment in time");
411         return false;
412       }
413     } catch (InterruptedException ie) {
414       throw new InterruptedIOException("Caught " + ie +
415         " during waitOnRegionToClearRegionsInTransition for " + hri);
416     }
417     return true;
418   }
419 
420   private void prepareLogReplay(final MasterProcedureEnv env, final Set<HRegionInfo> regions)
421   throws IOException {
422     if (LOG.isDebugEnabled()) {
423       LOG.debug("Mark " + size(regions) + " regions-in-recovery from " +
424         this.serverName);
425     }
426     MasterFileSystem mfs = env.getMasterServices().getMasterFileSystem();
427     AssignmentManager am = env.getMasterServices().getAssignmentManager();
428     mfs.prepareLogReplay(this.serverName, regions);
429     am.getRegionStates().logSplit(this.serverName);
430   }
431 
432   private void splitLogs(final MasterProcedureEnv env) throws IOException {
433     if (LOG.isDebugEnabled()) {
434       LOG.debug("Splitting logs from " + serverName + "; region count=" +
435         size(this.regionsOnCrashedServer));
436     }
437     MasterFileSystem mfs = env.getMasterServices().getMasterFileSystem();
438     AssignmentManager am = env.getMasterServices().getAssignmentManager();
439     // TODO: For Matteo. Below BLOCKs!!!! Redo so we can relinquish the executor while it is running.
440     mfs.splitLog(this.serverName);
441     am.getRegionStates().logSplit(this.serverName);
442   }
443 
444   static int size(final Collection<HRegionInfo> hris) {
445     return hris == null? 0: hris.size();
446   }
447 
448   /**
449    * Figure out what we need to assign. Should be idempotent.
450    * @param env
451    * @return List of calculated regions to assign; may be empty or null.
452    * @throws IOException
453    */
454   private List<HRegionInfo> calcRegionsToAssign(final MasterProcedureEnv env)
455   throws IOException {
456     AssignmentManager am = env.getMasterServices().getAssignmentManager();
457     List<HRegionInfo> regionsToAssignAggregator = new ArrayList<HRegionInfo>();
458     int replicaCount = env.getMasterConfiguration().getInt(HConstants.META_REPLICAS_NUM,
459       HConstants.DEFAULT_META_REPLICA_NUM);
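        // Only secondary meta replicas (replica ids 1..replicaCount-1) are handled here; the
        // primary hbase:meta region is handled up front by processMeta() when carryingMeta is set.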
460     for (int i = 1; i < replicaCount; i++) {
461       HRegionInfo metaHri =
462           RegionReplicaUtil.getRegionInfoForReplica(HRegionInfo.FIRST_META_REGIONINFO, i);
463       if (am.isCarryingMetaReplica(this.serverName, metaHri) ==
464           AssignmentManager.ServerHostRegion.HOSTING_REGION) {
465         if (LOG.isDebugEnabled()) {
466           LOG.debug("Reassigning meta replica " + metaHri + " that was on " + this.serverName);
467         }
468         regionsToAssignAggregator.add(metaHri);
469       }
470     }
471     // Clean out anything in regions in transition.
472     List<HRegionInfo> regionsInTransition = am.cleanOutCrashedServerReferences(serverName);
473     if (LOG.isDebugEnabled()) {
474       LOG.debug("Reassigning " + size(this.regionsOnCrashedServer) +
475         " region(s) that " + (serverName == null? "null": serverName) +
476         " was carrying (and " + regionsInTransition.size() +
477         " region(s) that were opening on this server)");
478     }
479     regionsToAssignAggregator.addAll(regionsInTransition);
480 
481     // Iterate regions that were on this server and figure which of these we need to reassign
482     if (this.regionsOnCrashedServer != null && !this.regionsOnCrashedServer.isEmpty()) {
483       RegionStates regionStates = am.getRegionStates();
484       for (HRegionInfo hri: this.regionsOnCrashedServer) {
485         if (regionsInTransition.contains(hri)) continue;
486         String encodedName = hri.getEncodedName();
487         Lock lock = am.acquireRegionLock(encodedName);
488         try {
489           RegionState rit = regionStates.getRegionTransitionState(hri);
490           if (processDeadRegion(hri, am)) {
491             ServerName addressFromAM = regionStates.getRegionServerOfRegion(hri);
492             if (addressFromAM != null && !addressFromAM.equals(this.serverName)) {
493               // If this region is in transition on the dead server, it must be
494               // opening or pending_open, which should have been covered by
495               // AM#cleanOutCrashedServerReferences
496               LOG.info("Skip assigning " + hri.getRegionNameAsString()
497                 + " because opened on " + addressFromAM.getServerName());
498               continue;
499             }
500             if (rit != null) {
501               if (rit.getServerName() != null && !rit.isOnServer(this.serverName)) {
502                 // Skip regions that are in transition on other server
503                 LOG.info("Skip assigning region in transition on other server " + rit);
504                 continue;
505               }
506               LOG.info("Reassigning region " + rit + " and clearing zknode if exists");
507               try {
508                 // This clears out any RIT that might be sticking around.
509                 ZKAssign.deleteNodeFailSilent(env.getMasterServices().getZooKeeper(), hri);
510               } catch (KeeperException e) {
511                 // TODO: FIX!!!! ABORTING SERVER BECAUSE COULDN'T PURGE ZNODE. This is what we
512                 // used to do but that doesn't make it right!!!
513                 env.getMasterServices().abort("Unexpected error deleting RIT " + hri, e);
514                 throw new IOException(e);
515               }
516               regionStates.updateRegionState(hri, RegionState.State.OFFLINE);
517             } else if (regionStates.isRegionInState(
518                 hri, RegionState.State.SPLITTING_NEW, RegionState.State.MERGING_NEW)) {
519               regionStates.updateRegionState(hri, RegionState.State.OFFLINE);
520             }
521             regionsToAssignAggregator.add(hri);
522           // TODO: The below else if is different in branch-1 from master branch.
523           } else if (rit != null) {
524             if ((rit.isPendingCloseOrClosing() || rit.isOffline())
525                 && am.getTableStateManager().isTableState(hri.getTable(),
526                 ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING) ||
527                 am.getReplicasToClose().contains(hri)) {
528               // If the table was partially disabled and the RS went down, we should clear the
529               // RIT and remove the node for the region.
530               // The rit that we use may be stale in case the table was in DISABLING state:
531               // even though we did assign, we will not be clearing the znode in CLOSING state.
532               // Doing this does no harm. See HBASE-5927
533               regionStates.updateRegionState(hri, RegionState.State.OFFLINE);
534               am.deleteClosingOrClosedNode(hri, rit.getServerName());
535               am.offlineDisabledRegion(hri);
536             } else {
537               LOG.warn("THIS SHOULD NOT HAPPEN: unexpected region in transition "
538                 + rit + " not to be assigned by SSH of server " + serverName);
539             }
540           }
541         } finally {
542           lock.unlock();
543         }
544       }
545     }
546     return regionsToAssignAggregator;
547   }
548 
549   private boolean assign(final MasterProcedureEnv env, final List<HRegionInfo> hris)
550   throws InterruptedIOException {
551     MasterServices masterServices = env.getMasterServices();
552     AssignmentManager am = masterServices.getAssignmentManager();
553     // Determine what type of assignment to do if the dead server already restarted.
554     boolean retainAssignment =
555       masterServices.getConfiguration().getBoolean("hbase.master.retain.assignment", true) &&
556       masterServices.getServerManager().isServerWithSameHostnamePortOnline(serverName);
558     try {
559       if (retainAssignment) {
560         Map<HRegionInfo, ServerName> hriServerMap =
561             new HashMap<HRegionInfo, ServerName>(hris.size());
562         for (HRegionInfo hri: hris) {
563           hriServerMap.put(hri, serverName);
564         }
565         LOG.info("Best effort in SSH to retain assignment of " + hris.size()
566           + " regions from the dead server " + serverName);
567         am.assign(hriServerMap);
568       } else {
569         LOG.info("Using round robin in SSH to assign " + hris.size()
570           + " regions from the dead server " + serverName);
571         am.assign(hris);
572       }
573     } catch (InterruptedException ie) {
574       LOG.error("Caught " + ie + " during " + (retainAssignment ? "retaining" : "round-robin")
575         + " assignment");
576       throw (InterruptedIOException)new InterruptedIOException().initCause(ie);
577     } catch (IOException ioe) {
578       LOG.warn("Caught " + ioe + " during region assignment, will retry");
579       return false;
580     }
581     return true;
582   }
583 
584   private boolean waitOnAssign(final MasterProcedureEnv env, final List<HRegionInfo> hris)
585   throws InterruptedIOException {
586     int timeout = env.getMasterConfiguration().getInt(KEY_WAIT_ON_RIT, DEFAULT_WAIT_ON_RIT);
587     for (HRegionInfo hri: hris) {
588       // TODO: Blocks here.
589       if (!waitOnRegionToClearRegionsInTransition(env.getMasterServices().getAssignmentManager(),
590           hri, timeout)) {
591         return false;
592       }
593     }
594     return true;
595   }
596 
597   @Override
598   protected void rollbackState(MasterProcedureEnv env, ServerCrashState state)
599   throws IOException {
600     // Can't rollback.
601     throw new UnsupportedOperationException("unhandled state=" + state);
602   }
603 
604   @Override
605   protected ServerCrashState getState(int stateId) {
606     return ServerCrashState.valueOf(stateId);
607   }
608 
609   @Override
610   protected int getStateId(ServerCrashState state) {
611     return state.getNumber();
612   }
613 
614   @Override
615   protected ServerCrashState getInitialState() {
616     return ServerCrashState.SERVER_CRASH_START;
617   }
618 
619   @Override
620   protected boolean abort(MasterProcedureEnv env) {
621     // TODO
622     return false;
623   }
624 
625   @Override
626   protected boolean acquireLock(final MasterProcedureEnv env) {
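        // If server crash processing has not yet been enabled by the master, do not take the lock;
        // otherwise grab the exclusive per-server lock so at most one procedure works on this
        // server at a time.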
627     if (env.waitServerCrashProcessingEnabled(this)) return false;
628     return env.getProcedureQueue().tryAcquireServerExclusiveLock(this, getServerName());
629   }
630 
631   @Override
632   protected void releaseLock(final MasterProcedureEnv env) {
633     env.getProcedureQueue().releaseServerExclusiveLock(this, getServerName());
634   }
635 
636   @Override
637   public void toStringClassDetails(StringBuilder sb) {
638     sb.append(getClass().getSimpleName());
639     sb.append(" serverName=");
640     sb.append(this.serverName);
641     sb.append(", shouldSplitWal=");
642     sb.append(shouldSplitWal);
643     sb.append(", carryingMeta=");
644     sb.append(carryingMeta);
645   }
646 
647   @Override
648   public void serializeStateData(final OutputStream stream) throws IOException {
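        // Persist the server name, mode flags, and region lists so the procedure can resume from
        // its last recorded state if the master restarts.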
649     super.serializeStateData(stream);
650 
651     MasterProcedureProtos.ServerCrashStateData.Builder state =
652       MasterProcedureProtos.ServerCrashStateData.newBuilder().
653       setServerName(ProtobufUtil.toServerName(this.serverName)).
654       setDistributedLogReplay(this.distributedLogReplay).
655       setCarryingMeta(this.carryingMeta).
656       setShouldSplitWal(this.shouldSplitWal);
657     if (this.regionsOnCrashedServer != null && !this.regionsOnCrashedServer.isEmpty()) {
658       for (HRegionInfo hri: this.regionsOnCrashedServer) {
659         state.addRegionsOnCrashedServer(HRegionInfo.convert(hri));
660       }
661     }
662     if (this.regionsAssigned != null && !this.regionsAssigned.isEmpty()) {
663       for (HRegionInfo hri: this.regionsAssigned) {
664         state.addRegionsAssigned(HRegionInfo.convert(hri));
665       }
666     }
667     state.build().writeDelimitedTo(stream);
668   }
669 
670   @Override
671   public void deserializeStateData(final InputStream stream) throws IOException {
672     super.deserializeStateData(stream);
673 
674     MasterProcedureProtos.ServerCrashStateData state =
675       MasterProcedureProtos.ServerCrashStateData.parseDelimitedFrom(stream);
676     this.serverName = ProtobufUtil.toServerName(state.getServerName());
677     this.distributedLogReplay = state.hasDistributedLogReplay()?
678       state.getDistributedLogReplay(): false;
679     this.carryingMeta = state.hasCarryingMeta()? state.getCarryingMeta(): false;
680     // shouldSplitWal has a default in the protobuf definition so this invocation always works.
681     this.shouldSplitWal = state.getShouldSplitWal();
682     int size = state.getRegionsOnCrashedServerCount();
683     if (size > 0) {
684       this.regionsOnCrashedServer = new HashSet<HRegionInfo>(size);
685       for (RegionInfo ri: state.getRegionsOnCrashedServerList()) {
686         this.regionsOnCrashedServer.add(HRegionInfo.convert(ri));
687       }
688     }
689     size = state.getRegionsAssignedCount();
690     if (size > 0) {
691       this.regionsAssigned = new ArrayList<HRegionInfo>(size);
692       for (RegionInfo ri: state.getRegionsAssignedList()) {
693         this.regionsAssigned.add(HRegionInfo.convert(ri));
694       }
695     }
696   }
697 
698   /**
699    * Process a dead region from a dead RS. Checks if the region is disabled or
700    * disabling or if the region has a partially completed split.
701    * @param hri
702    * @param assignmentManager
703    * @return Returns true if specified region should be assigned, false if not.
704    * @throws IOException
705    */
706   private static boolean processDeadRegion(HRegionInfo hri, AssignmentManager assignmentManager)
707   throws IOException {
708     boolean tablePresent = assignmentManager.getTableStateManager().isTablePresent(hri.getTable());
709     if (!tablePresent) {
710       LOG.info("The table " + hri.getTable() + " was deleted.  Hence not proceeding.");
711       return false;
712     }
713     // If the table is disabled, there is nothing to assign; do not proceed.
714     boolean disabled = assignmentManager.getTableStateManager().isTableState(hri.getTable(),
715       ZooKeeperProtos.Table.State.DISABLED);
716     if (disabled){
717       LOG.info("The table " + hri.getTable() + " was disabled.  Hence not proceeding.");
718       return false;
719     }
720     if (hri.isOffline() && hri.isSplit()) {
721       // HBASE-7721: Split parent and daughters are inserted into hbase:meta as an atomic operation.
722       // If the meta scanner saw the parent split, then it should see the daughters as assigned
723       // to the dead server. We don't have to do anything.
724       return false;
725     }
726     boolean disabling = assignmentManager.getTableStateManager().isTableState(hri.getTable(),
727       ZooKeeperProtos.Table.State.DISABLING);
728     if (disabling) {
729       LOG.info("The table " + hri.getTable() + " is being disabled.  Hence not assigning region " +
730         hri.getEncodedName());
731       return false;
732     }
733     return true;
734   }
735 
736   /**
737    * If hbase:meta is not assigned already, assign.
738    * @throws IOException
739    */
740   private void verifyAndAssignMetaWithRetries(final MasterProcedureEnv env) throws IOException {
741     MasterServices services = env.getMasterServices();
742     int iTimes = services.getConfiguration().getInt(KEY_RETRIES_ON_META, DEFAULT_RETRIES_ON_META);
743     // Just reuse same time as we have for short wait on meta. Adding another config is overkill.
744     long waitTime =
745       services.getConfiguration().getLong(KEY_SHORT_WAIT_ON_META, DEFAULT_SHORT_WAIT_ON_META);
746     int iFlag = 0;
747     while (true) {
748       try {
749         verifyAndAssignMeta(env);
750         break;
751       } catch (KeeperException e) {
752         services.abort("In server shutdown processing, assigning meta", e);
753         throw new IOException("Aborting", e);
754       } catch (Exception e) {
755         if (iFlag >= iTimes) {
756           services.abort("verifyAndAssignMeta failed after " + iTimes + " retries, aborting", e);
757           throw new IOException("Aborting", e);
758         }
759         try {
760           Thread.sleep(waitTime);
761         } catch (InterruptedException e1) {
762           LOG.warn("Interrupted while sleeping", e1);
763           Thread.currentThread().interrupt();
764           throw (InterruptedIOException)new InterruptedIOException().initCause(e1);
765         }
766         iFlag++;
767       }
768     }
769   }
770 
771   /**
772    * If hbase:meta is not assigned already, assign.
773    * @throws InterruptedException
774    * @throws IOException
775    * @throws KeeperException
776    */
777   private void verifyAndAssignMeta(final MasterProcedureEnv env)
778       throws InterruptedException, IOException, KeeperException {
779     MasterServices services = env.getMasterServices();
780     if (!isMetaAssignedQuickTest(env)) {
781       services.getAssignmentManager().assignMeta(HRegionInfo.FIRST_META_REGIONINFO);
782     } else if (serverName.equals(services.getMetaTableLocator().
783         getMetaRegionLocation(services.getZooKeeper()))) {
784       // hbase:meta seems to be still alive on the server which the master is expiring
785       // and thinks is dying. Let's re-assign hbase:meta anyway.
786       services.getAssignmentManager().assignMeta(HRegionInfo.FIRST_META_REGIONINFO);
787     } else {
788       LOG.info("Skip assigning hbase:meta because it is online at "
789           + services.getMetaTableLocator().getMetaRegionLocation(services.getZooKeeper()));
790     }
791   }
792 
793   /**
794    * A quick test that hbase:meta is assigned; blocks for short time only.
795    * @return True if hbase:meta location is available and verified as good.
796    * @throws InterruptedException
797    * @throws IOException
798    */
799   private boolean isMetaAssignedQuickTest(final MasterProcedureEnv env)
800   throws InterruptedException, IOException {
801     ZooKeeperWatcher zkw = env.getMasterServices().getZooKeeper();
802     MetaTableLocator mtl = env.getMasterServices().getMetaTableLocator();
803     boolean metaAssigned = false;
804     // Is hbase:meta location available yet?
805     if (mtl.isLocationAvailable(zkw)) {
806       ClusterConnection connection = env.getMasterServices().getConnection();
807       // Is hbase:meta location good yet?
808       long timeout =
809         env.getMasterConfiguration().getLong(KEY_SHORT_WAIT_ON_META, DEFAULT_SHORT_WAIT_ON_META);
810       if (mtl.verifyMetaRegionLocation(connection, zkw, timeout)) {
811         metaAssigned = true;
812       }
813     }
814     return metaAssigned;
815   }
816 
817   @Override
818   public ServerName getServerName() {
819     return this.serverName;
820   }
821 
822   @Override
823   public boolean hasMetaTableRegion() {
824     return this.carryingMeta;
825   }
826 
827   @Override
828   public ServerOperationType getServerOperationType() {
829     return ServerOperationType.CRASH_HANDLER;
830   }
831 
832   /**
833    * For this procedure, yield at end of each successful flow step so that all crashed servers
834    * can make progress rather than do the default which has each procedure running to completion
835    * before we move to the next. For crashed servers, especially if running with distributed log
836    * replay, we will want all servers to come along; we do not want the scenario where a server is
837    * stuck waiting for regions to come online so it can replay edits.
838    */
839   @Override
840   protected boolean isYieldBeforeExecuteFromState(MasterProcedureEnv env, ServerCrashState state) {
841     return true;
842   }
843 
844   @Override
845   protected boolean shouldWaitClientAck(MasterProcedureEnv env) {
846     // The operation is triggered internally on the server;
847     // the client does not know about this procedure.
848     return false;
849   }
850 }