001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.assignment;
019
020import static org.apache.hadoop.hbase.io.hfile.CacheConfig.DEFAULT_EVICT_ON_CLOSE;
021import static org.apache.hadoop.hbase.io.hfile.CacheConfig.EVICT_BLOCKS_ON_CLOSE_KEY;
022import static org.apache.hadoop.hbase.master.LoadBalancer.BOGUS_SERVER_NAME;
023import static org.apache.hadoop.hbase.master.assignment.AssignmentManager.FORCE_REGION_RETAINMENT;
024
025import edu.umd.cs.findbugs.annotations.Nullable;
026import java.io.IOException;
027import java.util.concurrent.CompletableFuture;
028import java.util.concurrent.TimeUnit;
029import org.apache.hadoop.hbase.HBaseIOException;
030import org.apache.hadoop.hbase.ServerName;
031import org.apache.hadoop.hbase.TableName;
032import org.apache.hadoop.hbase.client.RegionInfo;
033import org.apache.hadoop.hbase.client.RegionReplicaUtil;
034import org.apache.hadoop.hbase.client.RetriesExhaustedException;
035import org.apache.hadoop.hbase.master.MetricsAssignmentManager;
036import org.apache.hadoop.hbase.master.RegionState.State;
037import org.apache.hadoop.hbase.master.ServerManager;
038import org.apache.hadoop.hbase.master.procedure.AbstractStateMachineRegionProcedure;
039import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
040import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
041import org.apache.hadoop.hbase.procedure2.Procedure;
042import org.apache.hadoop.hbase.procedure2.ProcedureFutureUtil;
043import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
044import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
045import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
046import org.apache.hadoop.hbase.procedure2.ProcedureUtil;
047import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
048import org.apache.hadoop.hbase.util.FutureUtils;
049import org.apache.hadoop.hbase.util.RetryCounter;
050import org.apache.yetus.audience.InterfaceAudience;
051import org.slf4j.Logger;
052import org.slf4j.LoggerFactory;
053
054import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
055import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionStateTransitionState;
056import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionStateTransitionStateData;
057import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionType;
058import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;
059import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
060
061/**
062 * The procedure to deal with the state transition of a region. A region with a TRSP in place is
063 * called RIT, i.e, RegionInTransition.
064 * <p/>
065 * It can be used to assign/unassign/reopen/move a region, and for
066 * {@link #unassign(MasterProcedureEnv, RegionInfo)} and
067 * {@link #reopen(MasterProcedureEnv, RegionInfo)}, you do not need to specify a target server, and
068 * for {@link #assign(MasterProcedureEnv, RegionInfo, ServerName)} and
069 * {@link #move(MasterProcedureEnv, RegionInfo, ServerName)}, if you want to you can provide a
070 * target server. And for {@link #move(MasterProcedureEnv, RegionInfo, ServerName)}, if you do not
071 * specify a targetServer, we will select one randomly.
072 * <p/>
073 * <p/>
074 * The typical state transition for assigning a region is:
075 *
076 * <pre>
077 * GET_ASSIGN_CANDIDATE ------> OPEN -----> CONFIRM_OPENED
078 * </pre>
079 *
080 * Notice that, if there are failures we may go back to the {@code GET_ASSIGN_CANDIDATE} state to
081 * try again.
082 * <p/>
083 * The typical state transition for unassigning a region is:
084 *
085 * <pre>
086 * CLOSE -----> CONFIRM_CLOSED
087 * </pre>
088 *
089 * Here things go a bit different, if there are failures, especially that if there is a server
090 * crash, we will go to the {@code GET_ASSIGN_CANDIDATE} state to bring the region online first, and
091 * then go through the normal way to unassign it.
092 * <p/>
093 * The typical state transition for reopening/moving a region is:
094 *
095 * <pre>
096 * CLOSE -----> CONFIRM_CLOSED -----> GET_ASSIGN_CANDIDATE ------> OPEN -----> CONFIRM_OPENED
097 * </pre>
098 *
099 * The retry logic is the same with the above assign/unassign.
100 * <p/>
 * Notice that, although we allow specifying a target server, it just acts as a candidate, we do
 * not guarantee that the region will finally be on the target server. If this is important for
 * you, you should check whether the region is on the target server after the procedure is finished.
 * <p/>
 * Alternatively, to try to retain assignments, the
 * <b>hbase.master.scp.retain.assignment.force</b> option can be used together with
 * <b>hbase.master.scp.retain.assignment</b>.
108 * <p/>
 * When you want to schedule a TRSP, please check whether there is still one for this region, and
 * the check should be under the RegionStateNode lock. We will remove the TRSP from a
 * RegionStateNode when we are done, see the code in {@code reportTransition} method below. There
 * could be at most one TRSP for a given region.
113 */
114@InterfaceAudience.Private
115public class TransitRegionStateProcedure
116  extends AbstractStateMachineRegionProcedure<RegionStateTransitionState> {
117
  private static final Logger LOG = LoggerFactory.getLogger(TransitRegionStateProcedure.class);

  // The kind of transition this procedure performs; written to and read from the serialized
  // procedure state data.
  private TransitionType type;

  // First state of the state machine; derived from type in setInitialAndLastState().
  private RegionStateTransitionState initialState;

  // State at which the procedure finishes when everything goes well; derived from type in
  // setInitialAndLastState().
  private RegionStateTransitionState lastState;

  // the candidate where we want to assign the region to.
  private ServerName assignCandidate;

  // When true, queueAssign clears the region location instead of reusing the candidate or the
  // last host. Also set to true after a failed open or an unexpected state on close.
  private boolean forceNewPlan;

  // Backoff counter used by executeFromState when a step throws an IOException. Created lazily
  // and reset to null by the confirm steps.
  private RetryCounter retryCounter;

  // The remote sub procedure currently attached via attachRemoteProc, or null when none.
  private RegionRemoteProcedureBase remoteProc;

  // Read from EVICT_BLOCKS_ON_CLOSE_KEY in the master configuration and handed to
  // CloseRegionProcedure as its last constructor argument.
  private boolean evictCache;

  // Set by the constructor used from unassignSplitMerge; when true, CloseRegionProcedure is
  // created with 'true' as its last constructor argument regardless of evictCache.
  private boolean isSplit;

  // Backoff counter used while waiting for the previous host to come back when force region
  // retainment is enabled. Not part of the serialized state.
  private RetryCounter forceRetainmentRetryCounter;

  // Sum of the backoff times (ms) spent waiting for the previous host; only used for logging.
  private long forceRetainmentTotalWait;

  // Future accessed through getFuture/setFuture by ProcedureFutureUtil. While it is non-null,
  // execute() does not release the RegionStateNode lock.
  private CompletableFuture<Void> future;
144
  /**
   * No-argument constructor that leaves every field at its default value. Presumably used by the
   * procedure framework, which then populates the instance through
   * {@code deserializeStateData} (TODO confirm against the framework).
   */
  public TransitRegionStateProcedure() {
  }
147
148  private void setInitialAndLastState() {
149    switch (type) {
150      case ASSIGN:
151        initialState = RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE;
152        lastState = RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED;
153        break;
154      case UNASSIGN:
155        initialState = RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE;
156        lastState = RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED;
157        break;
158      case MOVE:
159      case REOPEN:
160        initialState = RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE;
161        lastState = RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED;
162        break;
163      default:
164        throw new IllegalArgumentException("Unknown TransitionType: " + type);
165    }
166  }
167
168  protected TransitRegionStateProcedure(MasterProcedureEnv env, RegionInfo hri,
169    ServerName assignCandidate, boolean forceNewPlan, TransitionType type) {
170    super(env, hri);
171    this.assignCandidate = assignCandidate;
172    this.forceNewPlan = forceNewPlan;
173    this.type = type;
174    setInitialAndLastState();
175
176    // when do reopen TRSP, let the rs know the targetServer so it can keep some info on close
177    if (type == TransitionType.REOPEN) {
178      this.assignCandidate = getRegionStateNode(env).getRegionLocation();
179    }
180    evictCache =
181      env.getMasterConfiguration().getBoolean(EVICT_BLOCKS_ON_CLOSE_KEY, DEFAULT_EVICT_ON_CLOSE);
182    initForceRetainmentRetryCounter(env);
183  }
184
185  private void initForceRetainmentRetryCounter(MasterProcedureEnv env) {
186    if (env.getAssignmentManager().isForceRegionRetainment()) {
187      forceRetainmentRetryCounter =
188        new RetryCounter(env.getAssignmentManager().getForceRegionRetainmentRetries(),
189          env.getAssignmentManager().getForceRegionRetainmentWaitInterval(), TimeUnit.MILLISECONDS);
190      forceRetainmentTotalWait = 0;
191    }
192  }
193
  /**
   * Same as the five argument constructor, and additionally records whether this transition is
   * for a split. The flag is consulted when the close sub procedure is created.
   * @param isSplit value stored in {@link #isSplit}
   */
  protected TransitRegionStateProcedure(MasterProcedureEnv env, RegionInfo hri,
    ServerName assignCandidate, boolean forceNewPlan, TransitionType type, boolean isSplit) {
    this(env, hri, assignCandidate, forceNewPlan, type);
    this.isSplit = isSplit;
  }
199
  /**
   * @return always {@link TableOperationType#REGION_EDIT}, regardless of the transition type
   */
  @Override
  public TableOperationType getTableOperationType() {
    // TODO: maybe we should make another type here, REGION_TRANSITION?
    return TableOperationType.REGION_EDIT;
  }
205
206  @Override
207  protected boolean waitInitialized(MasterProcedureEnv env) {
208    if (TableName.isMetaTableName(getTableName())) {
209      return false;
210    }
211    // First we need meta to be loaded, and second, if meta is not online then we will likely to
212    // fail when updating meta so we wait until it is assigned.
213    AssignmentManager am = env.getAssignmentManager();
214    return am.waitMetaLoaded(this) || am.waitMetaAssigned(this, getRegion());
215  }
216
217  private void checkAndWaitForOriginalServer(MasterProcedureEnv env, ServerName lastHost)
218    throws ProcedureSuspendedException {
219    ServerManager serverManager = env.getMasterServices().getServerManager();
220    ServerName newNameForServer = serverManager.findServerWithSameHostnamePortWithLock(lastHost);
221    boolean isOnline = serverManager.createDestinationServersList().contains(newNameForServer);
222
223    if (!isOnline && forceRetainmentRetryCounter.shouldRetry()) {
224      int backoff =
225        Math.toIntExact(forceRetainmentRetryCounter.getBackoffTimeAndIncrementAttempts());
226      forceRetainmentTotalWait += backoff;
227      LOG.info(
228        "Suspending the TRSP PID={} for {}ms because {} is true and previous host {} "
229          + "for region is not yet online.",
230        this.getProcId(), backoff, FORCE_REGION_RETAINMENT, lastHost);
231      setTimeout(backoff);
232      setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
233      throw new ProcedureSuspendedException();
234    }
235    LOG.info(
236      "{} is true. TRSP PID={} waited {}ms for host {} to come back online. "
237        + "Did host come back online? {}",
238      FORCE_REGION_RETAINMENT, this.getProcId(), forceRetainmentTotalWait, lastHost, isOnline);
239    initForceRetainmentRetryCounter(env);
240  }
241
242  private void queueAssign(MasterProcedureEnv env, RegionStateNode regionNode)
243    throws ProcedureSuspendedException {
244    boolean retain = false;
245    if (forceNewPlan) {
246      // set the region location to null if forceNewPlan is true
247      regionNode.setRegionLocation(null);
248    } else {
249      if (assignCandidate != null) {
250        retain = assignCandidate.equals(regionNode.getLastHost());
251        regionNode.setRegionLocation(assignCandidate);
252      } else if (regionNode.getLastHost() != null) {
253        retain = true;
254        LOG.info("Setting lastHost {} as the location for region {}", regionNode.getLastHost(),
255          regionNode.getRegionInfo().getEncodedName());
256        regionNode.setRegionLocation(regionNode.getLastHost());
257      }
258      if (
259        regionNode.getRegionLocation() != null
260          && env.getAssignmentManager().isForceRegionRetainment()
261      ) {
262        LOG.warn("{} is set to true. This may delay regions re-assignment "
263          + "upon RegionServers crashes or restarts.", FORCE_REGION_RETAINMENT);
264        checkAndWaitForOriginalServer(env, regionNode.getRegionLocation());
265      }
266    }
267    LOG.info("Starting {}; {}; forceNewPlan={}, retain={}", this, regionNode.toShortString(),
268      forceNewPlan, retain);
269    env.getAssignmentManager().queueAssign(regionNode);
270    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_OPEN);
271    if (regionNode.getProcedureEvent().suspendIfNotReady(this)) {
272      throw new ProcedureSuspendedException();
273    }
274  }
275
  /** Returns the stored future; passed to ProcedureFutureUtil as a method reference. */
  private CompletableFuture<Void> getFuture() {
    return future;
  }
279
  /** Stores the given future; passed to ProcedureFutureUtil as a method reference. */
  private void setFuture(CompletableFuture<Void> f) {
    future = f;
  }
283
284  private void openRegionAfterUpdatingMeta(ServerName loc) {
285    addChildProcedure(new OpenRegionProcedure(this, getRegion(), loc));
286    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED);
287  }
288
  /**
   * Handles the OPEN state: marks the region as opening through the AssignmentManager and then
   * schedules the open sub procedure on the region's current location.
   * @throws HBaseIOException            if the region node has no usable location; the next state
   *                                     is set back to GET_ASSIGN_CANDIDATE before throwing
   * @throws ProcedureSuspendedException propagated from ProcedureFutureUtil
   */
  private void openRegion(MasterProcedureEnv env, RegionStateNode regionNode)
    throws IOException, ProcedureSuspendedException {
    ServerName loc = regionNode.getRegionLocation();
    // NOTE(review): checkFuture presumably returns true when a future stored by an earlier pass
    // has been dealt with and the follow-up action was run, confirm in ProcedureFutureUtil.
    if (
      ProcedureFutureUtil.checkFuture(this, this::getFuture, this::setFuture,
        () -> openRegionAfterUpdatingMeta(loc))
    ) {
      return;
    }
    if (loc == null || BOGUS_SERVER_NAME.equals(loc)) {
      LOG.warn("No location specified for {}, jump back to state {} to get one", getRegion(),
        RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
      // thrown as an IOException so that executeFromState retries with backoff
      throw new HBaseIOException("Failed to open region, the location is null or bogus.");
    }
    // hand the future returned by regionOpening to ProcedureFutureUtil, together with the action
    // to run afterwards
    ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture,
      env.getAssignmentManager().regionOpening(regionNode), env,
      () -> openRegionAfterUpdatingMeta(loc));
  }
308
309  private void regionFailedOpenAfterUpdatingMeta(MasterProcedureEnv env,
310    RegionStateNode regionNode) {
311    setFailure(getClass().getSimpleName(), new RetriesExhaustedException(
312      "Max attempts " + env.getAssignmentManager().getAssignMaxAttempts() + " exceeded"));
313    regionNode.unsetProcedure(this);
314  }
315
316  private Flow confirmOpened(MasterProcedureEnv env, RegionStateNode regionNode)
317    throws IOException, ProcedureSuspendedException {
318    if (
319      ProcedureFutureUtil.checkFuture(this, this::getFuture, this::setFuture,
320        () -> regionFailedOpenAfterUpdatingMeta(env, regionNode))
321    ) {
322      return Flow.NO_MORE_STATE;
323    }
324    if (regionNode.isInState(State.OPEN)) {
325      retryCounter = null;
326      if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED) {
327        // we are the last state, finish
328        regionNode.unsetProcedure(this);
329        ServerCrashProcedure.updateProgress(env, getParentProcId());
330        return Flow.NO_MORE_STATE;
331      }
332      // It is possible that we arrive here but confirm opened is not the last state, for example,
333      // when merging or splitting a region, we unassign the region from a RS and the RS is crashed,
334      // then there will be recovered edits for this region, we'd better make the region online
335      // again and then unassign it, otherwise we have to fail the merge/split procedure as we may
336      // loss data.
337      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE);
338      return Flow.HAS_MORE_STATE;
339    }
340
341    int retries = env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode)
342      .incrementAndGetRetries();
343    int maxAttempts = env.getAssignmentManager().getAssignMaxAttempts();
344    LOG.info("Retry={} of max={}; {}; {}", retries, maxAttempts, this, regionNode.toShortString());
345
346    if (retries >= maxAttempts) {
347      ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture,
348        env.getAssignmentManager().regionFailedOpen(regionNode, true), env,
349        () -> regionFailedOpenAfterUpdatingMeta(env, regionNode));
350      return Flow.NO_MORE_STATE;
351    }
352
353    // if not giving up, we will not update meta, so the returned CompletableFuture should be a fake
354    // one, which should have been completed already
355    CompletableFuture<Void> future = env.getAssignmentManager().regionFailedOpen(regionNode, false);
356    assert future.isDone();
357    // we failed to assign the region, force a new plan
358    forceNewPlan = true;
359    regionNode.setRegionLocation(null);
360    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
361
362    if (retries > env.getAssignmentManager().getAssignRetryImmediatelyMaxAttempts()) {
363      // Throw exception to backoff and retry when failed open too many times
364      throw new HBaseIOException(
365        "Failed confirm OPEN of " + regionNode + " (remote log may yield more detail on why).");
366    } else {
367      // Here we do not throw exception because we want to the region to be online ASAP
368      return Flow.HAS_MORE_STATE;
369    }
370  }
371
372  private void closeRegionAfterUpdatingMeta(RegionStateNode regionNode) {
373    CloseRegionProcedure closeProc = isSplit
374      ? new CloseRegionProcedure(this, getRegion(), regionNode.getRegionLocation(), assignCandidate,
375        true)
376      : new CloseRegionProcedure(this, getRegion(), regionNode.getRegionLocation(), assignCandidate,
377        evictCache);
378    addChildProcedure(closeProc);
379    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED);
380  }
381
382  private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode)
383    throws IOException, ProcedureSuspendedException {
384    if (
385      ProcedureFutureUtil.checkFuture(this, this::getFuture, this::setFuture,
386        () -> closeRegionAfterUpdatingMeta(regionNode))
387    ) {
388      return;
389    }
390    if (regionNode.isInState(State.OPEN, State.CLOSING, State.MERGING, State.SPLITTING)) {
391      // this is the normal case
392      ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture,
393        env.getAssignmentManager().regionClosing(regionNode), env,
394        () -> closeRegionAfterUpdatingMeta(regionNode));
395    } else {
396      forceNewPlan = true;
397      regionNode.setRegionLocation(null);
398      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
399    }
400  }
401
  /**
   * Handles the CONFIRM_CLOSED state.
   * <ul>
   * <li>CLOSED: finish if CONFIRM_CLOSED is our last state, otherwise go on to
   * GET_ASSIGN_CANDIDATE.</li>
   * <li>CLOSING: go back to CLOSE and throw, so that the caller retries with backoff.</li>
   * <li>Otherwise (expected to be ABNORMALLY_CLOSED): finish for a non-default replica whose last
   * state is CONFIRM_CLOSED, else go to GET_ASSIGN_CANDIDATE to bring the region online
   * first.</li>
   * </ul>
   * @return whether the state machine has more states to execute
   * @throws HBaseIOException if the region is still in CLOSING state
   */
  private Flow confirmClosed(MasterProcedureEnv env, RegionStateNode regionNode)
    throws IOException {
    if (regionNode.isInState(State.CLOSED)) {
      // success, so drop the backoff counter used by executeFromState
      retryCounter = null;
      if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED) {
        // we are the last state, finish
        regionNode.unsetProcedure(this);
        return Flow.NO_MORE_STATE;
      }
      // This means we need to open the region again, should be a move or reopen
      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
      return Flow.HAS_MORE_STATE;
    }
    if (regionNode.isInState(State.CLOSING)) {
      // This is possible, think the target RS crashes and restarts immediately, the close region
      // operation will return a NotServingRegionException soon, we can only recover after SCP takes
      // care of this RS. So here we throw an IOException to let upper layer to retry with backoff.
      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE);
      throw new HBaseIOException("Failed to close region");
    }
    // abnormally closed, need to reopen it, no matter what is the last state, see the comment in
    // confirmOpened for more details that why we need to reopen the region first even if we just
    // want to close it.
    // The only exception is for non-default replica, where we do not need to deal with recovered
    // edits. Notice that the region will remain in ABNORMALLY_CLOSED state, the upper layer need to
    // deal with this state. For non-default replica, this is usually the same with CLOSED.
    assert regionNode.isInState(State.ABNORMALLY_CLOSED);
    if (
      !RegionReplicaUtil.isDefaultReplica(getRegion())
        && lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED
    ) {
      regionNode.unsetProcedure(this);
      return Flow.NO_MORE_STATE;
    }
    retryCounter = null;
    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
    return Flow.HAS_MORE_STATE;
  }
440
  /**
   * Override to lock RegionStateNode.
   * <p/>
   * Acquires the lock on the region's RegionStateNode unless this procedure already holds it,
   * runs the normal state machine execution, and releases the lock afterwards. The lock is
   * released only when no future is stored, so it stays held across passes while a future is
   * pending.
   */
  @SuppressWarnings("rawtypes")
  @Override
  protected Procedure[] execute(MasterProcedureEnv env)
    throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
    RegionStateNode regionNode =
      env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion());
    if (!regionNode.isLockedBy(this)) {
      // NOTE(review): the callback presumably wakes this procedure up once the lock becomes
      // available, confirm in RegionStateNode.lock.
      regionNode.lock(this, () -> ProcedureFutureUtil.wakeUp(this, env));
    }
    try {
      return super.execute(env);
    } finally {
      if (future == null) {
        // release the lock if there is no pending updating meta operation
        regionNode.unlock(this);
      }
    }
  }
460
  /** Looks up the RegionStateNode for this procedure's region, creating it if it is absent. */
  private RegionStateNode getRegionStateNode(MasterProcedureEnv env) {
    return env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion());
  }
464
  /**
   * Dispatches to the handler of the given state. Any {@link IOException} thrown by a handler is
   * turned into a suspension with backoff, after which the state set via {@code setNextState} (or
   * the current one) is executed again.
   * @param env   the master procedure environment
   * @param state the state to execute
   * @return whether there are more states to execute
   * @throws ProcedureSuspendedException   when a handler suspends, or when we back off after an
   *                                       IOException
   * @throws UnsupportedOperationException for a state this procedure does not know
   */
  @Override
  protected Flow executeFromState(MasterProcedureEnv env, RegionStateTransitionState state)
    throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
    RegionStateNode regionNode = getRegionStateNode(env);
    try {
      switch (state) {
        case REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE:
          // Need to do some sanity check for replica region, if the region does not exist at
          // master, do not try to assign the replica region, log error and return.
          if (!RegionReplicaUtil.isDefaultReplica(regionNode.getRegionInfo())) {
            RegionInfo defaultRI =
              RegionReplicaUtil.getRegionInfoForDefaultReplica(regionNode.getRegionInfo());
            if (
              env.getMasterServices().getAssignmentManager().getRegionStates()
                .getRegionStateNode(defaultRI) == null
            ) {
              LOG.error(
                "Cannot assign replica region {} because its primary region {} does not exist.",
                regionNode.getRegionInfo(), defaultRI);
              regionNode.unsetProcedure(this);
              return Flow.NO_MORE_STATE;
            }
          }
          queueAssign(env, regionNode);
          return Flow.HAS_MORE_STATE;
        case REGION_STATE_TRANSITION_OPEN:
          openRegion(env, regionNode);
          return Flow.HAS_MORE_STATE;
        case REGION_STATE_TRANSITION_CONFIRM_OPENED:
          return confirmOpened(env, regionNode);
        case REGION_STATE_TRANSITION_CLOSE:
          closeRegion(env, regionNode);
          return Flow.HAS_MORE_STATE;
        case REGION_STATE_TRANSITION_CONFIRM_CLOSED:
          return confirmClosed(env, regionNode);
        default:
          throw new UnsupportedOperationException("unhandled state=" + state);
      }
    } catch (IOException e) {
      // The counter is created on the first failure and kept across retries; the confirm
      // handlers reset it to null.
      if (retryCounter == null) {
        retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
      }
      long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
      LOG.warn(
        "Failed transition, suspend {}secs {}; {}; waiting on rectified condition fixed "
          + "by other Procedure or operator intervention",
        backoff / 1000, this, regionNode.toShortString(), e);
      throw suspend(Math.toIntExact(backoff), true);
    }
  }
515
  /**
   * At end of timeout, wake ourselves up so we run again. Sets the procedure state back to
   * RUNNABLE and puts this procedure at the front of the scheduler queue.
   * @return always false, meaning this procedure handled the timeout itself
   */
  @Override
  protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
    setState(ProcedureProtos.ProcedureState.RUNNABLE);
    env.getProcedureScheduler().addFront(this);
    return false; // 'false' means that this procedure handled the timeout
  }
525
526  // Should be called with RegionStateNode locked
527  public void reportTransition(MasterProcedureEnv env, RegionStateNode regionNode,
528    ServerName serverName, TransitionCode code, long seqId, long procId) throws IOException {
529    if (remoteProc == null) {
530      LOG.warn(
531        "There is no outstanding remote region procedure for {}, serverName={}, code={},"
532          + " seqId={}, proc={}, should be a retry, ignore",
533        regionNode, serverName, code, seqId, this);
534      return;
535    }
536    // The procId could be -1 if it is from an old region server, we need to deal with it so that we
537    // can do rolling upgraing.
538    if (procId >= 0 && remoteProc.getProcId() != procId) {
539      LOG.warn(
540        "The pid of remote region procedure for {} is {}, the reported pid={}, serverName={},"
541          + " code={}, seqId={}, proc={}, should be a retry, ignore",
542        regionNode, remoteProc.getProcId(), procId, serverName, code, seqId, this);
543      return;
544    }
545    remoteProc.reportTransition(env, regionNode, serverName, code, seqId);
546  }
547
548  // Should be called with RegionStateNode locked
549  public CompletableFuture<Void> serverCrashed(MasterProcedureEnv env, RegionStateNode regionNode,
550    ServerName serverName, boolean forceNewPlan) {
551    this.forceNewPlan = forceNewPlan;
552    if (remoteProc != null) {
553      // this means we are waiting for the sub procedure, so wake it up
554      try {
555        remoteProc.serverCrashed(env, regionNode, serverName);
556      } catch (Exception e) {
557        return FutureUtils.failedFuture(e);
558      }
559      return CompletableFuture.completedFuture(null);
560    } else {
561      if (regionNode.isInState(State.ABNORMALLY_CLOSED)) {
562        // should be a retry, where we have already changed the region state to abnormally closed
563        return CompletableFuture.completedFuture(null);
564      } else {
565        // we are in RUNNING state, just update the region state, and we will process it later.
566        return env.getAssignmentManager().regionClosedAbnormally(regionNode);
567      }
568    }
569  }
570
  /** Records the given remote sub procedure as the one reports and crashes are forwarded to. */
  void attachRemoteProc(RegionRemoteProcedureBase proc) {
    this.remoteProc = proc;
  }
574
  /**
   * Clears the attached remote sub procedure. The given procedure is expected to be the one
   * currently attached (checked with an assertion only).
   */
  void unattachRemoteProc(RegionRemoteProcedureBase proc) {
    assert this.remoteProc == proc;
    this.remoteProc = null;
  }
579
580  // will be called after we finish loading the meta entry for this region.
581  // used to change the state of the region node if we have a sub procedure, as we may not persist
582  // the state to meta yet. See the code in RegionRemoteProcedureBase.execute for more details.
583  void stateLoaded(AssignmentManager am, RegionStateNode regionNode) {
584    if (remoteProc != null) {
585      remoteProc.stateLoaded(am, regionNode);
586    }
587  }
588
  /**
   * Rollback is not supported by this procedure.
   * @throws UnsupportedOperationException always
   */
  @Override
  protected void rollbackState(MasterProcedureEnv env, RegionStateTransitionState state)
    throws IOException, InterruptedException {
    // no rollback
    throw new UnsupportedOperationException();
  }
595
  /** Maps a numeric state id to the protobuf state enum via {@code forNumber}. */
  @Override
  protected RegionStateTransitionState getState(int stateId) {
    return RegionStateTransitionState.forNumber(stateId);
  }
600
  /** Maps the protobuf state enum to its numeric id via {@code getNumber}. */
  @Override
  protected int getStateId(RegionStateTransitionState state) {
    return state.getNumber();
  }
605
  /** Returns the first state of the state machine, as derived from the transition type. */
  @Override
  protected RegionStateTransitionState getInitialState() {
    return initialState;
  }
610
611  private static TransitionType convert(RegionTransitionType type) {
612    switch (type) {
613      case ASSIGN:
614        return TransitionType.ASSIGN;
615      case UNASSIGN:
616        return TransitionType.UNASSIGN;
617      case MOVE:
618        return TransitionType.MOVE;
619      case REOPEN:
620        return TransitionType.REOPEN;
621      default:
622        throw new IllegalArgumentException("Unknown RegionTransitionType: " + type);
623    }
624  }
625
626  private static RegionTransitionType convert(TransitionType type) {
627    switch (type) {
628      case ASSIGN:
629        return RegionTransitionType.ASSIGN;
630      case UNASSIGN:
631        return RegionTransitionType.UNASSIGN;
632      case MOVE:
633        return RegionTransitionType.MOVE;
634      case REOPEN:
635        return RegionTransitionType.REOPEN;
636      default:
637        throw new IllegalArgumentException("Unknown TransitionType: " + type);
638    }
639  }
640
641  @Override
642  protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {
643    super.serializeStateData(serializer);
644    RegionStateTransitionStateData.Builder builder =
645      RegionStateTransitionStateData.newBuilder().setType(convert(type))
646        .setForceNewPlan(forceNewPlan).setEvictCache(evictCache).setIsSplit(isSplit);
647    if (assignCandidate != null) {
648      builder.setAssignCandidate(ProtobufUtil.toServerName(assignCandidate));
649    }
650    serializer.serialize(builder.build());
651  }
652
653  @Override
654  protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
655    super.deserializeStateData(serializer);
656    RegionStateTransitionStateData data =
657      serializer.deserialize(RegionStateTransitionStateData.class);
658    type = convert(data.getType());
659    setInitialAndLastState();
660    forceNewPlan = data.getForceNewPlan();
661    if (data.hasAssignCandidate()) {
662      assignCandidate = ProtobufUtil.toServerName(data.getAssignCandidate());
663    }
664    evictCache = data.getEvictCache();
665    isSplit = data.getIsSplit();
666  }
667
668  @Override
669  protected ProcedureMetrics getProcedureMetrics(MasterProcedureEnv env) {
670    MetricsAssignmentManager metrics = env.getAssignmentManager().getAssignmentManagerMetrics();
671    switch (type) {
672      case ASSIGN:
673        return metrics.getAssignProcMetrics();
674      case UNASSIGN:
675        return metrics.getUnassignProcMetrics();
676      case MOVE:
677        return metrics.getMoveProcMetrics();
678      case REOPEN:
679        return metrics.getReopenProcMetrics();
680      default:
681        throw new IllegalArgumentException("Unknown transition type: " + type);
682    }
683  }
684
685  @Override
686  public void toStringClassDetails(StringBuilder sb) {
687    super.toStringClassDetails(sb);
688    if (initialState == RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE) {
689      sb.append(", ASSIGN");
690    } else if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED) {
691      sb.append(", UNASSIGN");
692    } else {
693      sb.append(", REOPEN/MOVE");
694    }
695  }
696
697  private static TransitRegionStateProcedure setOwner(MasterProcedureEnv env,
698    TransitRegionStateProcedure proc) {
699    proc.setOwner(env.getRequestUser().getShortName());
700    return proc;
701  }
702
  /** The kinds of region transition this procedure can perform. */
  public enum TransitionType {
    // GET_ASSIGN_CANDIDATE up to CONFIRM_OPENED
    ASSIGN,
    // CLOSE up to CONFIRM_CLOSED
    UNASSIGN,
    // CLOSE up to CONFIRM_OPENED
    MOVE,
    // CLOSE up to CONFIRM_OPENED; the assign candidate is taken from the current region location
    REOPEN
  }
709
  // Be careful that, when you call the factory methods below, you need to manually attach the
  // returned procedure with the RegionStateNode, otherwise the procedure will quit immediately
  // without doing anything. See the comment in executeFromState to find out why we need this
  // assumption.
  /**
   * Creates an ASSIGN procedure without forcing a new plan.
   * @param targetServer the preferred server, may be null
   */
  public static TransitRegionStateProcedure assign(MasterProcedureEnv env, RegionInfo region,
    @Nullable ServerName targetServer) {
    return assign(env, region, false, targetServer);
  }
717
718  public static TransitRegionStateProcedure assign(MasterProcedureEnv env, RegionInfo region,
719    boolean forceNewPlan, @Nullable ServerName targetServer) {
720    return setOwner(env, new TransitRegionStateProcedure(env, region, targetServer, forceNewPlan,
721      TransitionType.ASSIGN));
722  }
723
724  public static TransitRegionStateProcedure unassign(MasterProcedureEnv env, RegionInfo region) {
725    return setOwner(env,
726      new TransitRegionStateProcedure(env, region, null, false, TransitionType.UNASSIGN));
727  }
728
729  public static TransitRegionStateProcedure unassignSplitMerge(MasterProcedureEnv env,
730    RegionInfo region) {
731    return setOwner(env,
732      new TransitRegionStateProcedure(env, region, null, false, TransitionType.UNASSIGN, true));
733  }
734
735  public static TransitRegionStateProcedure reopen(MasterProcedureEnv env, RegionInfo region) {
736    return setOwner(env,
737      new TransitRegionStateProcedure(env, region, null, false, TransitionType.REOPEN));
738  }
739
740  public static TransitRegionStateProcedure move(MasterProcedureEnv env, RegionInfo region,
741    @Nullable ServerName targetServer) {
742    return setOwner(env, new TransitRegionStateProcedure(env, region, targetServer,
743      targetServer == null, TransitionType.MOVE));
744  }
745}