001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.assignment;
019
020import static org.apache.hadoop.hbase.io.hfile.CacheConfig.DEFAULT_EVICT_ON_CLOSE;
021import static org.apache.hadoop.hbase.io.hfile.CacheConfig.DEFAULT_EVICT_ON_SPLIT;
022import static org.apache.hadoop.hbase.io.hfile.CacheConfig.EVICT_BLOCKS_ON_CLOSE_KEY;
023import static org.apache.hadoop.hbase.io.hfile.CacheConfig.EVICT_BLOCKS_ON_SPLIT_KEY;
024import static org.apache.hadoop.hbase.master.LoadBalancer.BOGUS_SERVER_NAME;
025import static org.apache.hadoop.hbase.master.assignment.AssignmentManager.FORCE_REGION_RETAINMENT;
026
027import edu.umd.cs.findbugs.annotations.Nullable;
028import java.io.IOException;
029import java.util.concurrent.CompletableFuture;
030import java.util.concurrent.TimeUnit;
031import org.apache.hadoop.hbase.HBaseIOException;
032import org.apache.hadoop.hbase.ServerName;
033import org.apache.hadoop.hbase.TableName;
034import org.apache.hadoop.hbase.client.RegionInfo;
035import org.apache.hadoop.hbase.client.RegionReplicaUtil;
036import org.apache.hadoop.hbase.client.RetriesExhaustedException;
037import org.apache.hadoop.hbase.master.MetricsAssignmentManager;
038import org.apache.hadoop.hbase.master.RegionState.State;
039import org.apache.hadoop.hbase.master.ServerManager;
040import org.apache.hadoop.hbase.master.procedure.AbstractStateMachineRegionProcedure;
041import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
042import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
043import org.apache.hadoop.hbase.procedure2.ProcedureFutureUtil;
044import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
045import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
046import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
047import org.apache.hadoop.hbase.procedure2.ProcedureUtil;
048import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
049import org.apache.hadoop.hbase.util.FutureUtils;
050import org.apache.hadoop.hbase.util.RetryCounter;
051import org.apache.yetus.audience.InterfaceAudience;
052import org.slf4j.Logger;
053import org.slf4j.LoggerFactory;
054
055import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
056import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionStateTransitionState;
057import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionStateTransitionStateData;
058import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionType;
059import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;
060import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
061
062/**
063 * The procedure to deal with the state transition of a region. A region with a TRSP in place is
064 * called RIT, i.e, RegionInTransition.
065 * <p/>
066 * It can be used to assign/unassign/reopen/move a region, and for
067 * {@link #unassign(MasterProcedureEnv, RegionInfo)} and
068 * {@link #reopen(MasterProcedureEnv, RegionInfo)}, you do not need to specify a target server, and
069 * for {@link #assign(MasterProcedureEnv, RegionInfo, ServerName)} and
070 * {@link #move(MasterProcedureEnv, RegionInfo, ServerName)}, if you want to you can provide a
071 * target server. And for {@link #move(MasterProcedureEnv, RegionInfo, ServerName)}, if you do not
072 * specify a targetServer, we will select one randomly.
073 * <p/>
074 * <p/>
075 * The typical state transition for assigning a region is:
076 *
077 * <pre>
078 * GET_ASSIGN_CANDIDATE ------> OPEN -----> CONFIRM_OPENED
079 * </pre>
080 *
081 * Notice that, if there are failures we may go back to the {@code GET_ASSIGN_CANDIDATE} state to
082 * try again.
083 * <p/>
084 * The typical state transition for unassigning a region is:
085 *
086 * <pre>
087 * CLOSE -----> CONFIRM_CLOSED
088 * </pre>
089 *
090 * Here things go a bit different, if there are failures, especially that if there is a server
091 * crash, we will go to the {@code GET_ASSIGN_CANDIDATE} state to bring the region online first, and
092 * then go through the normal way to unassign it.
093 * <p/>
094 * The typical state transition for reopening/moving a region is:
095 *
096 * <pre>
097 * CLOSE -----> CONFIRM_CLOSED -----> GET_ASSIGN_CANDIDATE ------> OPEN -----> CONFIRM_OPENED
098 * </pre>
099 *
100 * The retry logic is the same with the above assign/unassign.
101 * <p/>
102 * Notice that, although we allow specify a target server, it just acts as a candidate, we do not
103 * guarantee that the region will finally be on the target server. If this is important for you, you
104 * should check whether the region is on the target server after the procedure is finished.
 * <p/>
 * Alternatively, to try retaining assignments, the
 * <b>hbase.master.scp.retain.assignment.force</b> option can be used together with
 * <b>hbase.master.scp.retain.assignment</b>.
109 * <p/>
110 * When you want to schedule a TRSP, please check whether there is still one for this region, and
111 * the check should be under the RegionStateNode lock. We will remove the TRSP from a
112 * RegionStateNode when we are done, see the code in {@code reportTransition} method below. There
 * could be at most one TRSP for a given region.
114 */
115@InterfaceAudience.Private
116public class TransitRegionStateProcedure
117  extends AbstractStateMachineRegionProcedure<RegionStateTransitionState> {
118
  private static final Logger LOG = LoggerFactory.getLogger(TransitRegionStateProcedure.class);

  // The kind of transition (ASSIGN/UNASSIGN/MOVE/REOPEN) this procedure performs; written to and
  // read back from the serialized state data.
  private TransitionType type;

  // The state the state machine starts from; derived from type in setInitialAndLastState.
  private RegionStateTransitionState initialState;

  // The state at which the procedure may finish; derived from type in setInitialAndLastState.
  private RegionStateTransitionState lastState;

  // the candidate where we want to assign the region to.
  private ServerName assignCandidate;

  // When true, queueAssign clears the region location instead of using the candidate/last host.
  private boolean forceNewPlan;

  // Backoff counter used when a state throws IOException; lazily created in executeFromState and
  // reset to null in confirmOpened/confirmClosed.
  private RetryCounter retryCounter;

  // The currently attached remote sub procedure, if any; see attachRemoteProc/unattachRemoteProc.
  private RegionRemoteProcedureBase remoteProc;

  // Read from EVICT_BLOCKS_ON_CLOSE_KEY in the master configuration by the env-taking
  // constructor; also part of the serialized state data.
  private boolean evictCache;

  // Whether the close is issued on behalf of a split/merge; handed to CloseRegionProcedure.
  private boolean isSplit;

  // Retry counter used while waiting for the previous host when force region retainment is
  // enabled. It is not part of the serialized state data.
  private RetryCounter forceRetainmentRetryCounter;

  // Accumulated backoff, in milliseconds, spent waiting for the previous host; only logged.
  private long forceRetainmentTotalWait;

  // Future of a pending meta update, or null when there is none; afterExec only releases the
  // RegionStateNode lock while this is null.
  private CompletableFuture<Void> future;
145
  /**
   * No-arg constructor; leaves every field at its default value. The fields that are part of the
   * serialized state data are filled in by
   * {@link #deserializeStateData(ProcedureStateSerializer)}.
   */
  public TransitRegionStateProcedure() {
  }
148
149  private void setInitialAndLastState() {
150    switch (type) {
151      case ASSIGN:
152        initialState = RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE;
153        lastState = RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED;
154        break;
155      case UNASSIGN:
156        initialState = RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE;
157        lastState = RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED;
158        break;
159      case MOVE:
160      case REOPEN:
161        initialState = RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE;
162        lastState = RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED;
163        break;
164      default:
165        throw new IllegalArgumentException("Unknown TransitionType: " + type);
166    }
167  }
168
169  protected TransitRegionStateProcedure(MasterProcedureEnv env, RegionInfo hri,
170    ServerName assignCandidate, boolean forceNewPlan, TransitionType type) {
171    super(env, hri);
172    this.assignCandidate = assignCandidate;
173    this.forceNewPlan = forceNewPlan;
174    this.type = type;
175    setInitialAndLastState();
176
177    // when do reopen TRSP, let the rs know the targetServer so it can keep some info on close
178    if (type == TransitionType.REOPEN) {
179      this.assignCandidate = getRegionStateNode(env).getRegionLocation();
180    }
181    evictCache =
182      env.getMasterConfiguration().getBoolean(EVICT_BLOCKS_ON_CLOSE_KEY, DEFAULT_EVICT_ON_CLOSE);
183    initForceRetainmentRetryCounter(env);
184  }
185
186  private void initForceRetainmentRetryCounter(MasterProcedureEnv env) {
187    if (env.getAssignmentManager().isForceRegionRetainment()) {
188      forceRetainmentRetryCounter =
189        new RetryCounter(env.getAssignmentManager().getForceRegionRetainmentRetries(),
190          env.getAssignmentManager().getForceRegionRetainmentWaitInterval(), TimeUnit.MILLISECONDS);
191      forceRetainmentTotalWait = 0;
192    }
193  }
194
  /**
   * Same as the five-argument constructor, additionally recording whether this transition is
   * issued on behalf of a split/merge.
   * @param isSplit value stored in {@link #isSplit} and later passed to the close sub procedure
   */
  protected TransitRegionStateProcedure(MasterProcedureEnv env, RegionInfo hri,
    ServerName assignCandidate, boolean forceNewPlan, TransitionType type, boolean isSplit) {
    this(env, hri, assignCandidate, forceNewPlan, type);
    this.isSplit = isSplit;
  }
200
  /**
   * Always reports {@link TableOperationType#REGION_EDIT}, regardless of the transition type.
   */
  @Override
  public TableOperationType getTableOperationType() {
    // TODO: maybe we should make another type here, REGION_TRANSITION?
    return TableOperationType.REGION_EDIT;
  }
206
207  @Override
208  protected boolean waitInitialized(MasterProcedureEnv env) {
209    if (isCriticalSystemTable()) {
210      return false;
211    }
212    if (TableName.isMetaTableName(getTableName())) {
213      return false;
214    }
215    // First we need meta to be loaded, and second, if meta is not online then we will likely to
216    // fail when updating meta so we wait until it is assigned.
217    AssignmentManager am = env.getAssignmentManager();
218    return am.waitMetaLoaded(this) || am.waitMetaAssigned(this, getRegion());
219  }
220
221  private void checkAndWaitForOriginalServer(MasterProcedureEnv env, ServerName lastHost)
222    throws ProcedureSuspendedException {
223    ServerManager serverManager = env.getMasterServices().getServerManager();
224    ServerName newNameForServer = serverManager.findServerWithSameHostnamePortWithLock(lastHost);
225    boolean isOnline = serverManager.createDestinationServersList().contains(newNameForServer);
226
227    if (!isOnline && forceRetainmentRetryCounter.shouldRetry()) {
228      int backoff =
229        Math.toIntExact(forceRetainmentRetryCounter.getBackoffTimeAndIncrementAttempts());
230      forceRetainmentTotalWait += backoff;
231      LOG.info(
232        "Suspending the TRSP PID={} for {}ms because {} is true and previous host {} "
233          + "for region is not yet online.",
234        this.getProcId(), backoff, FORCE_REGION_RETAINMENT, lastHost);
235      setTimeout(backoff);
236      setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
237      throw new ProcedureSuspendedException();
238    }
239    LOG.info(
240      "{} is true. TRSP PID={} waited {}ms for host {} to come back online. "
241        + "Did host come back online? {}",
242      FORCE_REGION_RETAINMENT, this.getProcId(), forceRetainmentTotalWait, lastHost, isOnline);
243    initForceRetainmentRetryCounter(env);
244  }
245
246  private void queueAssign(MasterProcedureEnv env, RegionStateNode regionNode)
247    throws ProcedureSuspendedException {
248    boolean retain = false;
249    if (forceNewPlan) {
250      // set the region location to null if forceNewPlan is true
251      regionNode.setRegionLocation(null);
252    } else {
253      if (assignCandidate != null) {
254        retain = assignCandidate.equals(regionNode.getLastHost());
255        regionNode.setRegionLocation(assignCandidate);
256      } else if (regionNode.getLastHost() != null) {
257        retain = true;
258        LOG.info("Setting lastHost {} as the location for region {}", regionNode.getLastHost(),
259          regionNode.getRegionInfo().getEncodedName());
260        regionNode.setRegionLocation(regionNode.getLastHost());
261      }
262      if (
263        regionNode.getRegionLocation() != null
264          && env.getAssignmentManager().isForceRegionRetainment()
265      ) {
266        LOG.warn("{} is set to true. This may delay regions re-assignment "
267          + "upon RegionServers crashes or restarts.", FORCE_REGION_RETAINMENT);
268        checkAndWaitForOriginalServer(env, regionNode.getRegionLocation());
269      }
270    }
271    LOG.info("Starting {}; {}; forceNewPlan={}, retain={}", this, regionNode.toShortString(),
272      forceNewPlan, retain);
273    env.getAssignmentManager().queueAssign(regionNode);
274    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_OPEN);
275    if (regionNode.getProcedureEvent().suspendIfNotReady(this)) {
276      throw new ProcedureSuspendedException();
277    }
278  }
279
  /** Returns the future of the pending meta update, or null if there is none. */
  private CompletableFuture<Void> getFuture() {
    return future;
  }
283
  /** Records (or, with null, clears) the future of the pending meta update. */
  private void setFuture(CompletableFuture<Void> f) {
    future = f;
  }
287
288  private void openRegionAfterUpdatingMeta(ServerName loc) {
289    addChildProcedure(new OpenRegionProcedure(this, getRegion(), loc));
290    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED);
291  }
292
  /**
   * Handles the OPEN state: marks the region as opening via the assignment manager and then
   * schedules the open sub procedure through {@link #openRegionAfterUpdatingMeta(ServerName)}.
   * @throws IOException                 if the region node has no usable location; the next state
   *                                     is set back to GET_ASSIGN_CANDIDATE before throwing
   * @throws ProcedureSuspendedException declared because the ProcedureFutureUtil helpers may
   *                                     suspend the procedure
   */
  private void openRegion(MasterProcedureEnv env, RegionStateNode regionNode)
    throws IOException, ProcedureSuspendedException {
    ServerName loc = regionNode.getRegionLocation();
    // NOTE(review): presumably checkFuture returns true when an earlier execution already left a
    // future behind and it has been dealt with here -- confirm against ProcedureFutureUtil.
    if (
      ProcedureFutureUtil.checkFuture(this, this::getFuture, this::setFuture,
        () -> openRegionAfterUpdatingMeta(loc))
    ) {
      return;
    }
    if (loc == null || BOGUS_SERVER_NAME.equals(loc)) {
      LOG.warn("No location specified for {}, jump back to state {} to get one", getRegion(),
        RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
      // the IOException makes executeFromState back off before retrying
      throw new HBaseIOException("Failed to open region, the location is null or bogus.");
    }
    ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture,
      env.getAssignmentManager().regionOpening(regionNode), env,
      () -> openRegionAfterUpdatingMeta(loc));
  }
312
313  private void regionFailedOpenAfterUpdatingMeta(MasterProcedureEnv env,
314    RegionStateNode regionNode) {
315    setFailure(getClass().getSimpleName(), new RetriesExhaustedException(
316      "Max attempts " + env.getAssignmentManager().getAssignMaxAttempts() + " exceeded"));
317    regionNode.unsetProcedure(this);
318  }
319
320  private Flow confirmOpened(MasterProcedureEnv env, RegionStateNode regionNode)
321    throws IOException, ProcedureSuspendedException {
322    if (
323      ProcedureFutureUtil.checkFuture(this, this::getFuture, this::setFuture,
324        () -> regionFailedOpenAfterUpdatingMeta(env, regionNode))
325    ) {
326      return Flow.NO_MORE_STATE;
327    }
328    if (regionNode.isInState(State.OPEN)) {
329      retryCounter = null;
330      if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED) {
331        // we are the last state, finish
332        regionNode.unsetProcedure(this);
333        ServerCrashProcedure.updateProgress(env, getParentProcId());
334        return Flow.NO_MORE_STATE;
335      }
336      // It is possible that we arrive here but confirm opened is not the last state, for example,
337      // when merging or splitting a region, we unassign the region from a RS and the RS is crashed,
338      // then there will be recovered edits for this region, we'd better make the region online
339      // again and then unassign it, otherwise we have to fail the merge/split procedure as we may
340      // loss data.
341      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE);
342      return Flow.HAS_MORE_STATE;
343    }
344
345    int retries = env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode)
346      .incrementAndGetRetries();
347    int maxAttempts = env.getAssignmentManager().getAssignMaxAttempts();
348    LOG.info("Retry={} of max={}; {}; {}", retries, maxAttempts, this, regionNode.toShortString());
349
350    if (retries >= maxAttempts) {
351      ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture,
352        env.getAssignmentManager().regionFailedOpen(regionNode, true), env,
353        () -> regionFailedOpenAfterUpdatingMeta(env, regionNode));
354      return Flow.NO_MORE_STATE;
355    }
356
357    // if not giving up, we will not update meta, so the returned CompletableFuture should be a fake
358    // one, which should have been completed already
359    CompletableFuture<Void> future = env.getAssignmentManager().regionFailedOpen(regionNode, false);
360    assert future.isDone();
361    // we failed to assign the region, force a new plan
362    forceNewPlan = true;
363    regionNode.setRegionLocation(null);
364    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
365
366    if (retries > env.getAssignmentManager().getAssignRetryImmediatelyMaxAttempts()) {
367      // Throw exception to backoff and retry when failed open too many times
368      throw new HBaseIOException(
369        "Failed confirm OPEN of " + regionNode + " (remote log may yield more detail on why).");
370    } else {
371      // Here we do not throw exception because we want to the region to be online ASAP
372      return Flow.HAS_MORE_STATE;
373    }
374  }
375
376  private void closeRegionAfterUpdatingMeta(MasterProcedureEnv env, RegionStateNode regionNode) {
377    LOG.debug("Close region: isSplit: {}: evictOnSplit: {}: evictOnClose: {}", isSplit,
378      env.getMasterConfiguration().getBoolean(EVICT_BLOCKS_ON_SPLIT_KEY, DEFAULT_EVICT_ON_SPLIT),
379      evictCache);
380    // Splits/Merges are special cases, rather than deciding on the cache eviction behaviour here at
381    // Master, we just need to tell this close is for a split/merge and let RSes decide on the
382    // eviction. See HBASE-28811 for more context.
383    CloseRegionProcedure closeProc = new CloseRegionProcedure(this, getRegion(),
384      regionNode.getRegionLocation(), assignCandidate, isSplit);
385    addChildProcedure(closeProc);
386    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED);
387  }
388
  /**
   * Handles the CLOSE state. When the region is in a state it can be closed from, marks it as
   * closing via the assignment manager and then schedules the close sub procedure through
   * {@link #closeRegionAfterUpdatingMeta(MasterProcedureEnv, RegionStateNode)}. Otherwise forces a
   * new plan and goes to GET_ASSIGN_CANDIDATE.
   * @throws ProcedureSuspendedException declared because the ProcedureFutureUtil helpers may
   *                                     suspend the procedure
   */
  private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode)
    throws IOException, ProcedureSuspendedException {
    // NOTE(review): presumably checkFuture returns true when an earlier execution already left a
    // future behind and it has been dealt with here -- confirm against ProcedureFutureUtil.
    if (
      ProcedureFutureUtil.checkFuture(this, this::getFuture, this::setFuture,
        () -> closeRegionAfterUpdatingMeta(env, regionNode))
    ) {
      return;
    }
    if (regionNode.isInState(State.OPEN, State.CLOSING, State.MERGING, State.SPLITTING)) {
      // this is the normal case
      ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture,
        env.getAssignmentManager().regionClosing(regionNode), env,
        () -> closeRegionAfterUpdatingMeta(env, regionNode));
    } else {
      // the region is in none of the closable states: drop the location and assign it first
      forceNewPlan = true;
      regionNode.setRegionLocation(null);
      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
    }
  }
408
409  private Flow confirmClosed(MasterProcedureEnv env, RegionStateNode regionNode)
410    throws IOException {
411    if (regionNode.isInState(State.CLOSED)) {
412      retryCounter = null;
413      if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED) {
414        // we are the last state, finish
415        regionNode.unsetProcedure(this);
416        return Flow.NO_MORE_STATE;
417      }
418      // This means we need to open the region again, should be a move or reopen
419      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
420      return Flow.HAS_MORE_STATE;
421    }
422    if (regionNode.isInState(State.CLOSING)) {
423      // This is possible, think the target RS crashes and restarts immediately, the close region
424      // operation will return a NotServingRegionException soon, we can only recover after SCP takes
425      // care of this RS. So here we throw an IOException to let upper layer to retry with backoff.
426      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE);
427      throw new HBaseIOException("Failed to close region");
428    }
429    // abnormally closed, need to reopen it, no matter what is the last state, see the comment in
430    // confirmOpened for more details that why we need to reopen the region first even if we just
431    // want to close it.
432    // The only exception is for non-default replica, where we do not need to deal with recovered
433    // edits. Notice that the region will remain in ABNORMALLY_CLOSED state, the upper layer need to
434    // deal with this state. For non-default replica, this is usually the same with CLOSED.
435    assert regionNode.isInState(State.ABNORMALLY_CLOSED);
436    if (
437      !RegionReplicaUtil.isDefaultReplica(getRegion())
438        && lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED
439    ) {
440      regionNode.unsetProcedure(this);
441      return Flow.NO_MORE_STATE;
442    }
443    retryCounter = null;
444    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
445    return Flow.HAS_MORE_STATE;
446  }
447
448  @Override
449  protected void beforeExec(MasterProcedureEnv env) throws ProcedureSuspendedException {
450    RegionStateNode regionNode =
451      env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion());
452    if (!regionNode.isLockedBy(this)) {
453      // The wake up action will be called under the lock inside RegionStateNode for implementing
454      // RegionStateNodeLock, so if we call ProcedureUtil.wakeUp where we will acquire the procedure
455      // execution lock directly, it may cause dead lock since in normal case procedure execution
456      // case, we will acquire the procedure execution lock first and then acquire the lock inside
457      // RegionStateNodeLock. This is the reason why we need to schedule the task to a thread pool
458      // and execute asynchronously.
459      regionNode.lock(this,
460        () -> env.getAsyncTaskExecutor().execute(() -> ProcedureFutureUtil.wakeUp(this, env)));
461    }
462  }
463
464  @Override
465  protected void afterExec(MasterProcedureEnv env) {
466    // only release the lock if there is no pending updating meta operation
467    if (future == null) {
468      RegionStateNode regionNode =
469        env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion());
470      // in beforeExec, we may throw ProcedureSuspendedException which means we do not get the lock,
471      // in this case we should not call unlock
472      if (regionNode.isLockedBy(this)) {
473        regionNode.unlock(this);
474      }
475    }
476  }
477
  /** Looks up, creating it if necessary, the RegionStateNode of the region of this procedure. */
  private RegionStateNode getRegionStateNode(MasterProcedureEnv env) {
    return env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion());
  }
481
  /**
   * Dispatches to the handler of the given state. Any {@link IOException} thrown by a handler is
   * turned into a suspension with a backoff taken from {@link #retryCounter}, which is created on
   * first use.
   * @return whether the state machine has more states to execute
   * @throws UnsupportedOperationException if {@code state} is not one of the handled states
   */
  @Override
  protected Flow executeFromState(MasterProcedureEnv env, RegionStateTransitionState state)
    throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
    RegionStateNode regionNode = getRegionStateNode(env);
    try {
      switch (state) {
        case REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE:
          // Need to do some sanity check for replica region, if the region does not exist at
          // master, do not try to assign the replica region, log error and return.
          if (!RegionReplicaUtil.isDefaultReplica(regionNode.getRegionInfo())) {
            RegionInfo defaultRI =
              RegionReplicaUtil.getRegionInfoForDefaultReplica(regionNode.getRegionInfo());
            if (
              env.getMasterServices().getAssignmentManager().getRegionStates()
                .getRegionStateNode(defaultRI) == null
            ) {
              LOG.error(
                "Cannot assign replica region {} because its primary region {} does not exist.",
                regionNode.getRegionInfo(), defaultRI);
              regionNode.unsetProcedure(this);
              return Flow.NO_MORE_STATE;
            }
          }
          queueAssign(env, regionNode);
          return Flow.HAS_MORE_STATE;
        case REGION_STATE_TRANSITION_OPEN:
          openRegion(env, regionNode);
          return Flow.HAS_MORE_STATE;
        case REGION_STATE_TRANSITION_CONFIRM_OPENED:
          return confirmOpened(env, regionNode);
        case REGION_STATE_TRANSITION_CLOSE:
          closeRegion(env, regionNode);
          return Flow.HAS_MORE_STATE;
        case REGION_STATE_TRANSITION_CONFIRM_CLOSED:
          return confirmClosed(env, regionNode);
        default:
          throw new UnsupportedOperationException("unhandled state=" + state);
      }
    } catch (IOException e) {
      // the handlers have already set the state to resume from; here we only back off
      if (retryCounter == null) {
        retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
      }
      long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
      // backoff is divided by 1000 because the message reports seconds
      LOG.warn(
        "Failed transition, suspend {}secs {}; {}; waiting on rectified condition fixed "
          + "by other Procedure or operator intervention",
        backoff / 1000, this, regionNode.toShortString(), e);
      throw suspend(Math.toIntExact(backoff), true);
    }
  }
532
  /**
   * At end of timeout, wake ourselves up so we run again: the procedure is put back into the
   * RUNNABLE state and added to the front of the procedure scheduler.
   * @return always {@code false}, meaning that this procedure handled the timeout itself
   */
  @Override
  protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
    setState(ProcedureProtos.ProcedureState.RUNNABLE);
    env.getProcedureScheduler().addFront(this);
    return false; // 'false' means that this procedure handled the timeout
  }
542
543  // Should be called with RegionStateNode locked
544  public void reportTransition(MasterProcedureEnv env, RegionStateNode regionNode,
545    ServerName serverName, TransitionCode code, long seqId, long procId) throws IOException {
546    if (remoteProc == null) {
547      LOG.warn(
548        "There is no outstanding remote region procedure for {}, serverName={}, code={},"
549          + " seqId={}, proc={}, should be a retry, ignore",
550        regionNode, serverName, code, seqId, this);
551      return;
552    }
553    // The procId could be -1 if it is from an old region server, we need to deal with it so that we
554    // can do rolling upgraing.
555    if (procId >= 0 && remoteProc.getProcId() != procId) {
556      LOG.warn(
557        "The pid of remote region procedure for {} is {}, the reported pid={}, serverName={},"
558          + " code={}, seqId={}, proc={}, should be a retry, ignore",
559        regionNode, remoteProc.getProcId(), procId, serverName, code, seqId, this);
560      return;
561    }
562    remoteProc.reportTransition(env, regionNode, serverName, code, seqId);
563  }
564
565  // Should be called with RegionStateNode locked
566  public CompletableFuture<Void> serverCrashed(MasterProcedureEnv env, RegionStateNode regionNode,
567    ServerName serverName, boolean forceNewPlan) {
568    this.forceNewPlan = forceNewPlan;
569    if (remoteProc != null) {
570      // this means we are waiting for the sub procedure, so wake it up
571      try {
572        remoteProc.serverCrashed(env, regionNode, serverName);
573      } catch (Exception e) {
574        return FutureUtils.failedFuture(e);
575      }
576      return CompletableFuture.completedFuture(null);
577    } else {
578      if (regionNode.isInState(State.ABNORMALLY_CLOSED)) {
579        // should be a retry, where we have already changed the region state to abnormally closed
580        return CompletableFuture.completedFuture(null);
581      } else {
582        // we are in RUNNING state, just update the region state, and we will process it later.
583        return env.getAssignmentManager().regionClosedAbnormally(regionNode);
584      }
585    }
586  }
587
  /** Records {@code proc} as the remote procedure that transition reports are forwarded to. */
  void attachRemoteProc(RegionRemoteProcedureBase proc) {
    this.remoteProc = proc;
  }
591
  /**
   * Clears the attached remote procedure. With assertions enabled, {@code proc} must be the
   * currently attached one.
   */
  void unattachRemoteProc(RegionRemoteProcedureBase proc) {
    assert this.remoteProc == proc;
    this.remoteProc = null;
  }
596
597  // will be called after we finish loading the meta entry for this region.
598  // used to change the state of the region node if we have a sub procedure, as we may not persist
599  // the state to meta yet. See the code in RegionRemoteProcedureBase.execute for more details.
600  void stateLoaded(AssignmentManager am, RegionStateNode regionNode) {
601    if (remoteProc != null) {
602      remoteProc.stateLoaded(am, regionNode);
603    }
604  }
605
  /**
   * Rollback is not supported by this procedure.
   * @throws UnsupportedOperationException always
   */
  @Override
  protected void rollbackState(MasterProcedureEnv env, RegionStateTransitionState state)
    throws IOException, InterruptedException {
    // no rollback
    throw new UnsupportedOperationException();
  }
612
  /** Maps a numeric state id to its {@link RegionStateTransitionState} via {@code forNumber}. */
  @Override
  protected RegionStateTransitionState getState(int stateId) {
    return RegionStateTransitionState.forNumber(stateId);
  }
617
  /** Maps a {@link RegionStateTransitionState} to its numeric id via {@code getNumber}. */
  @Override
  protected int getStateId(RegionStateTransitionState state) {
    return state.getNumber();
  }
622
  /** Returns the initial state that was derived from the transition type. */
  @Override
  protected RegionStateTransitionState getInitialState() {
    return initialState;
  }
627
628  private static TransitionType convert(RegionTransitionType type) {
629    switch (type) {
630      case ASSIGN:
631        return TransitionType.ASSIGN;
632      case UNASSIGN:
633        return TransitionType.UNASSIGN;
634      case MOVE:
635        return TransitionType.MOVE;
636      case REOPEN:
637        return TransitionType.REOPEN;
638      default:
639        throw new IllegalArgumentException("Unknown RegionTransitionType: " + type);
640    }
641  }
642
643  private static RegionTransitionType convert(TransitionType type) {
644    switch (type) {
645      case ASSIGN:
646        return RegionTransitionType.ASSIGN;
647      case UNASSIGN:
648        return RegionTransitionType.UNASSIGN;
649      case MOVE:
650        return RegionTransitionType.MOVE;
651      case REOPEN:
652        return RegionTransitionType.REOPEN;
653      default:
654        throw new IllegalArgumentException("Unknown TransitionType: " + type);
655    }
656  }
657
658  @Override
659  protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {
660    super.serializeStateData(serializer);
661    RegionStateTransitionStateData.Builder builder =
662      RegionStateTransitionStateData.newBuilder().setType(convert(type))
663        .setForceNewPlan(forceNewPlan).setEvictCache(evictCache).setIsSplit(isSplit);
664    if (assignCandidate != null) {
665      builder.setAssignCandidate(ProtobufUtil.toServerName(assignCandidate));
666    }
667    serializer.serialize(builder.build());
668  }
669
670  @Override
671  protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
672    super.deserializeStateData(serializer);
673    RegionStateTransitionStateData data =
674      serializer.deserialize(RegionStateTransitionStateData.class);
675    type = convert(data.getType());
676    setInitialAndLastState();
677    forceNewPlan = data.getForceNewPlan();
678    if (data.hasAssignCandidate()) {
679      assignCandidate = ProtobufUtil.toServerName(data.getAssignCandidate());
680    }
681    evictCache = data.getEvictCache();
682    isSplit = data.getIsSplit();
683  }
684
685  @Override
686  protected ProcedureMetrics getProcedureMetrics(MasterProcedureEnv env) {
687    MetricsAssignmentManager metrics = env.getAssignmentManager().getAssignmentManagerMetrics();
688    switch (type) {
689      case ASSIGN:
690        return metrics.getAssignProcMetrics();
691      case UNASSIGN:
692        return metrics.getUnassignProcMetrics();
693      case MOVE:
694        return metrics.getMoveProcMetrics();
695      case REOPEN:
696        return metrics.getReopenProcMetrics();
697      default:
698        throw new IllegalArgumentException("Unknown transition type: " + type);
699    }
700  }
701
702  @Override
703  public void toStringClassDetails(StringBuilder sb) {
704    super.toStringClassDetails(sb);
705    if (initialState == RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE) {
706      sb.append(", ASSIGN");
707    } else if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED) {
708      sb.append(", UNASSIGN");
709    } else {
710      sb.append(", REOPEN/MOVE");
711    }
712  }
713
714  private static TransitRegionStateProcedure setOwner(MasterProcedureEnv env,
715    TransitRegionStateProcedure proc) {
716    proc.setOwner(env.getRequestUser().getShortName());
717    return proc;
718  }
719
  /** The kinds of region transition a {@link TransitRegionStateProcedure} can perform. */
  public enum TransitionType {
    // starts at GET_ASSIGN_CANDIDATE, last state is CONFIRM_OPENED
    ASSIGN,
    // starts at CLOSE, last state is CONFIRM_CLOSED
    UNASSIGN,
    // starts at CLOSE, last state is CONFIRM_OPENED
    MOVE,
    // same states as MOVE; the constructor uses the region's current location as the candidate
    REOPEN
  }
726
  // Be careful that, when you call the factory methods below, you need to manually attach the
  // returned procedure with the RegionStateNode, otherwise the procedure will quit immediately
  // without doing anything. See the comment in executeFromState to find out why we need this
  // assumption.
730  public static TransitRegionStateProcedure assign(MasterProcedureEnv env, RegionInfo region,
731    @Nullable ServerName targetServer) {
732    return assign(env, region, false, targetServer);
733  }
734
735  public static TransitRegionStateProcedure assign(MasterProcedureEnv env, RegionInfo region,
736    boolean forceNewPlan, @Nullable ServerName targetServer) {
737    return setOwner(env, new TransitRegionStateProcedure(env, region, targetServer, forceNewPlan,
738      TransitionType.ASSIGN));
739  }
740
741  public static TransitRegionStateProcedure unassign(MasterProcedureEnv env, RegionInfo region) {
742    return setOwner(env,
743      new TransitRegionStateProcedure(env, region, null, false, TransitionType.UNASSIGN));
744  }
745
746  public static TransitRegionStateProcedure unassignSplitMerge(MasterProcedureEnv env,
747    RegionInfo region) {
748    return setOwner(env,
749      new TransitRegionStateProcedure(env, region, null, false, TransitionType.UNASSIGN, true));
750  }
751
752  public static TransitRegionStateProcedure reopen(MasterProcedureEnv env, RegionInfo region) {
753    return setOwner(env,
754      new TransitRegionStateProcedure(env, region, null, false, TransitionType.REOPEN));
755  }
756
757  public static TransitRegionStateProcedure move(MasterProcedureEnv env, RegionInfo region,
758    @Nullable ServerName targetServer) {
759    return setOwner(env, new TransitRegionStateProcedure(env, region, targetServer,
760      targetServer == null, TransitionType.MOVE));
761  }
762}