001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.assignment;
019
020import static org.apache.hadoop.hbase.io.hfile.CacheConfig.DEFAULT_EVICT_ON_CLOSE;
021import static org.apache.hadoop.hbase.io.hfile.CacheConfig.DEFAULT_EVICT_ON_SPLIT;
022import static org.apache.hadoop.hbase.io.hfile.CacheConfig.EVICT_BLOCKS_ON_CLOSE_KEY;
023import static org.apache.hadoop.hbase.io.hfile.CacheConfig.EVICT_BLOCKS_ON_SPLIT_KEY;
024import static org.apache.hadoop.hbase.master.LoadBalancer.BOGUS_SERVER_NAME;
025import static org.apache.hadoop.hbase.master.assignment.AssignmentManager.FORCE_REGION_RETAINMENT;
026
027import edu.umd.cs.findbugs.annotations.Nullable;
028import java.io.IOException;
029import java.util.concurrent.CompletableFuture;
030import java.util.concurrent.TimeUnit;
031import org.apache.hadoop.hbase.HBaseIOException;
032import org.apache.hadoop.hbase.ServerName;
033import org.apache.hadoop.hbase.TableName;
034import org.apache.hadoop.hbase.client.RegionInfo;
035import org.apache.hadoop.hbase.client.RegionReplicaUtil;
036import org.apache.hadoop.hbase.client.RetriesExhaustedException;
037import org.apache.hadoop.hbase.master.MetricsAssignmentManager;
038import org.apache.hadoop.hbase.master.RegionState.State;
039import org.apache.hadoop.hbase.master.ServerManager;
040import org.apache.hadoop.hbase.master.procedure.AbstractStateMachineRegionProcedure;
041import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
042import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
043import org.apache.hadoop.hbase.procedure2.ProcedureFutureUtil;
044import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
045import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
046import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
047import org.apache.hadoop.hbase.procedure2.ProcedureUtil;
048import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
049import org.apache.hadoop.hbase.util.FutureUtils;
050import org.apache.hadoop.hbase.util.RetryCounter;
051import org.apache.yetus.audience.InterfaceAudience;
052import org.slf4j.Logger;
053import org.slf4j.LoggerFactory;
054
055import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
056import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionStateTransitionState;
057import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionStateTransitionStateData;
058import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionType;
059import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;
060import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
061
062/**
063 * The procedure to deal with the state transition of a region. A region with a TRSP in place is
064 * called RIT, i.e, RegionInTransition.
065 * <p/>
066 * It can be used to assign/unassign/reopen/move a region, and for
067 * {@link #unassign(MasterProcedureEnv, RegionInfo)} and
068 * {@link #reopen(MasterProcedureEnv, RegionInfo)}, you do not need to specify a target server, and
069 * for {@link #assign(MasterProcedureEnv, RegionInfo, ServerName)} and
070 * {@link #move(MasterProcedureEnv, RegionInfo, ServerName)}, if you want to you can provide a
071 * target server. And for {@link #move(MasterProcedureEnv, RegionInfo, ServerName)}, if you do not
072 * specify a targetServer, we will select one randomly.
073 * <p/>
074 * <p/>
075 * The typical state transition for assigning a region is:
076 *
077 * <pre>
078 * GET_ASSIGN_CANDIDATE ------> OPEN -----> CONFIRM_OPENED
079 * </pre>
080 *
081 * Notice that, if there are failures we may go back to the {@code GET_ASSIGN_CANDIDATE} state to
082 * try again.
083 * <p/>
084 * The typical state transition for unassigning a region is:
085 *
086 * <pre>
087 * CLOSE -----> CONFIRM_CLOSED
088 * </pre>
089 *
090 * Here things go a bit different, if there are failures, especially that if there is a server
091 * crash, we will go to the {@code GET_ASSIGN_CANDIDATE} state to bring the region online first, and
092 * then go through the normal way to unassign it.
093 * <p/>
094 * The typical state transition for reopening/moving a region is:
095 *
096 * <pre>
097 * CLOSE -----> CONFIRM_CLOSED -----> GET_ASSIGN_CANDIDATE ------> OPEN -----> CONFIRM_OPENED
098 * </pre>
099 *
100 * The retry logic is the same with the above assign/unassign.
101 * <p/>
 * Notice that, although we allow specifying a target server, it just acts as a candidate, we do
 * not guarantee that the region will finally be on the target server. If this is important for
 * you, you should check whether the region is on the target server after the procedure is
 * finished.
 * <p/>
 * Alternatively, for trying to retain assignments, the
 * <b>hbase.master.scp.retain.assignment.force</b> option can be used together with
 * <b>hbase.master.scp.retain.assignment</b>.
109 * <p/>
 * When you want to schedule a TRSP, please check whether there is still one for this region, and
 * the check should be under the RegionStateNode lock. We will remove the TRSP from a
 * RegionStateNode when we are done, see the code in {@code reportTransition} method below. There
 * could be at most one TRSP for a given region.
114 */
115@InterfaceAudience.Private
116public class TransitRegionStateProcedure
117  extends AbstractStateMachineRegionProcedure<RegionStateTransitionState> {
118
  private static final Logger LOG = LoggerFactory.getLogger(TransitRegionStateProcedure.class);

  // Kind of transition this procedure performs; written to and read from the serialized state.
  private TransitionType type;

  // First state of the state machine for the current type, derived in setInitialAndLastState().
  private RegionStateTransitionState initialState;

  // State after which the procedure is allowed to finish, derived in setInitialAndLastState().
  private RegionStateTransitionState lastState;

  // the candidate where we want to assign the region to.
  private ServerName assignCandidate;

  // When true, queueAssign clears the region location instead of reusing a candidate/last host.
  private boolean forceNewPlan;

  // Backoff counter for failed transitions. Created lazily in executeFromState when an
  // IOException is caught and reset to null by confirmOpened/confirmClosed. Not serialized.
  private RetryCounter retryCounter;

  // The remote sub procedure currently attached through attachRemoteProc, or null.
  private RegionRemoteProcedureBase remoteProc;

  // Value of EVICT_BLOCKS_ON_CLOSE_KEY read from the master configuration; serialized.
  private boolean evictCache;

  // True when the procedure was created through unassignSplitMerge; handed to the
  // CloseRegionProcedure and serialized.
  private boolean isSplit;

  // Retry counter used while waiting for the previous host when force region retainment is
  // enabled. Only created by initForceRetainmentRetryCounter; not part of the serialized state.
  private RetryCounter forceRetainmentRetryCounter;

  // Accumulated backoff, in milliseconds, spent waiting for the previous host. Only logged.
  private long forceRetainmentTotalWait;

  // Pending asynchronous operation tracked through ProcedureFutureUtil. While it is non-null,
  // afterExec does not release the RegionStateNode lock.
  private CompletableFuture<Void> future;
145
  /**
   * No-arg constructor. Leaves every field at its default value; the persisted ones are filled in
   * by {@code deserializeStateData}. NOTE(review): presumably required by the procedure framework
   * to re-instantiate the procedure when loading it from the procedure store -- confirm.
   */
  public TransitRegionStateProcedure() {
  }
148
149  private void setInitialAndLastState() {
150    switch (type) {
151      case ASSIGN:
152        initialState = RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE;
153        lastState = RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED;
154        break;
155      case UNASSIGN:
156        initialState = RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE;
157        lastState = RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED;
158        break;
159      case MOVE:
160      case REOPEN:
161        initialState = RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE;
162        lastState = RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED;
163        break;
164      default:
165        throw new IllegalArgumentException("Unknown TransitionType: " + type);
166    }
167  }
168
  /**
   * Creates a procedure for the given region and transition type.
   * <p/>
   * Besides storing the arguments this derives the initial/last state from {@code type}, reads the
   * evict-on-close flag from the master configuration and, when force region retainment is
   * enabled, sets up the retry counter used while waiting for the previous host.
   * @param env             master procedure environment
   * @param hri             the region to transit
   * @param assignCandidate preferred target server, may be null; replaced by the region's current
   *                        location for {@link TransitionType#REOPEN}
   * @param forceNewPlan    whether the region location should be cleared before assigning
   * @param type            the kind of transition to perform
   */
  protected TransitRegionStateProcedure(MasterProcedureEnv env, RegionInfo hri,
    ServerName assignCandidate, boolean forceNewPlan, TransitionType type) {
    super(env, hri);
    this.assignCandidate = assignCandidate;
    this.forceNewPlan = forceNewPlan;
    this.type = type;
    setInitialAndLastState();

    // when do reopen TRSP, let the rs know the targetServer so it can keep some info on close
    if (type == TransitionType.REOPEN) {
      this.assignCandidate = getRegionStateNode(env).getRegionLocation();
    }
    evictCache =
      env.getMasterConfiguration().getBoolean(EVICT_BLOCKS_ON_CLOSE_KEY, DEFAULT_EVICT_ON_CLOSE);
    initForceRetainmentRetryCounter(env);
  }
185
186  private void initForceRetainmentRetryCounter(MasterProcedureEnv env) {
187    if (env.getAssignmentManager().isForceRegionRetainment()) {
188      forceRetainmentRetryCounter =
189        new RetryCounter(env.getAssignmentManager().getForceRegionRetainmentRetries(),
190          env.getAssignmentManager().getForceRegionRetainmentWaitInterval(), TimeUnit.MILLISECONDS);
191      forceRetainmentTotalWait = 0;
192    }
193  }
194
  /**
   * Same as the five argument constructor, additionally recording whether the transition is done
   * on behalf of a split/merge.
   * @param isSplit value stored in {@code isSplit} and later handed to the CloseRegionProcedure
   */
  protected TransitRegionStateProcedure(MasterProcedureEnv env, RegionInfo hri,
    ServerName assignCandidate, boolean forceNewPlan, TransitionType type, boolean isSplit) {
    this(env, hri, assignCandidate, forceNewPlan, type);
    this.isSplit = isSplit;
  }
200
  /**
   * Always reports {@code REGION_EDIT}, whatever the transition type is.
   */
  @Override
  public TableOperationType getTableOperationType() {
    // TODO: maybe we should make another type here, REGION_TRANSITION?
    return TableOperationType.REGION_EDIT;
  }
206
207  @Override
208  protected boolean waitInitialized(MasterProcedureEnv env) {
209    if (TableName.isMetaTableName(getTableName())) {
210      return false;
211    }
212    // First we need meta to be loaded, and second, if meta is not online then we will likely to
213    // fail when updating meta so we wait until it is assigned.
214    AssignmentManager am = env.getAssignmentManager();
215    return am.waitMetaLoaded(this) || am.waitMetaAssigned(this, getRegion());
216  }
217
218  private void checkAndWaitForOriginalServer(MasterProcedureEnv env, ServerName lastHost)
219    throws ProcedureSuspendedException {
220    ServerManager serverManager = env.getMasterServices().getServerManager();
221    ServerName newNameForServer = serverManager.findServerWithSameHostnamePortWithLock(lastHost);
222    boolean isOnline = serverManager.createDestinationServersList().contains(newNameForServer);
223
224    if (!isOnline && forceRetainmentRetryCounter.shouldRetry()) {
225      int backoff =
226        Math.toIntExact(forceRetainmentRetryCounter.getBackoffTimeAndIncrementAttempts());
227      forceRetainmentTotalWait += backoff;
228      LOG.info(
229        "Suspending the TRSP PID={} for {}ms because {} is true and previous host {} "
230          + "for region is not yet online.",
231        this.getProcId(), backoff, FORCE_REGION_RETAINMENT, lastHost);
232      setTimeout(backoff);
233      setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
234      throw new ProcedureSuspendedException();
235    }
236    LOG.info(
237      "{} is true. TRSP PID={} waited {}ms for host {} to come back online. "
238        + "Did host come back online? {}",
239      FORCE_REGION_RETAINMENT, this.getProcId(), forceRetainmentTotalWait, lastHost, isOnline);
240    initForceRetainmentRetryCounter(env);
241  }
242
  /**
   * Handles the GET_ASSIGN_CANDIDATE state: decides which location (if any) to propose for the
   * region, queues the region for assignment and moves on to the OPEN state.
   * @param env        master procedure environment
   * @param regionNode state node of the region being assigned
   * @throws ProcedureSuspendedException if we have to wait for the previous host (force
   *                                     retainment) or the region node's procedure event is not
   *                                     ready yet
   */
  private void queueAssign(MasterProcedureEnv env, RegionStateNode regionNode)
    throws ProcedureSuspendedException {
    // only used for logging below
    boolean retain = false;
    if (forceNewPlan) {
      // set the region location to null if forceNewPlan is true
      regionNode.setRegionLocation(null);
    } else {
      if (assignCandidate != null) {
        // an explicit candidate wins; it counts as "retain" only if it is also the last host
        retain = assignCandidate.equals(regionNode.getLastHost());
        regionNode.setRegionLocation(assignCandidate);
      } else if (regionNode.getLastHost() != null) {
        // no candidate given, fall back to the server that hosted the region last
        retain = true;
        LOG.info("Setting lastHost {} as the location for region {}", regionNode.getLastHost(),
          regionNode.getRegionInfo().getEncodedName());
        regionNode.setRegionLocation(regionNode.getLastHost());
      }
      if (
        regionNode.getRegionLocation() != null
          && env.getAssignmentManager().isForceRegionRetainment()
      ) {
        LOG.warn("{} is set to true. This may delay regions re-assignment "
          + "upon RegionServers crashes or restarts.", FORCE_REGION_RETAINMENT);
        // may throw ProcedureSuspendedException; we then come back to this state after the timeout
        checkAndWaitForOriginalServer(env, regionNode.getRegionLocation());
      }
    }
    LOG.info("Starting {}; {}; forceNewPlan={}, retain={}", this, regionNode.toShortString(),
      forceNewPlan, retain);
    env.getAssignmentManager().queueAssign(regionNode);
    // the next state is set before the possible suspension so that we resume in OPEN
    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_OPEN);
    if (regionNode.getProcedureEvent().suspendIfNotReady(this)) {
      throw new ProcedureSuspendedException();
    }
  }
276
  /** Returns the pending future, or null if there is none; passed to ProcedureFutureUtil. */
  private CompletableFuture<Void> getFuture() {
    return future;
  }
280
  /** Stores (or clears, when given null) the pending future; passed to ProcedureFutureUtil. */
  private void setFuture(CompletableFuture<Void> f) {
    future = f;
  }
284
  /**
   * Second half of the OPEN state: schedules an OpenRegionProcedure child for the given server and
   * moves on to CONFIRM_OPENED.
   * @param loc the server on which the region should be opened
   */
  private void openRegionAfterUpdatingMeta(ServerName loc) {
    addChildProcedure(new OpenRegionProcedure(this, getRegion(), loc));
    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED);
  }
289
  /**
   * Handles the OPEN state. Marks the region as opening through the AssignmentManager and, once
   * that operation is done, schedules the remote open via {@code openRegionAfterUpdatingMeta}.
   * @param env        master procedure environment
   * @param regionNode state node of the region being opened
   * @throws IOException                 if the region has no usable location; the next state is
   *                                     set back to GET_ASSIGN_CANDIDATE before throwing
   * @throws ProcedureSuspendedException if we need to wait for the regionOpening operation
   */
  private void openRegion(MasterProcedureEnv env, RegionStateNode regionNode)
    throws IOException, ProcedureSuspendedException {
    ServerName loc = regionNode.getRegionLocation();
    // NOTE(review): checkFuture presumably returns true when a future registered by an earlier
    // suspendIfNecessary call was pending and has now been dealt with -- confirm against
    // ProcedureFutureUtil. In that case there is nothing more to do in this round.
    if (
      ProcedureFutureUtil.checkFuture(this, this::getFuture, this::setFuture,
        () -> openRegionAfterUpdatingMeta(loc))
    ) {
      return;
    }
    if (loc == null || BOGUS_SERVER_NAME.equals(loc)) {
      LOG.warn("No location specified for {}, jump back to state {} to get one", getRegion(),
        RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
      // the IOException is turned into a backoff-and-retry by executeFromState
      throw new HBaseIOException("Failed to open region, the location is null or bogus.");
    }
    ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture,
      env.getAssignmentManager().regionOpening(regionNode), env,
      () -> openRegionAfterUpdatingMeta(loc));
  }
309
310  private void regionFailedOpenAfterUpdatingMeta(MasterProcedureEnv env,
311    RegionStateNode regionNode) {
312    setFailure(getClass().getSimpleName(), new RetriesExhaustedException(
313      "Max attempts " + env.getAssignmentManager().getAssignMaxAttempts() + " exceeded"));
314    regionNode.unsetProcedure(this);
315  }
316
317  private Flow confirmOpened(MasterProcedureEnv env, RegionStateNode regionNode)
318    throws IOException, ProcedureSuspendedException {
319    if (
320      ProcedureFutureUtil.checkFuture(this, this::getFuture, this::setFuture,
321        () -> regionFailedOpenAfterUpdatingMeta(env, regionNode))
322    ) {
323      return Flow.NO_MORE_STATE;
324    }
325    if (regionNode.isInState(State.OPEN)) {
326      retryCounter = null;
327      if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED) {
328        // we are the last state, finish
329        regionNode.unsetProcedure(this);
330        ServerCrashProcedure.updateProgress(env, getParentProcId());
331        return Flow.NO_MORE_STATE;
332      }
333      // It is possible that we arrive here but confirm opened is not the last state, for example,
334      // when merging or splitting a region, we unassign the region from a RS and the RS is crashed,
335      // then there will be recovered edits for this region, we'd better make the region online
336      // again and then unassign it, otherwise we have to fail the merge/split procedure as we may
337      // loss data.
338      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE);
339      return Flow.HAS_MORE_STATE;
340    }
341
342    int retries = env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode)
343      .incrementAndGetRetries();
344    int maxAttempts = env.getAssignmentManager().getAssignMaxAttempts();
345    LOG.info("Retry={} of max={}; {}; {}", retries, maxAttempts, this, regionNode.toShortString());
346
347    if (retries >= maxAttempts) {
348      ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture,
349        env.getAssignmentManager().regionFailedOpen(regionNode, true), env,
350        () -> regionFailedOpenAfterUpdatingMeta(env, regionNode));
351      return Flow.NO_MORE_STATE;
352    }
353
354    // if not giving up, we will not update meta, so the returned CompletableFuture should be a fake
355    // one, which should have been completed already
356    CompletableFuture<Void> future = env.getAssignmentManager().regionFailedOpen(regionNode, false);
357    assert future.isDone();
358    // we failed to assign the region, force a new plan
359    forceNewPlan = true;
360    regionNode.setRegionLocation(null);
361    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
362
363    if (retries > env.getAssignmentManager().getAssignRetryImmediatelyMaxAttempts()) {
364      // Throw exception to backoff and retry when failed open too many times
365      throw new HBaseIOException(
366        "Failed confirm OPEN of " + regionNode + " (remote log may yield more detail on why).");
367    } else {
368      // Here we do not throw exception because we want to the region to be online ASAP
369      return Flow.HAS_MORE_STATE;
370    }
371  }
372
373  private void closeRegionAfterUpdatingMeta(MasterProcedureEnv env, RegionStateNode regionNode) {
374    LOG.debug("Close region: isSplit: {}: evictOnSplit: {}: evictOnClose: {}", isSplit,
375      env.getMasterConfiguration().getBoolean(EVICT_BLOCKS_ON_SPLIT_KEY, DEFAULT_EVICT_ON_SPLIT),
376      evictCache);
377    // Splits/Merges are special cases, rather than deciding on the cache eviction behaviour here at
378    // Master, we just need to tell this close is for a split/merge and let RSes decide on the
379    // eviction. See HBASE-28811 for more context.
380    CloseRegionProcedure closeProc = new CloseRegionProcedure(this, getRegion(),
381      regionNode.getRegionLocation(), assignCandidate, isSplit);
382    addChildProcedure(closeProc);
383    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED);
384  }
385
386  private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode)
387    throws IOException, ProcedureSuspendedException {
388    if (
389      ProcedureFutureUtil.checkFuture(this, this::getFuture, this::setFuture,
390        () -> closeRegionAfterUpdatingMeta(env, regionNode))
391    ) {
392      return;
393    }
394    if (regionNode.isInState(State.OPEN, State.CLOSING, State.MERGING, State.SPLITTING)) {
395      // this is the normal case
396      ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture,
397        env.getAssignmentManager().regionClosing(regionNode), env,
398        () -> closeRegionAfterUpdatingMeta(env, regionNode));
399    } else {
400      forceNewPlan = true;
401      regionNode.setRegionLocation(null);
402      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
403    }
404  }
405
  /**
   * Handles the CONFIRM_CLOSED state.
   * <ul>
   * <li>CLOSED: finish if closing was the goal, otherwise continue with GET_ASSIGN_CANDIDATE.</li>
   * <li>CLOSING: go back to CLOSE and throw so that the caller retries with backoff.</li>
   * <li>anything else (asserted to be ABNORMALLY_CLOSED): reopen the region first, except for a
   * non-default replica whose goal was to close, which finishes right away.</li>
   * </ul>
   * @param env        master procedure environment (not used by this method)
   * @param regionNode state node of the region being confirmed
   * @return whether the state machine has more states to execute
   * @throws IOException if the region is still in CLOSING state
   */
  private Flow confirmClosed(MasterProcedureEnv env, RegionStateNode regionNode)
    throws IOException {
    if (regionNode.isInState(State.CLOSED)) {
      // success, so the next failure starts with a fresh backoff
      retryCounter = null;
      if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED) {
        // we are the last state, finish
        regionNode.unsetProcedure(this);
        return Flow.NO_MORE_STATE;
      }
      // This means we need to open the region again, should be a move or reopen
      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
      return Flow.HAS_MORE_STATE;
    }
    if (regionNode.isInState(State.CLOSING)) {
      // This is possible, think the target RS crashes and restarts immediately, the close region
      // operation will return a NotServingRegionException soon, we can only recover after SCP takes
      // care of this RS. So here we throw an IOException to let upper layer to retry with backoff.
      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE);
      throw new HBaseIOException("Failed to close region");
    }
    // abnormally closed, need to reopen it, no matter what is the last state, see the comment in
    // confirmOpened for more details that why we need to reopen the region first even if we just
    // want to close it.
    // The only exception is for non-default replica, where we do not need to deal with recovered
    // edits. Notice that the region will remain in ABNORMALLY_CLOSED state, the upper layer need to
    // deal with this state. For non-default replica, this is usually the same with CLOSED.
    assert regionNode.isInState(State.ABNORMALLY_CLOSED);
    if (
      !RegionReplicaUtil.isDefaultReplica(getRegion())
        && lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED
    ) {
      regionNode.unsetProcedure(this);
      return Flow.NO_MORE_STATE;
    }
    retryCounter = null;
    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
    return Flow.HAS_MORE_STATE;
  }
444
445  @Override
446  protected void beforeExec(MasterProcedureEnv env) throws ProcedureSuspendedException {
447    RegionStateNode regionNode =
448      env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion());
449    if (!regionNode.isLockedBy(this)) {
450      // The wake up action will be called under the lock inside RegionStateNode for implementing
451      // RegionStateNodeLock, so if we call ProcedureUtil.wakeUp where we will acquire the procedure
452      // execution lock directly, it may cause dead lock since in normal case procedure execution
453      // case, we will acquire the procedure execution lock first and then acquire the lock inside
454      // RegionStateNodeLock. This is the reason why we need to schedule the task to a thread pool
455      // and execute asynchronously.
456      regionNode.lock(this,
457        () -> env.getAsyncTaskExecutor().execute(() -> ProcedureFutureUtil.wakeUp(this, env)));
458    }
459  }
460
461  @Override
462  protected void afterExec(MasterProcedureEnv env) {
463    // only release the lock if there is no pending updating meta operation
464    if (future == null) {
465      RegionStateNode regionNode =
466        env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion());
467      // in beforeExec, we may throw ProcedureSuspendedException which means we do not get the lock,
468      // in this case we should not call unlock
469      if (regionNode.isLockedBy(this)) {
470        regionNode.unlock(this);
471      }
472    }
473  }
474
  /**
   * Looks up the RegionStateNode of this procedure's region in the AssignmentManager's region
   * states, creating it if it does not exist yet.
   */
  private RegionStateNode getRegionStateNode(MasterProcedureEnv env) {
    return env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion());
  }
478
  /**
   * Dispatches to the handler of the given state. Any {@link IOException} raised by a handler is
   * turned into a suspension with backoff, so the state selected by the handler (through
   * {@code setNextState}) is retried later.
   * @param env   master procedure environment
   * @param state the state to execute
   * @return whether the state machine has more states to execute
   * @throws ProcedureSuspendedException   if a handler suspends the procedure or a transition
   *                                       failed and we back off
   * @throws UnsupportedOperationException for a state this procedure does not handle
   */
  @Override
  protected Flow executeFromState(MasterProcedureEnv env, RegionStateTransitionState state)
    throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
    RegionStateNode regionNode = getRegionStateNode(env);
    try {
      switch (state) {
        case REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE:
          // Need to do some sanity check for replica region, if the region does not exist at
          // master, do not try to assign the replica region, log error and return.
          if (!RegionReplicaUtil.isDefaultReplica(regionNode.getRegionInfo())) {
            RegionInfo defaultRI =
              RegionReplicaUtil.getRegionInfoForDefaultReplica(regionNode.getRegionInfo());
            if (
              env.getMasterServices().getAssignmentManager().getRegionStates()
                .getRegionStateNode(defaultRI) == null
            ) {
              LOG.error(
                "Cannot assign replica region {} because its primary region {} does not exist.",
                regionNode.getRegionInfo(), defaultRI);
              regionNode.unsetProcedure(this);
              return Flow.NO_MORE_STATE;
            }
          }
          queueAssign(env, regionNode);
          return Flow.HAS_MORE_STATE;
        case REGION_STATE_TRANSITION_OPEN:
          openRegion(env, regionNode);
          return Flow.HAS_MORE_STATE;
        case REGION_STATE_TRANSITION_CONFIRM_OPENED:
          return confirmOpened(env, regionNode);
        case REGION_STATE_TRANSITION_CLOSE:
          closeRegion(env, regionNode);
          return Flow.HAS_MORE_STATE;
        case REGION_STATE_TRANSITION_CONFIRM_CLOSED:
          return confirmClosed(env, regionNode);
        default:
          throw new UnsupportedOperationException("unhandled state=" + state);
      }
    } catch (IOException e) {
      // The counter is created on the first failure and cleared again by confirmOpened /
      // confirmClosed, so the backoff keeps growing across consecutive failures.
      if (retryCounter == null) {
        retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
      }
      long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
      LOG.warn(
        "Failed transition, suspend {}secs {}; {}; waiting on rectified condition fixed "
          + "by other Procedure or operator intervention",
        backoff / 1000, this, regionNode.toShortString(), e);
      throw suspend(Math.toIntExact(backoff), true);
    }
  }
529
  /**
   * At end of timeout, wake ourselves up so we run again: the procedure is set back to RUNNABLE
   * and put at the front of the procedure scheduler.
   * @return always false, meaning that this procedure handled the timeout itself
   */
  @Override
  protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
    setState(ProcedureProtos.ProcedureState.RUNNABLE);
    env.getProcedureScheduler().addFront(this);
    return false; // 'false' means that this procedure handled the timeout
  }
539
  /**
   * Forwards a region state transition report to the attached remote procedure. Reports are
   * ignored (with a warning) when no remote procedure is attached or when the reported procedure
   * id does not match the attached one.
   * @param procId id of the remote procedure the report is for; negative values are not compared
   */
  // Should be called with RegionStateNode locked
  public void reportTransition(MasterProcedureEnv env, RegionStateNode regionNode,
    ServerName serverName, TransitionCode code, long seqId, long procId) throws IOException {
    if (remoteProc == null) {
      LOG.warn(
        "There is no outstanding remote region procedure for {}, serverName={}, code={},"
          + " seqId={}, proc={}, should be a retry, ignore",
        regionNode, serverName, code, seqId, this);
      return;
    }
    // The procId could be -1 if it is from an old region server, we need to deal with it so that we
    // can do rolling upgrading.
    if (procId >= 0 && remoteProc.getProcId() != procId) {
      LOG.warn(
        "The pid of remote region procedure for {} is {}, the reported pid={}, serverName={},"
          + " code={}, seqId={}, proc={}, should be a retry, ignore",
        regionNode, remoteProc.getProcId(), procId, serverName, code, seqId, this);
      return;
    }
    remoteProc.reportTransition(env, regionNode, serverName, code, seqId);
  }
561
562  // Should be called with RegionStateNode locked
563  public CompletableFuture<Void> serverCrashed(MasterProcedureEnv env, RegionStateNode regionNode,
564    ServerName serverName, boolean forceNewPlan) {
565    this.forceNewPlan = forceNewPlan;
566    if (remoteProc != null) {
567      // this means we are waiting for the sub procedure, so wake it up
568      try {
569        remoteProc.serverCrashed(env, regionNode, serverName);
570      } catch (Exception e) {
571        return FutureUtils.failedFuture(e);
572      }
573      return CompletableFuture.completedFuture(null);
574    } else {
575      if (regionNode.isInState(State.ABNORMALLY_CLOSED)) {
576        // should be a retry, where we have already changed the region state to abnormally closed
577        return CompletableFuture.completedFuture(null);
578      } else {
579        // we are in RUNNING state, just update the region state, and we will process it later.
580        return env.getAssignmentManager().regionClosedAbnormally(regionNode);
581      }
582    }
583  }
584
  /**
   * Registers the remote procedure that reportTransition, serverCrashed and stateLoaded delegate
   * to. Any previously attached procedure is simply replaced.
   */
  void attachRemoteProc(RegionRemoteProcedureBase proc) {
    this.remoteProc = proc;
  }
588
  /**
   * Detaches the remote procedure again.
   * @param proc expected to be the currently attached procedure (only checked by an assert)
   */
  void unattachRemoteProc(RegionRemoteProcedureBase proc) {
    assert this.remoteProc == proc;
    this.remoteProc = null;
  }
593
  // will be called after we finish loading the meta entry for this region.
  // used to change the state of the region node if we have a sub procedure, as we may not persist
  // the state to meta yet. See the code in RegionRemoteProcedureBase.execute for more details.
  // Does nothing when no remote procedure is attached.
  void stateLoaded(AssignmentManager am, RegionStateNode regionNode) {
    if (remoteProc != null) {
      remoteProc.stateLoaded(am, regionNode);
    }
  }
602
  /**
   * Rollback is not supported by this procedure.
   * @throws UnsupportedOperationException always
   */
  @Override
  protected void rollbackState(MasterProcedureEnv env, RegionStateTransitionState state)
    throws IOException, InterruptedException {
    // no rollback
    throw new UnsupportedOperationException();
  }
609
  /** Maps a numeric state id to its state via {@code RegionStateTransitionState.forNumber}. */
  @Override
  protected RegionStateTransitionState getState(int stateId) {
    return RegionStateTransitionState.forNumber(stateId);
  }
614
  /** Returns the number of the given state; the inverse of {@code getState(int)}. */
  @Override
  protected int getStateId(RegionStateTransitionState state) {
    return state.getNumber();
  }
619
  /** Returns the initial state that setInitialAndLastState derived from the transition type. */
  @Override
  protected RegionStateTransitionState getInitialState() {
    return initialState;
  }
624
625  private static TransitionType convert(RegionTransitionType type) {
626    switch (type) {
627      case ASSIGN:
628        return TransitionType.ASSIGN;
629      case UNASSIGN:
630        return TransitionType.UNASSIGN;
631      case MOVE:
632        return TransitionType.MOVE;
633      case REOPEN:
634        return TransitionType.REOPEN;
635      default:
636        throw new IllegalArgumentException("Unknown RegionTransitionType: " + type);
637    }
638  }
639
640  private static RegionTransitionType convert(TransitionType type) {
641    switch (type) {
642      case ASSIGN:
643        return RegionTransitionType.ASSIGN;
644      case UNASSIGN:
645        return RegionTransitionType.UNASSIGN;
646      case MOVE:
647        return RegionTransitionType.MOVE;
648      case REOPEN:
649        return RegionTransitionType.REOPEN;
650      default:
651        throw new IllegalArgumentException("Unknown TransitionType: " + type);
652    }
653  }
654
655  @Override
656  protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {
657    super.serializeStateData(serializer);
658    RegionStateTransitionStateData.Builder builder =
659      RegionStateTransitionStateData.newBuilder().setType(convert(type))
660        .setForceNewPlan(forceNewPlan).setEvictCache(evictCache).setIsSplit(isSplit);
661    if (assignCandidate != null) {
662      builder.setAssignCandidate(ProtobufUtil.toServerName(assignCandidate));
663    }
664    serializer.serialize(builder.build());
665  }
666
667  @Override
668  protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
669    super.deserializeStateData(serializer);
670    RegionStateTransitionStateData data =
671      serializer.deserialize(RegionStateTransitionStateData.class);
672    type = convert(data.getType());
673    setInitialAndLastState();
674    forceNewPlan = data.getForceNewPlan();
675    if (data.hasAssignCandidate()) {
676      assignCandidate = ProtobufUtil.toServerName(data.getAssignCandidate());
677    }
678    evictCache = data.getEvictCache();
679    isSplit = data.getIsSplit();
680  }
681
682  @Override
683  protected ProcedureMetrics getProcedureMetrics(MasterProcedureEnv env) {
684    MetricsAssignmentManager metrics = env.getAssignmentManager().getAssignmentManagerMetrics();
685    switch (type) {
686      case ASSIGN:
687        return metrics.getAssignProcMetrics();
688      case UNASSIGN:
689        return metrics.getUnassignProcMetrics();
690      case MOVE:
691        return metrics.getMoveProcMetrics();
692      case REOPEN:
693        return metrics.getReopenProcMetrics();
694      default:
695        throw new IllegalArgumentException("Unknown transition type: " + type);
696    }
697  }
698
699  @Override
700  public void toStringClassDetails(StringBuilder sb) {
701    super.toStringClassDetails(sb);
702    if (initialState == RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE) {
703      sb.append(", ASSIGN");
704    } else if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED) {
705      sb.append(", UNASSIGN");
706    } else {
707      sb.append(", REOPEN/MOVE");
708    }
709  }
710
  /**
   * Sets the owner of the given procedure to the short name of the request user.
   * @return the same procedure instance, to allow chaining in the factory methods
   */
  private static TransitRegionStateProcedure setOwner(MasterProcedureEnv env,
    TransitRegionStateProcedure proc) {
    proc.setOwner(env.getRequestUser().getShortName());
    return proc;
  }
716
  /**
   * The kinds of transition this procedure can perform. The type determines the initial and the
   * last state of the state machine, see {@code setInitialAndLastState}.
   */
  public enum TransitionType {
    // GET_ASSIGN_CANDIDATE ... CONFIRM_OPENED
    ASSIGN,
    // CLOSE ... CONFIRM_CLOSED
    UNASSIGN,
    // CLOSE ... CONFIRM_OPENED, same states as REOPEN
    MOVE,
    // CLOSE ... CONFIRM_OPENED; the constructor uses the current location as assign candidate
    REOPEN
  }
723
  // Be careful that, when you call these 4 methods below, you need to manually attach the returned
  // procedure with the RegionStateNode, otherwise the procedure will quit immediately without doing
  // anything. See the comment in executeFromState to find out why we need this assumption.
  // NOTE(review): executeFromState in this file does not carry such a comment (any more) -- confirm
  // whether the remark above is still accurate.
  /**
   * Creates a procedure that assigns the region without forcing a new plan.
   * @param targetServer preferred server, or null to not propose one
   */
  public static TransitRegionStateProcedure assign(MasterProcedureEnv env, RegionInfo region,
    @Nullable ServerName targetServer) {
    return assign(env, region, false, targetServer);
  }
731
732  public static TransitRegionStateProcedure assign(MasterProcedureEnv env, RegionInfo region,
733    boolean forceNewPlan, @Nullable ServerName targetServer) {
734    return setOwner(env, new TransitRegionStateProcedure(env, region, targetServer, forceNewPlan,
735      TransitionType.ASSIGN));
736  }
737
738  public static TransitRegionStateProcedure unassign(MasterProcedureEnv env, RegionInfo region) {
739    return setOwner(env,
740      new TransitRegionStateProcedure(env, region, null, false, TransitionType.UNASSIGN));
741  }
742
743  public static TransitRegionStateProcedure unassignSplitMerge(MasterProcedureEnv env,
744    RegionInfo region) {
745    return setOwner(env,
746      new TransitRegionStateProcedure(env, region, null, false, TransitionType.UNASSIGN, true));
747  }
748
749  public static TransitRegionStateProcedure reopen(MasterProcedureEnv env, RegionInfo region) {
750    return setOwner(env,
751      new TransitRegionStateProcedure(env, region, null, false, TransitionType.REOPEN));
752  }
753
754  public static TransitRegionStateProcedure move(MasterProcedureEnv env, RegionInfo region,
755    @Nullable ServerName targetServer) {
756    return setOwner(env, new TransitRegionStateProcedure(env, region, targetServer,
757      targetServer == null, TransitionType.MOVE));
758  }
759}