001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.assignment;
019
020import java.io.IOException;
021import java.util.Optional;
022import org.apache.hadoop.hbase.HConstants;
023import org.apache.hadoop.hbase.ServerName;
024import org.apache.hadoop.hbase.TableName;
025import org.apache.hadoop.hbase.client.RegionInfo;
026import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
027import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
028import org.apache.hadoop.hbase.master.procedure.TableProcedureInterface;
029import org.apache.hadoop.hbase.procedure2.FailedRemoteDispatchException;
030import org.apache.hadoop.hbase.procedure2.Procedure;
031import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
032import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
033import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
034import org.apache.hadoop.hbase.procedure2.ProcedureUtil;
035import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
036import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher;
037import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteProcedure;
038import org.apache.hadoop.hbase.procedure2.RemoteProcedureException;
039import org.apache.hadoop.hbase.util.RetryCounter;
040import org.apache.yetus.audience.InterfaceAudience;
041import org.slf4j.Logger;
042import org.slf4j.LoggerFactory;
043import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
044import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionRemoteProcedureBaseState;
045import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionRemoteProcedureBaseStateData;
046import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;
047import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
048
049/**
050 * The base class for the remote procedures used to open/close a region.
051 * <p/>
052 * Notice that here we do not care about the result of the remote call, if the remote call is
053 * finished, either succeeded or not, we will always finish the procedure. The parent procedure
054 * should take care of the result and try to reschedule if the result is not good.
055 */
056@InterfaceAudience.Private
057public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedureEnv>
058    implements TableProcedureInterface, RemoteProcedure<MasterProcedureEnv, ServerName> {
059
060  private static final Logger LOG = LoggerFactory.getLogger(RegionRemoteProcedureBase.class);
061
062  protected RegionInfo region;
063
064  protected ServerName targetServer;
065
066  private RegionRemoteProcedureBaseState state =
067    RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH;
068
069  private TransitionCode transitionCode;
070
071  private long seqId;
072
073  private RetryCounter retryCounter;
074
075  protected RegionRemoteProcedureBase() {
076  }
077
078  protected RegionRemoteProcedureBase(TransitRegionStateProcedure parent, RegionInfo region,
079      ServerName targetServer) {
080    this.region = region;
081    this.targetServer = targetServer;
082    parent.attachRemoteProc(this);
083  }
084
085  @Override
086  public Optional<RemoteProcedureDispatcher.RemoteOperation> remoteCallBuild(MasterProcedureEnv env,
087      ServerName remote) {
088    // REPORT_SUCCEED means that this remote open/close request already executed in RegionServer.
089    // So return empty operation and RSProcedureDispatcher no need to send it again.
090    if (state == RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_REPORT_SUCCEED) {
091      return Optional.empty();
092    }
093    return Optional.of(newRemoteOperation());
094  }
095
096  protected abstract RemoteProcedureDispatcher.RemoteOperation newRemoteOperation();
097
098  @Override
099  public void remoteOperationCompleted(MasterProcedureEnv env) {
100    // should not be called since we use reportRegionStateTransition to report the result
101    throw new UnsupportedOperationException();
102  }
103
104  @Override
105  public void remoteOperationFailed(MasterProcedureEnv env, RemoteProcedureException error) {
106    // should not be called since we use reportRegionStateTransition to report the result
107    throw new UnsupportedOperationException();
108  }
109
110  private RegionStateNode getRegionNode(MasterProcedureEnv env) {
111    return env.getAssignmentManager().getRegionStates().getRegionStateNode(region);
112  }
113
114  @Override
115  public void remoteCallFailed(MasterProcedureEnv env, ServerName remote, IOException exception) {
116    RegionStateNode regionNode = getRegionNode(env);
117    regionNode.lock();
118    try {
119      if (!env.getMasterServices().getServerManager().isServerOnline(remote)) {
120        // the SCP will interrupt us, give up
121        LOG.debug("{} for region {}, targetServer {} is dead, SCP will interrupt us, give up", this,
122          regionNode, remote);
123        return;
124      }
125      if (state != RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH) {
126        // not sure how can this happen but anyway let's add a check here to avoid waking the wrong
127        // procedure...
128        LOG.warn("{} for region {}, targetServer={} has already been woken up, ignore", this,
129          regionNode, remote);
130        return;
131      }
132      LOG.warn("The remote operation {} for region {} to server {} failed", this, regionNode,
133        remote, exception);
134      // It is OK to not persist the state here, as we do not need to change the region state if the
135      // remote call is failed. If the master crashed before we actually execute the procedure and
136      // persist the new state, it is fine to retry on the same target server again.
137      state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH_FAIL;
138      regionNode.getProcedureEvent().wake(env.getProcedureScheduler());
139    } finally {
140      regionNode.unlock();
141    }
142  }
143
144  @Override
145  public TableName getTableName() {
146    return region.getTable();
147  }
148
149  @Override
150  protected boolean waitInitialized(MasterProcedureEnv env) {
151    if (TableName.isMetaTableName(getTableName())) {
152      return false;
153    }
154    // First we need meta to be loaded, and second, if meta is not online then we will likely to
155    // fail when updating meta so we wait until it is assigned.
156    AssignmentManager am = env.getAssignmentManager();
157    return am.waitMetaLoaded(this) || am.waitMetaAssigned(this, region);
158  }
159
160  @Override
161  protected void rollback(MasterProcedureEnv env) throws IOException, InterruptedException {
162    throw new UnsupportedOperationException();
163  }
164
165  @Override
166  protected boolean abort(MasterProcedureEnv env) {
167    return false;
168  }
169
170  // do some checks to see if the report is valid
171  protected abstract void checkTransition(RegionStateNode regionNode, TransitionCode transitionCode,
172      long seqId) throws UnexpectedStateException;
173
174  // change the in memory state of the regionNode, but do not update meta.
175  protected abstract void updateTransitionWithoutPersistingToMeta(MasterProcedureEnv env,
176      RegionStateNode regionNode, TransitionCode transitionCode, long seqId) throws IOException;
177
178  // A bit strange but the procedure store will throw RuntimeException if we can not persist the
179  // state, so upper layer should take care of this...
180  private void persistAndWake(MasterProcedureEnv env, RegionStateNode regionNode) {
181    env.getMasterServices().getMasterProcedureExecutor().getStore().update(this);
182    regionNode.getProcedureEvent().wake(env.getProcedureScheduler());
183  }
184
185  // should be called with RegionStateNode locked, to avoid race with the execute method below
186  void reportTransition(MasterProcedureEnv env, RegionStateNode regionNode, ServerName serverName,
187      TransitionCode transitionCode, long seqId) throws IOException {
188    if (state != RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH) {
189      // should be a retry
190      return;
191    }
192    if (!targetServer.equals(serverName)) {
193      throw new UnexpectedStateException("Received report from " + serverName + ", expected " +
194        targetServer + ", " + regionNode + ", proc=" + this);
195    }
196    checkTransition(regionNode, transitionCode, seqId);
197    // this state means we have received the report from RS, does not mean the result is fine, as we
198    // may received a FAILED_OPEN.
199    this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_REPORT_SUCCEED;
200    this.transitionCode = transitionCode;
201    this.seqId = seqId;
202    // Persist the transition code and openSeqNum(if provided).
203    // We should not update the hbase:meta directly as this may cause races when master restarts,
204    // as the old active master may incorrectly report back to RS and cause the new master to hang
205    // on a OpenRegionProcedure forever. See HBASE-22060 and HBASE-22074 for more details.
206    boolean succ = false;
207    try {
208      persistAndWake(env, regionNode);
209      succ = true;
210    } finally {
211      if (!succ) {
212        this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH;
213        this.transitionCode = null;
214        this.seqId = HConstants.NO_SEQNUM;
215      }
216    }
217    try {
218      updateTransitionWithoutPersistingToMeta(env, regionNode, transitionCode, seqId);
219    } catch (IOException e) {
220      throw new AssertionError("should not happen", e);
221    }
222  }
223
224  void serverCrashed(MasterProcedureEnv env, RegionStateNode regionNode, ServerName serverName) {
225    if (state == RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_SERVER_CRASH) {
226      // should be a retry
227      return;
228    }
229    RegionRemoteProcedureBaseState oldState = state;
230    // it is possible that the state is in REGION_REMOTE_PROCEDURE_SERVER_CRASH, think of this
231    // sequence
232    // 1. region is open on the target server and the above reportTransition call is succeeded
233    // 2. before we are woken up and update the meta, the target server crashes, and then we arrive
234    // here
235    this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_SERVER_CRASH;
236    boolean succ = false;
237    try {
238      persistAndWake(env, regionNode);
239      succ = true;
240    } finally {
241      if (!succ) {
242        this.state = oldState;
243      }
244    }
245  }
246
247  protected abstract void restoreSucceedState(AssignmentManager am, RegionStateNode regionNode,
248      long seqId) throws IOException;
249
250  void stateLoaded(AssignmentManager am, RegionStateNode regionNode) {
251    if (state == RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_REPORT_SUCCEED) {
252      try {
253        restoreSucceedState(am, regionNode, seqId);
254      } catch (IOException e) {
255        // should not happen as we are just restoring the state
256        throw new AssertionError(e);
257      }
258    }
259  }
260
261  private TransitRegionStateProcedure getParent(MasterProcedureEnv env) {
262    return (TransitRegionStateProcedure) env.getMasterServices().getMasterProcedureExecutor()
263      .getProcedure(getParentProcId());
264  }
265
266  private void unattach(MasterProcedureEnv env) {
267    getParent(env).unattachRemoteProc(this);
268  }
269
270  @Override
271  protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env)
272      throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException {
273    RegionStateNode regionNode = getRegionNode(env);
274    regionNode.lock();
275    try {
276      switch (state) {
277        case REGION_REMOTE_PROCEDURE_DISPATCH: {
278          // The code which wakes us up also needs to lock the RSN so here we do not need to
279          // synchronize
280          // on the event.
281          ProcedureEvent<?> event = regionNode.getProcedureEvent();
282          try {
283            env.getRemoteDispatcher().addOperationToNode(targetServer, this);
284          } catch (FailedRemoteDispatchException e) {
285            LOG.warn("Can not add remote operation {} for region {} to server {}, this usually " +
286              "because the server is alread dead, give up and mark the procedure as complete, " +
287              "the parent procedure will take care of this.", this, region, targetServer, e);
288            unattach(env);
289            return null;
290          }
291          event.suspend();
292          event.suspendIfNotReady(this);
293          throw new ProcedureSuspendedException();
294        }
295        case REGION_REMOTE_PROCEDURE_REPORT_SUCCEED:
296          env.getAssignmentManager().persistToMeta(regionNode);
297          unattach(env);
298          return null;
299        case REGION_REMOTE_PROCEDURE_DISPATCH_FAIL:
300          // the remote call is failed so we do not need to change the region state, just return.
301          unattach(env);
302          return null;
303        case REGION_REMOTE_PROCEDURE_SERVER_CRASH:
304          env.getAssignmentManager().regionClosedAbnormally(regionNode);
305          unattach(env);
306          return null;
307        default:
308          throw new IllegalStateException("Unknown state: " + state);
309      }
310    } catch (IOException e) {
311      if (retryCounter == null) {
312        retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
313      }
314      long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
315      LOG.warn("Failed updating meta, suspend {}secs {}; {};", backoff / 1000, this, regionNode, e);
316      setTimeout(Math.toIntExact(backoff));
317      setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
318      skipPersistence();
319      throw new ProcedureSuspendedException();
320    } finally {
321      regionNode.unlock();
322    }
323  }
324
325  @Override
326  protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
327    setState(ProcedureProtos.ProcedureState.RUNNABLE);
328    env.getProcedureScheduler().addFront(this);
329    return false; // 'false' means that this procedure handled the timeout
330  }
331
332  @Override
333  public boolean storeInDispatchedQueue() {
334    return false;
335  }
336
337  @Override
338  protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {
339    RegionRemoteProcedureBaseStateData.Builder builder =
340      RegionRemoteProcedureBaseStateData.newBuilder().setRegion(ProtobufUtil.toRegionInfo(region))
341        .setTargetServer(ProtobufUtil.toServerName(targetServer)).setState(state);
342    if (transitionCode != null) {
343      builder.setTransitionCode(transitionCode);
344      builder.setSeqId(seqId);
345    }
346    serializer.serialize(builder.build());
347  }
348
349  @Override
350  protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
351    RegionRemoteProcedureBaseStateData data =
352      serializer.deserialize(RegionRemoteProcedureBaseStateData.class);
353    region = ProtobufUtil.toRegionInfo(data.getRegion());
354    targetServer = ProtobufUtil.toServerName(data.getTargetServer());
355    state = data.getState();
356    if (data.hasTransitionCode()) {
357      transitionCode = data.getTransitionCode();
358      seqId = data.getSeqId();
359    }
360  }
361
362  @Override
363  protected void afterReplay(MasterProcedureEnv env) {
364    getParent(env).attachRemoteProc(this);
365  }
366
367  @Override public String getProcName() {
368    return getClass().getSimpleName() + " " + region.getEncodedName();
369  }
370
371  @Override protected void toStringClassDetails(StringBuilder builder) {
372    builder.append(getProcName());
373    if (targetServer != null) {
374      builder.append(", server=");
375      builder.append(this.targetServer);
376    }
377    if (this.retryCounter != null) {
378      builder.append(", retry=");
379      builder.append(this.retryCounter);
380    }
381  }
382}