001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.assignment;
019
020import java.io.IOException;
021import java.util.Optional;
022
023import org.apache.hadoop.hbase.HConstants;
024import org.apache.hadoop.hbase.ServerName;
025import org.apache.hadoop.hbase.TableName;
026import org.apache.hadoop.hbase.client.RegionInfo;
027import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
028import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
029import org.apache.hadoop.hbase.master.procedure.TableProcedureInterface;
030import org.apache.hadoop.hbase.procedure2.FailedRemoteDispatchException;
031import org.apache.hadoop.hbase.procedure2.Procedure;
032import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
033import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
034import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
035import org.apache.hadoop.hbase.procedure2.ProcedureUtil;
036import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
037import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher;
038import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteProcedure;
039import org.apache.hadoop.hbase.procedure2.RemoteProcedureException;
040import org.apache.hadoop.hbase.util.RetryCounter;
041import org.apache.yetus.audience.InterfaceAudience;
042import org.slf4j.Logger;
043import org.slf4j.LoggerFactory;
044
045import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
046import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionRemoteProcedureBaseState;
047import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionRemoteProcedureBaseStateData;
048import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;
049import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
050
/**
 * The base class for the remote procedures used to open/close a region.
 * <p/>
 * Notice that here we do not care about the result of the remote call: once the remote call has
 * finished, whether it succeeded or not, we always finish the procedure. The parent procedure
 * should take care of the result and try to reschedule if the result is not good.
 */
058@InterfaceAudience.Private
059public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedureEnv>
060    implements TableProcedureInterface, RemoteProcedure<MasterProcedureEnv, ServerName> {
061
  private static final Logger LOG = LoggerFactory.getLogger(RegionRemoteProcedureBase.class);

  // The region this procedure operates on; persisted as part of the procedure state data.
  protected RegionInfo region;

  // The server the remote operation is dispatched to, and the only server whose transition
  // report is accepted by reportTransition.
  protected ServerName targetServer;

  // Current step of this procedure; a newly created procedure starts in the DISPATCH state.
  private RegionRemoteProcedureBaseState state =
    RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH;

  // The transition code recorded by reportTransition (or restored by deserializeStateData); null
  // as long as no report has been accepted.
  private TransitionCode transitionCode;

  // The sequence id recorded together with transitionCode; it is only serialized when
  // transitionCode is set.
  private long seqId;

  // Created lazily in execute when an IOException is caught, used to compute the backoff before
  // the next attempt; it is not part of the serialized state.
  private RetryCounter retryCounter;
076
  /**
   * No-arg constructor which leaves all the fields at their defaults. The region, the target
   * server and the state are expected to be filled in afterwards by
   * {@link #deserializeStateData(ProcedureStateSerializer)}.
   * <p/>
   * NOTE(review): presumably this is the constructor used by the procedure framework when
   * reloading a persisted procedure -- confirm.
   */
  protected RegionRemoteProcedureBase() {
  }
079
  /**
   * Creates a remote procedure for the given region and target server, and attaches it to the
   * given parent procedure.
   * @param parent the parent procedure; {@code attachRemoteProc(this)} is called on it once the
   *          fields of this procedure have been assigned
   * @param region the region to operate on
   * @param targetServer the server the remote operation will be dispatched to
   */
  protected RegionRemoteProcedureBase(TransitRegionStateProcedure parent, RegionInfo region,
      ServerName targetServer) {
    this.region = region;
    this.targetServer = targetServer;
    // attach only after the fields above are set, as we are handing out 'this' here
    parent.attachRemoteProc(this);
  }
086
087  @Override
088  public Optional<RemoteProcedureDispatcher.RemoteOperation> remoteCallBuild(MasterProcedureEnv env,
089      ServerName remote) {
090    // REPORT_SUCCEED means that this remote open/close request already executed in RegionServer.
091    // So return empty operation and RSProcedureDispatcher no need to send it again.
092    if (state == RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_REPORT_SUCCEED) {
093      return Optional.empty();
094    }
095    return Optional.of(newRemoteOperation());
096  }
097
  /**
   * Creates the operation to be sent to the target server. Called from
   * {@link #remoteCallBuild(MasterProcedureEnv, ServerName)} whenever an operation actually has
   * to be sent.
   */
  protected abstract RemoteProcedureDispatcher.RemoteOperation newRemoteOperation();
099
  /**
   * Not supported by this procedure, the result of the remote operation is reported through
   * reportRegionStateTransition instead.
   * @throws UnsupportedOperationException always
   */
  @Override
  public void remoteOperationCompleted(MasterProcedureEnv env) {
    // should not be called since we use reportRegionStateTransition to report the result
    throw new UnsupportedOperationException();
  }
105
  /**
   * Not supported by this procedure, the result of the remote operation is reported through
   * reportRegionStateTransition instead.
   * @throws UnsupportedOperationException always
   */
  @Override
  public void remoteOperationFailed(MasterProcedureEnv env, RemoteProcedureException error) {
    // should not be called since we use reportRegionStateTransition to report the result
    throw new UnsupportedOperationException();
  }
111
112  private RegionStateNode getRegionNode(MasterProcedureEnv env) {
113    return env.getAssignmentManager().getRegionStates().getRegionStateNode(region);
114  }
115
116  @Override
117  public void remoteCallFailed(MasterProcedureEnv env, ServerName remote, IOException exception) {
118    RegionStateNode regionNode = getRegionNode(env);
119    regionNode.lock();
120    try {
121      if (!env.getMasterServices().getServerManager().isServerOnline(remote)) {
122        // the SCP will interrupt us, give up
123        LOG.debug("{} for region {}, targetServer {} is dead, SCP will interrupt us, give up", this,
124          regionNode, remote);
125        return;
126      }
127      if (state != RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH) {
128        // not sure how can this happen but anyway let's add a check here to avoid waking the wrong
129        // procedure...
130        LOG.warn("{} for region {}, targetServer={} has already been woken up, ignore", this,
131          regionNode, remote);
132        return;
133      }
134      LOG.warn("The remote operation {} for region {} to server {} failed", this, regionNode,
135        remote, exception);
136      // It is OK to not persist the state here, as we do not need to change the region state if the
137      // remote call is failed. If the master crashed before we actually execute the procedure and
138      // persist the new state, it is fine to retry on the same target server again.
139      state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH_FAIL;
140      regionNode.getProcedureEvent().wake(env.getProcedureScheduler());
141    } finally {
142      regionNode.unlock();
143    }
144  }
145
  /**
   * @return the table which the region of this procedure belongs to
   */
  @Override
  public TableName getTableName() {
    return region.getTable();
  }
150
151  @Override
152  protected boolean waitInitialized(MasterProcedureEnv env) {
153    if (TableName.isMetaTableName(getTableName())) {
154      return false;
155    }
156    // First we need meta to be loaded, and second, if meta is not online then we will likely to
157    // fail when updating meta so we wait until it is assigned.
158    AssignmentManager am = env.getAssignmentManager();
159    return am.waitMetaLoaded(this) || am.waitMetaAssigned(this, region);
160  }
161
  /**
   * Rollback is not supported by this procedure.
   * @throws UnsupportedOperationException always
   */
  @Override
  protected void rollback(MasterProcedureEnv env) throws IOException, InterruptedException {
    throw new UnsupportedOperationException();
  }
166
  /**
   * This procedure never handles an abort request.
   * @return always false
   */
  @Override
  protected boolean abort(MasterProcedureEnv env) {
    return false;
  }
171
  /**
   * Do some checks to see if the report is valid. Called by reportTransition before any state of
   * this procedure is changed.
   * @param regionNode the state node of the region the report is for
   * @param transitionCode the reported transition code
   * @param seqId the reported sequence id
   * @throws UnexpectedStateException presumably when the report is not acceptable -- the actual
   *           checks are defined by the subclasses
   */
  protected abstract void checkTransition(RegionStateNode regionNode, TransitionCode transitionCode,
      long seqId) throws UnexpectedStateException;
175
  /**
   * Change the in memory state of the regionNode, but do not update meta. Called by
   * reportTransition after the state of this procedure has been persisted to the procedure store;
   * an IOException thrown from here is turned into an AssertionError there.
   * @param env the master procedure environment
   * @param regionNode the state node of the region to update
   * @param transitionCode the reported transition code
   * @param seqId the reported sequence id
   */
  protected abstract void updateTransitionWithoutPersistingToMeta(MasterProcedureEnv env,
      RegionStateNode regionNode, TransitionCode transitionCode, long seqId) throws IOException;
179
  // Persists the current state of this procedure to the procedure store and then wakes up the
  // procedure event of the given region node.
  // A bit strange but the procedure store will throw RuntimeException if we can not persist the
  // state, so upper layer should take care of this... In that case the wake call below is not
  // reached, and the callers in this class roll back their in memory changes.
  private void persistAndWake(MasterProcedureEnv env, RegionStateNode regionNode) {
    env.getMasterServices().getMasterProcedureExecutor().getStore().update(this);
    regionNode.getProcedureEvent().wake(env.getProcedureScheduler());
  }
186
  /**
   * Handles the region state transition reported for this procedure. Should be called with
   * RegionStateNode locked, to avoid race with the execute method below.
   * <p/>
   * A report is only processed while the procedure is still in the DISPATCH state, in any other
   * state the call returns without doing anything. For an accepted report, the procedure moves to
   * REPORT_SUCCEED, records the transition code and the sequence id, persists itself to the
   * procedure store, wakes up the procedure event of the region node, and finally applies the
   * transition to the in memory state of the region node.
   * @param env the master procedure environment
   * @param regionNode the state node of the region, expected to be locked by the caller
   * @param serverName the server which sent the report, must be equal to the target server
   * @param transitionCode the reported transition code
   * @param seqId the reported sequence id
   * @throws UnexpectedStateException if the report comes from a server other than the target
   *           server, or if checkTransition rejects the report
   */
  void reportTransition(MasterProcedureEnv env, RegionStateNode regionNode, ServerName serverName,
      TransitionCode transitionCode, long seqId) throws IOException {
    if (state != RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH) {
      // should be a retry
      return;
    }
    if (!targetServer.equals(serverName)) {
      throw new UnexpectedStateException("Received report from " + serverName + ", expected " +
        targetServer + ", " + regionNode + ", proc=" + this);
    }
    checkTransition(regionNode, transitionCode, seqId);
    // this state means we have received the report from RS, does not mean the result is fine, as we
    // may received a FAILED_OPEN.
    this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_REPORT_SUCCEED;
    this.transitionCode = transitionCode;
    this.seqId = seqId;
    // Persist the transition code and openSeqNum(if provided).
    // We should not update the hbase:meta directly as this may cause races when master restarts,
    // as the old active master may incorrectly report back to RS and cause the new master to hang
    // on a OpenRegionProcedure forever. See HBASE-22060 and HBASE-22074 for more details.
    boolean succ = false;
    try {
      persistAndWake(env, regionNode);
      succ = true;
    } finally {
      if (!succ) {
        // persisting failed with an exception which keeps propagating to the caller; undo the in
        // memory changes above so that we are back in DISPATCH and a later report can still be
        // processed.
        this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH;
        this.transitionCode = null;
        this.seqId = HConstants.NO_SEQNUM;
      }
    }
    // The new procedure state has been persisted at this point, so a failure while updating the in
    // memory state of the region node can not be rolled back any more, treat it as a bug.
    try {
      updateTransitionWithoutPersistingToMeta(env, regionNode, transitionCode, seqId);
    } catch (IOException e) {
      throw new AssertionError("should not happen", e);
    }
  }
225
  /**
   * Moves this procedure to the SERVER_CRASH state, persists it to the procedure store and wakes
   * up the procedure event of the region node. Does nothing if the procedure is already in the
   * SERVER_CRASH state. If persisting fails, the previous state is restored and the exception
   * thrown by the procedure store propagates to the caller.
   * <p/>
   * NOTE(review): unlike reportTransition this method does not lock the region node itself,
   * presumably the caller is expected to hold the lock -- confirm against the callers.
   * @param env the master procedure environment
   * @param regionNode the state node of the region, its procedure event is woken up
   * @param serverName the crashed server; it is not used in this method
   */
  void serverCrashed(MasterProcedureEnv env, RegionStateNode regionNode, ServerName serverName) {
    if (state == RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_SERVER_CRASH) {
      // should be a retry
      return;
    }
    // remember the state we come from, so we can go back to it if persisting fails
    RegionRemoteProcedureBaseState oldState = state;
    // it is possible that the state is in REGION_REMOTE_PROCEDURE_REPORT_SUCCEED here, think of
    // this sequence
    // 1. region is open on the target server and the above reportTransition call is succeeded
    // 2. before we are woken up and update the meta, the target server crashes, and then we arrive
    // here
    this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_SERVER_CRASH;
    boolean succ = false;
    try {
      persistAndWake(env, regionNode);
      succ = true;
    } finally {
      if (!succ) {
        this.state = oldState;
      }
    }
  }
248
  /**
   * Restores the state for a procedure which was loaded in the REPORT_SUCCEED state. Called by
   * stateLoaded with the sequence id stored in this procedure; an IOException thrown from here is
   * turned into an AssertionError there.
   * @param am the assignment manager
   * @param regionNode the state node of the region of this procedure
   * @param seqId the sequence id recorded from the accepted report
   */
  protected abstract void restoreSucceedState(AssignmentManager am, RegionStateNode regionNode,
      long seqId) throws IOException;
251
252  void stateLoaded(AssignmentManager am, RegionStateNode regionNode) {
253    if (state == RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_REPORT_SUCCEED) {
254      try {
255        restoreSucceedState(am, regionNode, seqId);
256      } catch (IOException e) {
257        // should not happen as we are just restoring the state
258        throw new AssertionError(e);
259      }
260    }
261  }
262
  // Looks up the parent procedure by its procedure id in the master procedure executor. The parent
  // is expected to be a TransitRegionStateProcedure, otherwise the cast below fails.
  // NOTE(review): the result is not checked for null here, presumably the parent is always still
  // known to the executor while this procedure is alive -- confirm.
  private TransitRegionStateProcedure getParent(MasterProcedureEnv env) {
    return (TransitRegionStateProcedure) env.getMasterServices().getMasterProcedureExecutor()
      .getProcedure(getParentProcId());
  }
267
  // Detaches this procedure from its parent procedure; execute calls this on every path which
  // finishes the procedure.
  private void unattach(MasterProcedureEnv env) {
    getParent(env).unattachRemoteProc(this);
  }
271
  /**
   * Drives this procedure according to its current state, holding the lock of the region node
   * for the whole call:
   * <ul>
   * <li>DISPATCH: adds this procedure as an operation for the target server to the remote
   * dispatcher and then suspends on the procedure event of the region node. If the operation can
   * not be added, the procedure detaches from its parent and finishes.</li>
   * <li>REPORT_SUCCEED: persists the region node to meta, detaches from the parent and
   * finishes.</li>
   * <li>DISPATCH_FAIL: detaches from the parent and finishes.</li>
   * <li>SERVER_CRASH: calls regionClosedAbnormally for the region node, detaches from the parent
   * and finishes.</li>
   * </ul>
   * If an IOException is caught, the procedure is suspended with a timeout taken from the retry
   * counter, and the state of this procedure is left unchanged.
   * @return always null, no sub procedures are created here
   * @throws ProcedureSuspendedException when waiting to be woken up after the dispatch, or when
   *           backing off after an IOException
   * @throws IllegalStateException if the procedure is in a state not handled here
   */
  @Override
  protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env)
      throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException {
    RegionStateNode regionNode = getRegionNode(env);
    regionNode.lock();
    try {
      switch (state) {
        case REGION_REMOTE_PROCEDURE_DISPATCH: {
          // The code which wakes us up also needs to lock the RSN so here we do not need to
          // synchronize on the event.
          ProcedureEvent<?> event = regionNode.getProcedureEvent();
          try {
            env.getRemoteDispatcher().addOperationToNode(targetServer, this);
          } catch (FailedRemoteDispatchException e) {
            // NOTE(review): "alread" in the message below is a typo for "already"; it is left as
            // is here because the message is runtime output.
            LOG.warn("Can not add remote operation {} for region {} to server {}, this usually " +
              "because the server is alread dead, give up and mark the procedure as complete, " +
              "the parent procedure will take care of this.", this, region, targetServer, e);
            unattach(env);
            return null;
          }
          // The event is woken up again by remoteCallFailed, reportTransition or serverCrashed,
          // all of which change the state before waking, so the next execution takes one of the
          // other branches. NOTE(review): presumably suspend() marks the event as not ready and
          // suspendIfNotReady(this) then queues this procedure on it -- confirm against
          // ProcedureEvent.
          event.suspend();
          event.suspendIfNotReady(this);
          throw new ProcedureSuspendedException();
        }
        case REGION_REMOTE_PROCEDURE_REPORT_SUCCEED:
          // reportTransition only updated the in memory state of the region node, the update of
          // meta happens here.
          env.getAssignmentManager().persistToMeta(regionNode);
          unattach(env);
          return null;
        case REGION_REMOTE_PROCEDURE_DISPATCH_FAIL:
          // the remote call is failed so we do not need to change the region state, just return.
          unattach(env);
          return null;
        case REGION_REMOTE_PROCEDURE_SERVER_CRASH:
          env.getAssignmentManager().regionClosedAbnormally(regionNode);
          unattach(env);
          return null;
        default:
          throw new IllegalStateException("Unknown state: " + state);
      }
    } catch (IOException e) {
      // Back off and retry: the retry counter is created on the first failure and reused for the
      // following ones, so getBackoffTimeAndIncrementAttempts sees the growing attempt count.
      if (retryCounter == null) {
        retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
      }
      long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
      LOG.warn("Failed updating meta, suspend {}secs {}; {};", backoff / 1000, this, regionNode, e);
      // toIntExact throws ArithmeticException instead of silently truncating a backoff which does
      // not fit into an int
      setTimeout(Math.toIntExact(backoff));
      setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
      // NOTE(review): presumably there is nothing worth persisting for this timeout-only change,
      // confirm against Procedure.skipPersistence
      skipPersistence();
      throw new ProcedureSuspendedException();
    } finally {
      regionNode.unlock();
    }
  }
326
  /**
   * Puts this procedure back to work: it is marked as RUNNABLE and added to the front of the
   * procedure scheduler.
   * <p/>
   * NOTE(review): presumably invoked by the procedure framework when the timeout set in execute
   * after an IOException expires -- confirm.
   * @return always false
   */
  @Override
  protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
    setState(ProcedureProtos.ProcedureState.RUNNABLE);
    env.getProcedureScheduler().addFront(this);
    return false; // 'false' means that this procedure handled the timeout
  }
333
  /**
   * NOTE(review): presumably tells the remote dispatcher not to keep this procedure in its queue
   * of dispatched procedures, which would fit the fact that the result is delivered through
   * reportTransition and not through remoteOperationCompleted/remoteOperationFailed -- confirm
   * against RemoteProcedure.
   * @return always false
   */
  @Override
  public boolean storeInDispatchedQueue() {
    return false;
  }
338
339  @Override
340  protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {
341    RegionRemoteProcedureBaseStateData.Builder builder =
342      RegionRemoteProcedureBaseStateData.newBuilder().setRegion(ProtobufUtil.toRegionInfo(region))
343        .setTargetServer(ProtobufUtil.toServerName(targetServer)).setState(state);
344    if (transitionCode != null) {
345      builder.setTransitionCode(transitionCode);
346      builder.setSeqId(seqId);
347    }
348    serializer.serialize(builder.build());
349  }
350
351  @Override
352  protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
353    RegionRemoteProcedureBaseStateData data =
354      serializer.deserialize(RegionRemoteProcedureBaseStateData.class);
355    region = ProtobufUtil.toRegionInfo(data.getRegion());
356    targetServer = ProtobufUtil.toServerName(data.getTargetServer());
357    state = data.getState();
358    if (data.hasTransitionCode()) {
359      transitionCode = data.getTransitionCode();
360      seqId = data.getSeqId();
361    }
362  }
363
  /**
   * Attaches this procedure to its parent procedure again, by looking up the parent in the master
   * procedure executor.
   * <p/>
   * NOTE(review): presumably invoked by the framework after the procedures have been reloaded
   * from the procedure store, where the attach done in the constructor did not take place --
   * confirm.
   */
  @Override
  protected void afterReplay(MasterProcedureEnv env) {
    getParent(env).attachRemoteProc(this);
  }
368}