001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.procedure;
019
020import java.io.IOException;
021import java.util.concurrent.Callable;
022import java.util.concurrent.CountDownLatch;
023import org.apache.hadoop.hbase.errorhandling.ForeignException;
024import org.apache.hadoop.hbase.errorhandling.ForeignExceptionDispatcher;
025import org.apache.hadoop.hbase.errorhandling.ForeignExceptionListener;
026import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
027import org.apache.hadoop.hbase.errorhandling.TimeoutExceptionInjector;
028import org.apache.yetus.audience.InterfaceAudience;
029import org.apache.zookeeper.KeeperException;
030import org.slf4j.Logger;
031import org.slf4j.LoggerFactory;
032
033/**
034 * Distributed procedure member's Subprocedure. A procedure is sarted on a ProcedureCoordinator
035 * which communicates with ProcedureMembers who create and start its part of the Procedure. This sub
036 * part is called a Subprocedure Users should subclass this and implement {@link #acquireBarrier()}
037 * (get local barrier for this member), {@link #insideBarrier()} (execute while globally barriered
038 * and release barrier) and {@link #cleanup(Exception)} (release state associated with
039 * subprocedure.) When submitted to a ProcedureMember, the call method is executed in a separate
040 * thread. Latches are use too block its progress and trigger continuations when barrier conditions
041 * are met. Exception that makes it out of calls to {@link #acquireBarrier()} or
042 * {@link #insideBarrier()} gets converted into {@link ForeignException}, which will get propagated
043 * to the {@link ProcedureCoordinator}. There is a category of procedure (ex: online-snapshots), and
044 * a user-specified instance-specific barrierName. (ex: snapshot121126).
045 */
046@InterfaceAudience.Private
047abstract public class Subprocedure implements Callable<Void> {
048  private static final Logger LOG = LoggerFactory.getLogger(Subprocedure.class);
049
050  // Name of the procedure
051  final private String barrierName;
052
053  //
054  // Execution state
055  //
056
057  /** wait on before allowing the in barrier phase to proceed */
058  private final CountDownLatch inGlobalBarrier;
059  /** counted down when the Subprocedure has completed */
060  private final CountDownLatch releasedLocalBarrier;
061
062  //
063  // Error handling
064  //
065  /** monitor to check for errors */
066  protected final ForeignExceptionDispatcher monitor;
067  /** frequency to check for errors (ms) */
068  protected final long wakeFrequency;
069  protected final TimeoutExceptionInjector executionTimeoutTimer;
070  protected final ProcedureMemberRpcs rpcs;
071
072  private volatile boolean complete = false;
073
074  /**
075   * @param member        reference to the member managing this subprocedure
076   * @param procName      name of the procedure this subprocedure is associated with
077   * @param monitor       notified if there is an error in the subprocedure
078   * @param wakeFrequency time in millis to wake to check if there is an error via the monitor (in
079   *                      milliseconds).
080   * @param timeout       time in millis that will trigger a subprocedure abort if it has not
081   *                      completed
082   */
083  public Subprocedure(ProcedureMember member, String procName, ForeignExceptionDispatcher monitor,
084    long wakeFrequency, long timeout) {
085    // Asserts should be caught during unit testing
086    assert member != null : "procedure member should be non-null";
087    assert member.getRpcs() != null : "rpc handlers should be non-null";
088    assert procName != null : "procedure name should be non-null";
089    assert monitor != null : "monitor should be non-null";
090
091    // Default to a very large timeout
092    this.rpcs = member.getRpcs();
093    this.barrierName = procName;
094    this.monitor = monitor;
095    // forward any failures to coordinator. Since this is a dispatcher, resend loops should not be
096    // possible.
097    this.monitor.addListener(new ForeignExceptionListener() {
098      @Override
099      public void receive(ForeignException ee) {
100        // if this is a notification from a remote source, just log
101        if (ee.isRemote()) {
102          LOG.debug("Was remote foreign exception, not redispatching error", ee);
103          return;
104        }
105        // if this is a local KeeperException, don't attempt to notify other members
106        if (ee.getCause() instanceof KeeperException) {
107          LOG.debug("Was KeeperException, not redispatching error", ee);
108          return;
109        }
110        // if it is other local error, then send it to the coordinator
111        try {
112          rpcs.sendMemberAborted(Subprocedure.this, ee);
113        } catch (IOException e) {
114          // this will fail all the running procedures, since the connection is down
115          LOG.error("Can't reach controller, not propagating error", e);
116        }
117      }
118    });
119
120    this.wakeFrequency = wakeFrequency;
121    this.inGlobalBarrier = new CountDownLatch(1);
122    this.releasedLocalBarrier = new CountDownLatch(1);
123
124    // accept error from timer thread, this needs to be started.
125    this.executionTimeoutTimer = new TimeoutExceptionInjector(monitor, timeout);
126  }
127
128  public String getName() {
129    return barrierName;
130  }
131
132  public String getMemberName() {
133    return rpcs.getMemberName();
134  }
135
136  private void rethrowException() throws ForeignException {
137    monitor.rethrowException();
138  }
139
140  /**
141   * Execute the Subprocedure {@link #acquireBarrier()} and {@link #insideBarrier()} methods while
142   * keeping some state for other threads to access. This would normally be executed by the
143   * ProcedureMember when a acquire message comes from the coordinator. Rpcs are used to spend
144   * message back to the coordinator after different phases are executed. Any exceptions caught
145   * during the execution (except for InterruptedException) get converted and propagated to
146   * coordinator via {@link ProcedureMemberRpcs#sendMemberAborted( Subprocedure, ForeignException)}.
147   */
148  @SuppressWarnings("finally")
149  @Override
150  final public Void call() {
151    LOG.debug("Starting subprocedure '" + barrierName + "' with timeout "
152      + executionTimeoutTimer.getMaxTime() + "ms");
153    // start the execution timeout timer
154    executionTimeoutTimer.start();
155
156    try {
157      // start by checking for error first
158      rethrowException();
159      LOG.debug("Subprocedure '" + barrierName + "' starting 'acquire' stage");
160      acquireBarrier();
161      LOG.debug("Subprocedure '" + barrierName + "' locally acquired");
162      rethrowException();
163
164      // vote yes to coordinator about being prepared
165      rpcs.sendMemberAcquired(this);
166      LOG.debug("Subprocedure '" + barrierName + "' coordinator notified of 'acquire', waiting on"
167        + " 'reached' or 'abort' from coordinator");
168
169      // wait for the procedure to reach global barrier before proceding
170      waitForReachedGlobalBarrier();
171      rethrowException(); // if Coordinator aborts, will bail from here with exception
172
173      // In traditional 2PC, if a member reaches this state the TX has been committed and the
174      // member is responsible for rolling forward and recovering and completing the subsequent
175      // operations in the case of failure. It cannot rollback.
176      //
177      // This implementation is not 2PC since it can still rollback here, and thus has different
178      // semantics.
179
180      LOG.debug("Subprocedure '" + barrierName + "' received 'reached' from coordinator.");
181      byte[] dataToCoordinator = insideBarrier();
182      LOG.debug("Subprocedure '" + barrierName + "' locally completed");
183      rethrowException();
184
185      // Ack that the member has executed and released local barrier
186      rpcs.sendMemberCompleted(this, dataToCoordinator);
187      LOG.debug("Subprocedure '" + barrierName + "' has notified controller of completion");
188
189      // make sure we didn't get an external exception
190      rethrowException();
191    } catch (Exception e) {
192      String msg = null;
193      if (e instanceof InterruptedException) {
194        msg = "Procedure '" + barrierName + "' aborting due to interrupt!"
195          + " Likely due to pool shutdown.";
196        Thread.currentThread().interrupt();
197      } else if (e instanceof ForeignException) {
198        msg = "Subprocedure '" + barrierName + "' aborting due to a ForeignException!";
199      } else {
200        msg = "Subprocedure '" + barrierName + "' failed!";
201      }
202      cancel(msg, e);
203
204      LOG.debug("Subprocedure '" + barrierName + "' running cleanup.");
205      cleanup(e);
206    } finally {
207      releasedLocalBarrier.countDown();
208
209      // tell the timer we are done, if we get here successfully
210      executionTimeoutTimer.complete();
211      complete = true;
212      LOG.debug("Subprocedure '" + barrierName + "' completed.");
213      return null;
214    }
215  }
216
217  boolean isComplete() {
218    return complete;
219  }
220
221  /**
222   * exposed for testing.
223   */
224  ForeignExceptionSnare getErrorCheckable() {
225    return this.monitor;
226  }
227
228  /**
229   * The implementation of this method should gather and hold required resources (locks, disk space,
230   * etc) to satisfy the Procedures barrier condition. For example, this would be where to make all
231   * the regions on a RS on the quiescent for an procedure that required all regions to be globally
232   * quiesed. Users should override this method. If a quiescent is not required, this is overkill
233   * but can still be used to execute a procedure on all members and to propagate any exceptions. n
234   */
235  abstract public void acquireBarrier() throws ForeignException;
236
237  /**
238   * The implementation of this method should act with the assumption that the barrier condition has
239   * been satisfied. Continuing the previous example, a condition could be that all RS's globally
240   * have been quiesced, and procedures that require this precondition could be implemented here.
241   * The implementation should also collect the result of the subprocedure as data to be returned to
242   * the coordinator upon successful completion. Users should override this method.
243   * @return the data the subprocedure wants to return to coordinator side. n
244   */
245  abstract public byte[] insideBarrier() throws ForeignException;
246
247  /**
248   * Users should override this method. This implementation of this method should rollback and
249   * cleanup any temporary or partially completed state that the {@link #acquireBarrier()} may have
250   * created. n
251   */
252  abstract public void cleanup(Exception e);
253
254  /**
255   * Method to cancel the Subprocedure by injecting an exception from and external source. n
256   */
257  public void cancel(String msg, Throwable cause) {
258    LOG.error(msg, cause);
259    complete = true;
260    if (cause instanceof ForeignException) {
261      monitor.receive((ForeignException) cause);
262    } else {
263      monitor.receive(new ForeignException(getMemberName(), cause));
264    }
265  }
266
267  /**
268   * Callback for the member rpcs to call when the global barrier has been reached. This unblocks
269   * the main subprocedure exectuion thread so that the Subprocedure's {@link #insideBarrier()}
270   * method can be run.
271   */
272  public void receiveReachedGlobalBarrier() {
273    inGlobalBarrier.countDown();
274  }
275
276  //
277  // Subprocedure Internal State interface
278  //
279
280  /**
281   * Wait for the reached global barrier notification. Package visibility for testing nn
282   */
283  void waitForReachedGlobalBarrier() throws ForeignException, InterruptedException {
284    Procedure.waitForLatch(inGlobalBarrier, monitor, wakeFrequency,
285      barrierName + ":remote acquired");
286  }
287
288  /**
289   * Waits until the entire procedure has globally completed, or has been aborted. nn
290   */
291  public void waitForLocallyCompleted() throws ForeignException, InterruptedException {
292    Procedure.waitForLatch(releasedLocalBarrier, monitor, wakeFrequency,
293      barrierName + ":completed");
294  }
295
296  /**
297   * Empty Subprocedure for testing. Must be public for stubbing used in testing to work.
298   */
299  public static class SubprocedureImpl extends Subprocedure {
300
301    public SubprocedureImpl(ProcedureMember member, String opName,
302      ForeignExceptionDispatcher monitor, long wakeFrequency, long timeout) {
303      super(member, opName, monitor, wakeFrequency, timeout);
304    }
305
306    @Override
307    public void acquireBarrier() throws ForeignException {
308    }
309
310    @Override
311    public byte[] insideBarrier() throws ForeignException {
312      return new byte[0];
313    }
314
315    @Override
316    public void cleanup(Exception e) {
317    }
318  };
319}