001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.client;
019
020
021import java.io.IOException;
022import java.io.InterruptedIOException;
023import java.util.Collections;
024import java.util.List;
025import java.util.concurrent.CancellationException;
026import java.util.concurrent.ExecutionException;
027import java.util.concurrent.ExecutorService;
028import java.util.concurrent.Future;
029import java.util.concurrent.TimeUnit;
030
031import org.apache.hadoop.conf.Configuration;
032import org.apache.hadoop.hbase.DoNotRetryIOException;
033import org.apache.hadoop.hbase.HBaseIOException;
034import org.apache.hadoop.hbase.HRegionLocation;
035import org.apache.hadoop.hbase.RegionLocations;
036import org.apache.hadoop.hbase.TableName;
037import org.apache.yetus.audience.InterfaceAudience;
038import org.slf4j.Logger;
039import org.slf4j.LoggerFactory;
040import org.apache.hadoop.hbase.ipc.HBaseRpcController;
041import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
042import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
043import org.apache.hadoop.hbase.shaded.protobuf.RequestConverter;
044import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos;
045import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
046
047import static org.apache.hadoop.hbase.HConstants.PRIORITY_UNSET;
048
/**
 * Caller that goes to replica if the primary region does not answer within a configurable
 * timeout. If the timeout is reached, it calls all the secondary replicas, and returns
 * the first answer. If the answer comes from one of the secondary replicas, it will
 * be marked as stale.
 */
055@InterfaceAudience.Private
056public class RpcRetryingCallerWithReadReplicas {
057  private static final Logger LOG =
058      LoggerFactory.getLogger(RpcRetryingCallerWithReadReplicas.class);
059
060  protected final ExecutorService pool;
061  protected final ClusterConnection cConnection;
062  protected final Configuration conf;
063  protected final Get get;
064  protected final TableName tableName;
065  protected final int timeBeforeReplicas;
066  private final int operationTimeout;
067  private final int rpcTimeout;
068  private final int retries;
069  private final RpcControllerFactory rpcControllerFactory;
070  private final RpcRetryingCallerFactory rpcRetryingCallerFactory;
071
072  public RpcRetryingCallerWithReadReplicas(
073      RpcControllerFactory rpcControllerFactory, TableName tableName,
074      ClusterConnection cConnection, final Get get,
075      ExecutorService pool, int retries, int operationTimeout, int rpcTimeout,
076      int timeBeforeReplicas) {
077    this.rpcControllerFactory = rpcControllerFactory;
078    this.tableName = tableName;
079    this.cConnection = cConnection;
080    this.conf = cConnection.getConfiguration();
081    this.get = get;
082    this.pool = pool;
083    this.retries = retries;
084    this.operationTimeout = operationTimeout;
085    this.rpcTimeout = rpcTimeout;
086    this.timeBeforeReplicas = timeBeforeReplicas;
087    this.rpcRetryingCallerFactory = new RpcRetryingCallerFactory(conf);
088  }
089
090  /**
091   * A RegionServerCallable that takes into account the replicas, i.e.
092   * - the call can be on any replica
093   * - we need to stop retrying when the call is completed
094   * - we can be interrupted
095   */
096  class ReplicaRegionServerCallable extends CancellableRegionServerCallable<Result> {
097    final int id;
098    public ReplicaRegionServerCallable(int id, HRegionLocation location) {
099      super(RpcRetryingCallerWithReadReplicas.this.cConnection,
100          RpcRetryingCallerWithReadReplicas.this.tableName, get.getRow(),
101          rpcControllerFactory.newController(), rpcTimeout, new RetryingTimeTracker(), PRIORITY_UNSET);
102      this.id = id;
103      this.location = location;
104    }
105
106    /**
107     * Two responsibilities
108     * - if the call is already completed (by another replica) stops the retries.
109     * - set the location to the right region, depending on the replica.
110     */
111    @Override
112    // TODO: Very like the super class implemenation. Can we shrink this down?
113    public void prepare(final boolean reload) throws IOException {
114      if (getRpcController().isCanceled()) return;
115      if (Thread.interrupted()) {
116        throw new InterruptedIOException();
117      }
118      if (reload || location == null) {
119        RegionLocations rl = getRegionLocations(false, id, cConnection, tableName, get.getRow());
120        location = id < rl.size() ? rl.getRegionLocation(id) : null;
121      }
122
123      if (location == null || location.getServerName() == null) {
124        // With this exception, there will be a retry. The location can be null for a replica
125        //  when the table is created or after a split.
126        throw new HBaseIOException("There is no location for replica id #" + id);
127      }
128
129      setStubByServiceName(this.location.getServerName());
130    }
131
132    @Override
133    // TODO: Very like the super class implemenation. Can we shrink this down?
134    protected Result rpcCall() throws Exception {
135      if (getRpcController().isCanceled()) return null;
136      if (Thread.interrupted()) {
137        throw new InterruptedIOException();
138      }
139      byte[] reg = location.getRegionInfo().getRegionName();
140      ClientProtos.GetRequest request = RequestConverter.buildGetRequest(reg, get);
141      HBaseRpcController hrc = (HBaseRpcController)getRpcController();
142      hrc.reset();
143      hrc.setCallTimeout(rpcTimeout);
144      hrc.setPriority(tableName);
145      ClientProtos.GetResponse response = getStub().get(hrc, request);
146      if (response == null) {
147        return null;
148      }
149      return ProtobufUtil.toResult(response.getResult(), hrc.cellScanner());
150    }
151  }
152
153  /**
154   * <p>
155   * Algo:
156   * - we put the query into the execution pool.
157   * - after x ms, if we don't have a result, we add the queries for the secondary replicas
158   * - we take the first answer
159   * - when done, we cancel what's left. Cancelling means:
160   * - removing from the pool if the actual call was not started
161   * - interrupting the call if it has started
162   * Client side, we need to take into account
163   * - a call is not executed immediately after being put into the pool
164   * - a call is a thread. Let's not multiply the number of thread by the number of replicas.
165   * Server side, if we can cancel when it's still in the handler pool, it's much better, as a call
166   * can take some i/o.
167   * </p>
168   * Globally, the number of retries, timeout and so on still applies, but it's per replica,
169   * not global. We continue until all retries are done, or all timeouts are exceeded.
170   */
171  public Result call(int operationTimeout)
172      throws DoNotRetryIOException, InterruptedIOException, RetriesExhaustedException {
173    boolean isTargetReplicaSpecified = (get.getReplicaId() >= 0);
174
175    RegionLocations rl = null;
176    boolean skipPrimary = false;
177    try {
178      rl = getRegionLocations(true,
179        (isTargetReplicaSpecified ? get.getReplicaId() : RegionReplicaUtil.DEFAULT_REPLICA_ID),
180        cConnection, tableName, get.getRow());
181    } catch (RetriesExhaustedException | DoNotRetryIOException e) {
182      // When there is no specific replica id specified. It just needs to load all replicas.
183      if (isTargetReplicaSpecified) {
184        throw e;
185      } else {
186        // We cannot get the primary replica location, it is possible that the region
187        // server hosting meta is down, it needs to proceed to try cached replicas.
188        if (cConnection instanceof ConnectionImplementation) {
189          rl = ((ConnectionImplementation)cConnection).getCachedLocation(tableName, get.getRow());
190          if (rl == null) {
191            // No cached locations
192            throw e;
193          }
194
195          // Primary replica location is not known, skip primary replica
196          skipPrimary = true;
197        } else {
198          // For completeness
199          throw e;
200        }
201      }
202    }
203
204    final ResultBoundedCompletionService<Result> cs =
205        new ResultBoundedCompletionService<>(this.rpcRetryingCallerFactory, pool, rl.size());
206    int startIndex = 0;
207    int endIndex = rl.size();
208
209    if(isTargetReplicaSpecified) {
210      addCallsForReplica(cs, rl, get.getReplicaId(), get.getReplicaId());
211      endIndex = 1;
212    } else {
213      if (!skipPrimary) {
214        addCallsForReplica(cs, rl, 0, 0);
215        try {
216          // wait for the timeout to see whether the primary responds back
217          Future<Result> f = cs.poll(timeBeforeReplicas, TimeUnit.MICROSECONDS); // Yes, microseconds
218          if (f != null) {
219            return f.get(); //great we got a response
220          }
221          if (cConnection.getConnectionMetrics() != null) {
222            cConnection.getConnectionMetrics().incrHedgedReadOps();
223          }
224        } catch (ExecutionException e) {
225          // We ignore the ExecutionException and continue with the secondary replicas
226          if (LOG.isDebugEnabled()) {
227            LOG.debug("Primary replica returns " + e.getCause());
228          }
229
230          // Skip the result from the primary as we know that there is something wrong
231          startIndex = 1;
232        } catch (CancellationException e) {
233          throw new InterruptedIOException();
234        } catch (InterruptedException e) {
235          throw new InterruptedIOException();
236        }
237      } else {
238        // Since primary replica is skipped, the endIndex needs to be adjusted accordingly
239        endIndex --;
240      }
241
242      // submit call for the all of the secondaries at once
243      addCallsForReplica(cs, rl, 1, rl.size() - 1);
244    }
245    try {
246      ResultBoundedCompletionService<Result>.QueueingFuture<Result> f =
247          cs.pollForFirstSuccessfullyCompletedTask(operationTimeout, TimeUnit.MILLISECONDS, startIndex, endIndex);
248      if (f == null) {
249        throw new RetriesExhaustedException("Timed out after " + operationTimeout +
250            "ms. Get is sent to replicas with startIndex: " + startIndex +
251            ", endIndex: " + endIndex + ", Locations: " + rl);
252      }
253      if (cConnection.getConnectionMetrics() != null && !isTargetReplicaSpecified &&
254          !skipPrimary && f.getReplicaId() != RegionReplicaUtil.DEFAULT_REPLICA_ID) {
255        cConnection.getConnectionMetrics().incrHedgedReadWin();
256      }
257      return f.get();
258    } catch (ExecutionException e) {
259      throwEnrichedException(e, retries);
260    } catch (CancellationException e) {
261      throw new InterruptedIOException();
262    } catch (InterruptedException e) {
263      throw new InterruptedIOException();
264    } finally {
265      // We get there because we were interrupted or because one or more of the
266      // calls succeeded or failed. In all case, we stop all our tasks.
267      cs.cancelAll();
268    }
269
270    LOG.error("Imposible? Arrive at an unreachable line..."); // unreachable
271    return null; // unreachable
272  }
273
274  /**
275   * Extract the real exception from the ExecutionException, and throws what makes more
276   * sense.
277   */
278  static void throwEnrichedException(ExecutionException e, int retries)
279      throws RetriesExhaustedException, DoNotRetryIOException {
280    Throwable t = e.getCause();
281    assert t != null; // That's what ExecutionException is about: holding an exception
282
283    if (t instanceof RetriesExhaustedException) {
284      throw (RetriesExhaustedException) t;
285    }
286
287    if (t instanceof DoNotRetryIOException) {
288      throw (DoNotRetryIOException) t;
289    }
290
291    RetriesExhaustedException.ThrowableWithExtraContext qt =
292        new RetriesExhaustedException.ThrowableWithExtraContext(t,
293            EnvironmentEdgeManager.currentTime(), null);
294
295    List<RetriesExhaustedException.ThrowableWithExtraContext> exceptions =
296        Collections.singletonList(qt);
297
298    throw new RetriesExhaustedException(retries, exceptions);
299  }
300
301  /**
302   * Creates the calls and submit them
303   *
304   * @param cs  - the completion service to use for submitting
305   * @param rl  - the region locations
306   * @param min - the id of the first replica, inclusive
307   * @param max - the id of the last replica, inclusive.
308   */
309  private void addCallsForReplica(ResultBoundedCompletionService<Result> cs,
310                                 RegionLocations rl, int min, int max) {
311    for (int id = min; id <= max; id++) {
312      HRegionLocation hrl = rl.getRegionLocation(id);
313      ReplicaRegionServerCallable callOnReplica = new ReplicaRegionServerCallable(id, hrl);
314      cs.submit(callOnReplica, operationTimeout, id);
315    }
316  }
317
318  static RegionLocations getRegionLocations(boolean useCache, int replicaId,
319                 ClusterConnection cConnection, TableName tableName, byte[] row)
320      throws RetriesExhaustedException, DoNotRetryIOException, InterruptedIOException {
321
322    RegionLocations rl;
323    try {
324      if (useCache) {
325        rl = cConnection.locateRegion(tableName, row, true, true, replicaId);
326      } else {
327        rl = cConnection.relocateRegion(tableName, row, replicaId);
328      }
329    } catch (DoNotRetryIOException | InterruptedIOException | RetriesExhaustedException e) {
330      throw e;
331    } catch (IOException e) {
332      throw new RetriesExhaustedException("Can't get the location for replica " + replicaId, e);
333    }
334    if (rl == null) {
335      throw new RetriesExhaustedException("Can't get the location for replica " + replicaId);
336    }
337
338    return rl;
339  }
340}