001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.regionserver;
019
020import static org.apache.hadoop.hbase.HConstants.REPLICATION_SCOPE_LOCAL;
021import static org.apache.hadoop.hbase.regionserver.HStoreFile.MAJOR_COMPACTION_KEY;
022import static org.apache.hadoop.hbase.util.CollectionUtils.computeIfAbsent;
023
024import edu.umd.cs.findbugs.annotations.Nullable;
025import java.io.EOFException;
026import java.io.FileNotFoundException;
027import java.io.IOException;
028import java.io.InterruptedIOException;
029import java.lang.reflect.Constructor;
030import java.nio.ByteBuffer;
031import java.nio.charset.StandardCharsets;
032import java.text.ParseException;
033import java.util.AbstractList;
034import java.util.ArrayList;
035import java.util.Arrays;
036import java.util.Collection;
037import java.util.Collections;
038import java.util.HashMap;
039import java.util.HashSet;
040import java.util.Iterator;
041import java.util.List;
042import java.util.Map;
043import java.util.Map.Entry;
044import java.util.NavigableMap;
045import java.util.NavigableSet;
046import java.util.Optional;
047import java.util.RandomAccess;
048import java.util.Set;
049import java.util.TreeMap;
050import java.util.UUID;
051import java.util.concurrent.Callable;
052import java.util.concurrent.CompletionService;
053import java.util.concurrent.ConcurrentHashMap;
054import java.util.concurrent.ConcurrentMap;
055import java.util.concurrent.ConcurrentSkipListMap;
056import java.util.concurrent.ExecutionException;
057import java.util.concurrent.ExecutorCompletionService;
058import java.util.concurrent.ExecutorService;
059import java.util.concurrent.Executors;
060import java.util.concurrent.Future;
061import java.util.concurrent.FutureTask;
062import java.util.concurrent.ThreadFactory;
063import java.util.concurrent.ThreadPoolExecutor;
064import java.util.concurrent.TimeUnit;
065import java.util.concurrent.TimeoutException;
066import java.util.concurrent.atomic.AtomicBoolean;
067import java.util.concurrent.atomic.AtomicInteger;
068import java.util.concurrent.atomic.LongAdder;
069import java.util.concurrent.locks.Lock;
070import java.util.concurrent.locks.ReadWriteLock;
071import java.util.concurrent.locks.ReentrantReadWriteLock;
072import java.util.function.Function;
073import org.apache.hadoop.conf.Configuration;
074import org.apache.hadoop.fs.FileStatus;
075import org.apache.hadoop.fs.FileSystem;
076import org.apache.hadoop.fs.LocatedFileStatus;
077import org.apache.hadoop.fs.Path;
078import org.apache.hadoop.hbase.Cell;
079import org.apache.hadoop.hbase.CellBuilderType;
080import org.apache.hadoop.hbase.CellComparator;
081import org.apache.hadoop.hbase.CellComparatorImpl;
082import org.apache.hadoop.hbase.CellScanner;
083import org.apache.hadoop.hbase.CellUtil;
084import org.apache.hadoop.hbase.CompareOperator;
085import org.apache.hadoop.hbase.CompoundConfiguration;
086import org.apache.hadoop.hbase.DoNotRetryIOException;
087import org.apache.hadoop.hbase.DroppedSnapshotException;
088import org.apache.hadoop.hbase.ExtendedCellBuilderFactory;
089import org.apache.hadoop.hbase.HConstants;
090import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
091import org.apache.hadoop.hbase.HDFSBlocksDistribution;
092import org.apache.hadoop.hbase.KeyValue;
093import org.apache.hadoop.hbase.KeyValueUtil;
094import org.apache.hadoop.hbase.NamespaceDescriptor;
095import org.apache.hadoop.hbase.NotServingRegionException;
096import org.apache.hadoop.hbase.PrivateCellUtil;
097import org.apache.hadoop.hbase.RegionTooBusyException;
098import org.apache.hadoop.hbase.TableName;
099import org.apache.hadoop.hbase.Tag;
100import org.apache.hadoop.hbase.TagUtil;
101import org.apache.hadoop.hbase.UnknownScannerException;
102import org.apache.hadoop.hbase.client.Append;
103import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
104import org.apache.hadoop.hbase.client.CompactionState;
105import org.apache.hadoop.hbase.client.Delete;
106import org.apache.hadoop.hbase.client.Durability;
107import org.apache.hadoop.hbase.client.Get;
108import org.apache.hadoop.hbase.client.Increment;
109import org.apache.hadoop.hbase.client.IsolationLevel;
110import org.apache.hadoop.hbase.client.Mutation;
111import org.apache.hadoop.hbase.client.PackagePrivateFieldAccessor;
112import org.apache.hadoop.hbase.client.Put;
113import org.apache.hadoop.hbase.client.RegionInfo;
114import org.apache.hadoop.hbase.client.RegionInfoBuilder;
115import org.apache.hadoop.hbase.client.RegionReplicaUtil;
116import org.apache.hadoop.hbase.client.Result;
117import org.apache.hadoop.hbase.client.RowMutations;
118import org.apache.hadoop.hbase.client.Scan;
119import org.apache.hadoop.hbase.client.TableDescriptor;
120import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
121import org.apache.hadoop.hbase.conf.ConfigurationManager;
122import org.apache.hadoop.hbase.conf.PropagatingConfigurationObserver;
123import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
124import org.apache.hadoop.hbase.coprocessor.RegionObserver.MutationType;
125import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
126import org.apache.hadoop.hbase.exceptions.FailedSanityCheckException;
127import org.apache.hadoop.hbase.exceptions.TimeoutIOException;
128import org.apache.hadoop.hbase.exceptions.UnknownProtocolException;
129import org.apache.hadoop.hbase.filter.ByteArrayComparable;
130import org.apache.hadoop.hbase.filter.FilterWrapper;
131import org.apache.hadoop.hbase.filter.IncompatibleFilterException;
132import org.apache.hadoop.hbase.io.HFileLink;
133import org.apache.hadoop.hbase.io.HeapSize;
134import org.apache.hadoop.hbase.io.TimeRange;
135import org.apache.hadoop.hbase.io.hfile.HFile;
136import org.apache.hadoop.hbase.ipc.CallerDisconnectedException;
137import org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils;
138import org.apache.hadoop.hbase.ipc.RpcCall;
139import org.apache.hadoop.hbase.ipc.RpcServer;
140import org.apache.hadoop.hbase.monitoring.MonitoredTask;
141import org.apache.hadoop.hbase.monitoring.TaskMonitor;
142import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl.WriteEntry;
143import org.apache.hadoop.hbase.regionserver.ScannerContext.LimitScope;
144import org.apache.hadoop.hbase.regionserver.ScannerContext.NextState;
145import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext;
146import org.apache.hadoop.hbase.regionserver.compactions.CompactionLifeCycleTracker;
147import org.apache.hadoop.hbase.regionserver.throttle.CompactionThroughputControllerFactory;
148import org.apache.hadoop.hbase.regionserver.throttle.NoLimitThroughputController;
149import org.apache.hadoop.hbase.regionserver.throttle.ThroughputController;
150import org.apache.hadoop.hbase.regionserver.wal.WALUtil;
151import org.apache.hadoop.hbase.replication.ReplicationUtils;
152import org.apache.hadoop.hbase.replication.regionserver.ReplicationObserver;
153import org.apache.hadoop.hbase.security.User;
154import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
155import org.apache.hadoop.hbase.snapshot.SnapshotManifest;
156import org.apache.hadoop.hbase.trace.TraceUtil;
157import org.apache.hadoop.hbase.util.Bytes;
158import org.apache.hadoop.hbase.util.CancelableProgressable;
159import org.apache.hadoop.hbase.util.ClassSize;
160import org.apache.hadoop.hbase.util.CollectionUtils;
161import org.apache.hadoop.hbase.util.CompressionTest;
162import org.apache.hadoop.hbase.util.EncryptionTest;
163import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
164import org.apache.hadoop.hbase.util.FSUtils;
165import org.apache.hadoop.hbase.util.HashedBytes;
166import org.apache.hadoop.hbase.util.NonceKey;
167import org.apache.hadoop.hbase.util.Pair;
168import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil;
169import org.apache.hadoop.hbase.util.Threads;
170import org.apache.hadoop.hbase.wal.WAL;
171import org.apache.hadoop.hbase.wal.WALEdit;
172import org.apache.hadoop.hbase.wal.WALFactory;
173import org.apache.hadoop.hbase.wal.WALKey;
174import org.apache.hadoop.hbase.wal.WALKeyImpl;
175import org.apache.hadoop.hbase.wal.WALSplitter;
176import org.apache.hadoop.hbase.wal.WALSplitter.MutationReplay;
177import org.apache.hadoop.io.MultipleIOException;
178import org.apache.hadoop.util.StringUtils;
179import org.apache.htrace.core.TraceScope;
180import org.apache.yetus.audience.InterfaceAudience;
181import org.slf4j.Logger;
182import org.slf4j.LoggerFactory;
183
184import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
185import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
186import org.apache.hbase.thirdparty.com.google.common.collect.Iterables;
187import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
188import org.apache.hbase.thirdparty.com.google.common.collect.Maps;
189import org.apache.hbase.thirdparty.com.google.common.io.Closeables;
190import org.apache.hbase.thirdparty.com.google.protobuf.Service;
191import org.apache.hbase.thirdparty.com.google.protobuf.TextFormat;
192import org.apache.hbase.thirdparty.com.google.protobuf.UnsafeByteOperations;
193
194import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
195import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos;
196import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.CoprocessorServiceCall;
197import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.RegionLoad;
198import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.StoreSequenceId;
199import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription;
200import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos;
201import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.CompactionDescriptor;
202import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor;
203import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor.FlushAction;
204import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor.StoreFlushDescriptor;
205import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.RegionEventDescriptor;
206import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.RegionEventDescriptor.EventType;
207import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.StoreDescriptor;
208
209/**
 * A Region stores data for a certain region of a table. It stores all columns
 * for each row. A given table consists of one or more Regions.
 *
 * <p>A Region is defined by its table and its key extent.
214 *
215 * <p>Locking at the Region level serves only one purpose: preventing the
216 * region from being closed (and consequently split) while other operations
217 * are ongoing. Each row level operation obtains both a row lock and a region
218 * read lock for the duration of the operation. While a scanner is being
219 * constructed, getScanner holds a read lock. If the scanner is successfully
220 * constructed, it holds a read lock until it is closed. A close takes out a
221 * write lock and consequently will block for ongoing operations and will block
222 * new operations from starting while the close is in progress.
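 *
 * <p>Illustrative sketch only (plain JDK locks rather than HRegion's actual fields or API) of
 * the close-versus-operation protocol described above:
 * <pre>{@code
 * ReentrantReadWriteLock regionLock = new ReentrantReadWriteLock();
 * // every row-level operation:
 * regionLock.readLock().lock();   // a per-row lock is also taken, not shown here
 * try {
 *   applyMutation();              // placeholder for the real work
 * } finally {
 *   regionLock.readLock().unlock();
 * }
 * // close():
 * regionLock.writeLock().lock();  // waits for in-flight operations and blocks new ones
 * }</pre>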
223 */
224@SuppressWarnings("deprecation")
225@InterfaceAudience.Private
226public class HRegion implements HeapSize, PropagatingConfigurationObserver, Region {
227  private static final Logger LOG = LoggerFactory.getLogger(HRegion.class);
228
229  public static final String LOAD_CFS_ON_DEMAND_CONFIG_KEY =
230    "hbase.hregion.scan.loadColumnFamiliesOnDemand";
231
232  public static final String HBASE_MAX_CELL_SIZE_KEY = "hbase.server.keyvalue.maxsize";
233  public static final int DEFAULT_MAX_CELL_SIZE = 10485760;
234
235  public static final String HBASE_REGIONSERVER_MINIBATCH_SIZE =
236      "hbase.regionserver.minibatch.size";
237  public static final int DEFAULT_HBASE_REGIONSERVER_MINIBATCH_SIZE = 20000;
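  // Illustrative sketch (not part of this class): the keys above are ordinary Configuration
  // settings and can be set in hbase-site.xml or programmatically, e.g.
  //   Configuration conf = HBaseConfiguration.create();
  //   conf.setLong(HBASE_MAX_CELL_SIZE_KEY, 10L * 1024 * 1024); // cap individual cells at 10 MB
  //   conf.setInt(HBASE_REGIONSERVER_MINIBATCH_SIZE, 20000);    // mutations per mini-batch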
238
239  /**
240   * This is the global default value for durability. All tables/mutations not
241   * defining a durability or using USE_DEFAULT will default to this value.
242   */
243  private static final Durability DEFAULT_DURABILITY = Durability.SYNC_WAL;
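  // Illustrative note: a client-side mutation that does not pick a durability, e.g.
  //   Put p = new Put(row);                      // 'row' is assumed to be a byte[]
  //   p.setDurability(Durability.USE_DEFAULT);   // or simply never calling setDurability
  // falls back to this default (SYNC_WAL) unless the table descriptor overrides it.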
244
245  final AtomicBoolean closed = new AtomicBoolean(false);
246
  /* Closing can take some time; use the closing flag if there is stuff we don't
   * want to do while in the closing state; e.g. offering this region up to the
   * master as a region to close if the carrying regionserver is overloaded.
   * Once set, it is never cleared.
   */
252  final AtomicBoolean closing = new AtomicBoolean(false);
253
  /**
   * The max sequence id of flushed data on this region. There is no edit in memory that is
   * less than this sequence id.
   */
258  private volatile long maxFlushedSeqId = HConstants.NO_SEQNUM;
259
260  /**
261   * Record the sequence id of last flush operation. Can be in advance of
262   * {@link #maxFlushedSeqId} when flushing a single column family. In this case,
263   * {@link #maxFlushedSeqId} will be older than the oldest edit in memory.
264   */
265  private volatile long lastFlushOpSeqId = HConstants.NO_SEQNUM;
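  // Worked example (illustrative): if column family A is flushed by a flush whose operation
  // sequence id is 120 while column family B still holds unflushed edits starting at seq id 100,
  // lastFlushOpSeqId becomes 120 but maxFlushedSeqId must stay at 99, the highest id that is
  // durable for every store.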
266
  /**
   * The sequence id of the last replayed open region event from the primary region. This is used
   * to skip entries before it, since replayed edits may arrive out of order from replication.
   */
272  protected volatile long lastReplayedOpenRegionSeqId = -1L;
273  protected volatile long lastReplayedCompactionSeqId = -1L;
274
275  //////////////////////////////////////////////////////////////////////////////
276  // Members
277  //////////////////////////////////////////////////////////////////////////////
278
279  // map from a locked row to the context for that lock including:
280  // - CountDownLatch for threads waiting on that row
281  // - the thread that owns the lock (allow reentrancy)
282  // - reference count of (reentrant) locks held by the thread
283  // - the row itself
284  private final ConcurrentHashMap<HashedBytes, RowLockContext> lockedRows =
285      new ConcurrentHashMap<>();
286
287  protected final Map<byte[], HStore> stores =
288      new ConcurrentSkipListMap<>(Bytes.BYTES_RAWCOMPARATOR);
289
290  // TODO: account for each registered handler in HeapSize computation
291  private Map<String, com.google.protobuf.Service> coprocessorServiceHandlers = Maps.newHashMap();
292
293  // Track data size in all memstores
294  private final MemStoreSizing memStoreSizing = new ThreadSafeMemStoreSizing();
295  @VisibleForTesting
296  RegionServicesForStores regionServicesForStores;
297
298  // Debug possible data loss due to WAL off
299  final LongAdder numMutationsWithoutWAL = new LongAdder();
300  final LongAdder dataInMemoryWithoutWAL = new LongAdder();
301
302  // Debug why CAS operations are taking a while.
303  final LongAdder checkAndMutateChecksPassed = new LongAdder();
304  final LongAdder checkAndMutateChecksFailed = new LongAdder();
305
306  // Number of requests
307  // Count rows for scan
308  final LongAdder readRequestsCount = new LongAdder();
309  final LongAdder filteredReadRequestsCount = new LongAdder();
310  // Count rows for multi row mutations
311  final LongAdder writeRequestsCount = new LongAdder();
312
313  // Number of requests blocked by memstore size.
314  private final LongAdder blockedRequestsCount = new LongAdder();
315
316  // Compaction LongAdders
317  final LongAdder compactionsFinished = new LongAdder();
318  final LongAdder compactionsFailed = new LongAdder();
319  final LongAdder compactionNumFilesCompacted = new LongAdder();
320  final LongAdder compactionNumBytesCompacted = new LongAdder();
321  final LongAdder compactionsQueued = new LongAdder();
322  final LongAdder flushesQueued = new LongAdder();
323
324  private final WAL wal;
325  private final HRegionFileSystem fs;
326  protected final Configuration conf;
327  private final Configuration baseConf;
328  private final int rowLockWaitDuration;
329  static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000;
330
331  private Path regionDir;
332  private FileSystem walFS;
333
334  // The internal wait duration to acquire a lock before read/update
335  // from the region. It is not per row. The purpose of this wait time
336  // is to avoid waiting a long time while the region is busy, so that
337  // we can release the IPC handler soon enough to improve the
338  // availability of the region server. It can be adjusted by
339  // tuning configuration "hbase.busy.wait.duration".
340  final long busyWaitDuration;
341  static final long DEFAULT_BUSY_WAIT_DURATION = HConstants.DEFAULT_HBASE_RPC_TIMEOUT;
342
343  // If updating multiple rows in one call, wait longer,
344  // i.e. waiting for busyWaitDuration * # of rows. However,
345  // we can limit the max multiplier.
346  final int maxBusyWaitMultiplier;
347
  // Max busy wait duration. There is no point in waiting longer than the RPC
  // purge timeout, after which the RPC call will be terminated by the RPC engine.
350  final long maxBusyWaitDuration;
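  // Worked example (illustrative): with hbase.busy.wait.duration=1000 ms and
  // hbase.busy.wait.multiplier.max=2, a call touching 5 rows waits roughly
  // min(1000 * 5, 1000 * 2) = 2000 ms at most, further capped by maxBusyWaitDuration.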
351
352  // Max cell size. If nonzero, the maximum allowed size for any given cell
353  // in bytes
354  final long maxCellSize;
355
356  // Number of mutations for minibatch processing.
357  private final int miniBatchSize;
358
359  // negative number indicates infinite timeout
360  static final long DEFAULT_ROW_PROCESSOR_TIMEOUT = 60 * 1000L;
361  final ExecutorService rowProcessorExecutor = Executors.newCachedThreadPool();
362
363  private final ConcurrentHashMap<RegionScanner, Long> scannerReadPoints;
364
  /**
   * The sequence ID that was encountered when this region was opened.
   */
368  private long openSeqNum = HConstants.NO_SEQNUM;
369
370  /**
371   * The default setting for whether to enable on-demand CF loading for
372   * scan requests to this region. Requests can override it.
373   */
374  private boolean isLoadingCfsOnDemandDefault = false;
375
376  private final AtomicInteger majorInProgress = new AtomicInteger(0);
377  private final AtomicInteger minorInProgress = new AtomicInteger(0);
378
379  //
  // Context: During replay we want to ensure that we do not lose any data. So, we
  // have to be conservative in how we replay WALs. For each store, we calculate
  // the maxSeqId up to which the store was flushed, and skip the edits which
  // are equal to or lower than maxSeqId for each store.
  // The following map is populated when opening the region
385  Map<byte[], Long> maxSeqIdInStores = new TreeMap<>(Bytes.BYTES_COMPARATOR);
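  // Worked example (illustrative): if this map records that family "cf1" was flushed up to
  // sequence id 150, any replayed WAL edit for "cf1" with sequence id <= 150 is skipped, since
  // that data is already present in the store files.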
386
387  /** Saved state from replaying prepare flush cache */
388  private PrepareFlushResult prepareFlushResult = null;
389
390  private volatile Optional<ConfigurationManager> configurationManager;
391
392  // Used for testing.
393  private volatile Long timeoutForWriteLock = null;
394
  /**
   * @return The smallest mvcc readPoint across all the scanners in this
   * region. Writes older than this readPoint are included in every
   * read operation.
   */
400  public long getSmallestReadPoint() {
401    long minimumReadPoint;
402    // We need to ensure that while we are calculating the smallestReadPoint
403    // no new RegionScanners can grab a readPoint that we are unaware of.
404    // We achieve this by synchronizing on the scannerReadPoints object.
405    synchronized (scannerReadPoints) {
406      minimumReadPoint = mvcc.getReadPoint();
407      for (Long readPoint : this.scannerReadPoints.values()) {
408        if (readPoint < minimumReadPoint) {
409          minimumReadPoint = readPoint;
410        }
411      }
412    }
413    return minimumReadPoint;
414  }
415
  /*
   * Data structure of write state flags used for coordinating flushes,
   * compactions and closes.
   */
420  static class WriteState {
421    // Set while a memstore flush is happening.
422    volatile boolean flushing = false;
423    // Set when a flush has been requested.
424    volatile boolean flushRequested = false;
425    // Number of compactions running.
426    AtomicInteger compacting = new AtomicInteger(0);
    // Cleared in close. Once cleared, we can no longer compact or flush.
428    volatile boolean writesEnabled = true;
429    // Set if region is read-only
430    volatile boolean readOnly = false;
    // whether the reads are enabled. This is different from readOnly, because readOnly is
    // static in the lifetime of the region, while readsEnabled is dynamic
433    volatile boolean readsEnabled = true;
434
435    /**
436     * Set flags that make this region read-only.
437     *
438     * @param onOff flip value for region r/o setting
439     */
440    synchronized void setReadOnly(final boolean onOff) {
441      this.writesEnabled = !onOff;
442      this.readOnly = onOff;
443    }
444
445    boolean isReadOnly() {
446      return this.readOnly;
447    }
448
449    boolean isFlushRequested() {
450      return this.flushRequested;
451    }
452
453    void setReadsEnabled(boolean readsEnabled) {
454      this.readsEnabled = readsEnabled;
455    }
456
457    static final long HEAP_SIZE = ClassSize.align(
458        ClassSize.OBJECT + 5 * Bytes.SIZEOF_BOOLEAN);
459  }
460
461  /**
   * Objects from this class are created when flushing to describe all the different states that
   * the flush method ends up in. The Result enum describes those states. The sequence id should only
464   * be specified if the flush was successful, and the failure message should only be specified
465   * if it didn't flush.
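   *
   * <p>Illustrative usage sketch (the flush entry point shown is hypothetical; only the
   * FlushResult checks are real):
   * <pre>{@code
   * FlushResult fr = doFlush();  // placeholder for however the flush is invoked
   * if (fr.isFlushSucceeded() && fr.isCompactionNeeded()) {
   *   requestCompaction();       // hypothetical follow-up
   * }
   * }</pre>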
466   */
467  public static class FlushResultImpl implements FlushResult {
468    final Result result;
469    final String failureReason;
470    final long flushSequenceId;
471    final boolean wroteFlushWalMarker;
472
    /**
     * Convenience constructor to use when the flush is successful; the failure message is set to
     * null.
476     * @param result Expecting FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
477     * @param flushSequenceId Generated sequence id that comes right after the edits in the
478     *                        memstores.
479     */
480    FlushResultImpl(Result result, long flushSequenceId) {
481      this(result, flushSequenceId, null, false);
482      assert result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
483          .FLUSHED_COMPACTION_NEEDED;
484    }
485
486    /**
487     * Convenience constructor to use when we cannot flush.
488     * @param result Expecting CANNOT_FLUSH_MEMSTORE_EMPTY or CANNOT_FLUSH.
489     * @param failureReason Reason why we couldn't flush.
490     */
491    FlushResultImpl(Result result, String failureReason, boolean wroteFlushMarker) {
492      this(result, -1, failureReason, wroteFlushMarker);
493      assert result == Result.CANNOT_FLUSH_MEMSTORE_EMPTY || result == Result.CANNOT_FLUSH;
494    }
495
496    /**
497     * Constructor with all the parameters.
     * @param result Any of the Result values.
499     * @param flushSequenceId Generated sequence id if the memstores were flushed else -1.
500     * @param failureReason Reason why we couldn't flush, or null.
501     */
502    FlushResultImpl(Result result, long flushSequenceId, String failureReason,
503      boolean wroteFlushMarker) {
504      this.result = result;
505      this.flushSequenceId = flushSequenceId;
506      this.failureReason = failureReason;
507      this.wroteFlushWalMarker = wroteFlushMarker;
508    }
509
510    /**
511     * Convenience method, the equivalent of checking if result is
     * FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
513     * @return true if the memstores were flushed, else false.
514     */
515    @Override
516    public boolean isFlushSucceeded() {
517      return result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
518          .FLUSHED_COMPACTION_NEEDED;
519    }
520
521    /**
522     * Convenience method, the equivalent of checking if result is FLUSHED_COMPACTION_NEEDED.
     * @return True if the flush requested a compaction, else false (a false return does not even
     * imply that a flush took place).
524     */
525    @Override
526    public boolean isCompactionNeeded() {
527      return result == Result.FLUSHED_COMPACTION_NEEDED;
528    }
529
530    @Override
531    public String toString() {
      return new StringBuilder()
        .append("flush result:").append(result).append(", ")
        .append("failureReason:").append(failureReason).append(", ")
        .append("flush seq id:").append(flushSequenceId).toString();
536    }
537
538    @Override
539    public Result getResult() {
540      return result;
541    }
542  }
543
544  /** A result object from prepare flush cache stage */
545  @VisibleForTesting
546  static class PrepareFlushResult {
547    final FlushResultImpl result; // indicating a failure result from prepare
548    final TreeMap<byte[], StoreFlushContext> storeFlushCtxs;
549    final TreeMap<byte[], List<Path>> committedFiles;
550    final TreeMap<byte[], MemStoreSize> storeFlushableSize;
551    final long startTime;
552    final long flushOpSeqId;
553    final long flushedSeqId;
554    final MemStoreSizing totalFlushableSize;
555
556    /** Constructs an early exit case */
557    PrepareFlushResult(FlushResultImpl result, long flushSeqId) {
558      this(result, null, null, null, Math.max(0, flushSeqId), 0, 0, MemStoreSizing.DUD);
559    }
560
561    /** Constructs a successful prepare flush result */
562    PrepareFlushResult(
563      TreeMap<byte[], StoreFlushContext> storeFlushCtxs,
564      TreeMap<byte[], List<Path>> committedFiles,
565      TreeMap<byte[], MemStoreSize> storeFlushableSize, long startTime, long flushSeqId,
566      long flushedSeqId, MemStoreSizing totalFlushableSize) {
567      this(null, storeFlushCtxs, committedFiles, storeFlushableSize, startTime,
568        flushSeqId, flushedSeqId, totalFlushableSize);
569    }
570
571    private PrepareFlushResult(
572        FlushResultImpl result,
573      TreeMap<byte[], StoreFlushContext> storeFlushCtxs,
574      TreeMap<byte[], List<Path>> committedFiles,
575      TreeMap<byte[], MemStoreSize> storeFlushableSize, long startTime, long flushSeqId,
576      long flushedSeqId, MemStoreSizing totalFlushableSize) {
577      this.result = result;
578      this.storeFlushCtxs = storeFlushCtxs;
579      this.committedFiles = committedFiles;
580      this.storeFlushableSize = storeFlushableSize;
581      this.startTime = startTime;
582      this.flushOpSeqId = flushSeqId;
583      this.flushedSeqId = flushedSeqId;
584      this.totalFlushableSize = totalFlushableSize;
585    }
586
587    public FlushResult getResult() {
588      return this.result;
589    }
590  }
591
592  /**
593   * A class that tracks exceptions that have been observed in one batch. Not thread safe.
594   */
595  static class ObservedExceptionsInBatch {
596    private boolean wrongRegion = false;
597    private boolean failedSanityCheck = false;
598    private boolean wrongFamily = false;
599
600    /**
601     * @return If a {@link WrongRegionException} has been observed.
602     */
603    boolean hasSeenWrongRegion() {
604      return wrongRegion;
605    }
606
607    /**
608     * Records that a {@link WrongRegionException} has been observed.
609     */
610    void sawWrongRegion() {
611      wrongRegion = true;
612    }
613
614    /**
615     * @return If a {@link FailedSanityCheckException} has been observed.
616     */
617    boolean hasSeenFailedSanityCheck() {
618      return failedSanityCheck;
619    }
620
621    /**
622     * Records that a {@link FailedSanityCheckException} has been observed.
623     */
624    void sawFailedSanityCheck() {
625      failedSanityCheck = true;
626    }
627
628    /**
629     * @return If a {@link NoSuchColumnFamilyException} has been observed.
630     */
631    boolean hasSeenNoSuchFamily() {
632      return wrongFamily;
633    }
634
635    /**
636     * Records that a {@link NoSuchColumnFamilyException} has been observed.
637     */
638    void sawNoSuchFamily() {
639      wrongFamily = true;
640    }
641  }
642
643  final WriteState writestate = new WriteState();
644
645  long memstoreFlushSize;
646  final long timestampSlop;
647  final long rowProcessorTimeout;
648
  // Last flush time for each Store. Useful when we are flushing on a per-column-family basis.
650  private final ConcurrentMap<HStore, Long> lastStoreFlushTimeMap = new ConcurrentHashMap<>();
651
652  final RegionServerServices rsServices;
653  private RegionServerAccounting rsAccounting;
654  private long flushCheckInterval;
655  // flushPerChanges is to prevent too many changes in memstore
656  private long flushPerChanges;
657  private long blockingMemStoreSize;
658  // Used to guard closes
659  final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
660
661  // Stop updates lock
662  private final ReentrantReadWriteLock updatesLock = new ReentrantReadWriteLock();
663  private boolean splitRequest;
664  private byte[] explicitSplitPoint = null;
665
666  private final MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl();
667
668  // Coprocessor host
669  private RegionCoprocessorHost coprocessorHost;
670
671  private TableDescriptor htableDescriptor = null;
672  private RegionSplitPolicy splitPolicy;
673  private FlushPolicy flushPolicy;
674
675  private final MetricsRegion metricsRegion;
676  private final MetricsRegionWrapperImpl metricsRegionWrapper;
677  private final Durability regionDurability;
678  private final boolean regionStatsEnabled;
679  // Stores the replication scope of the various column families of the table
680  // that has non-default scope
681  private final NavigableMap<byte[], Integer> replicationScope = new TreeMap<>(
682      Bytes.BYTES_COMPARATOR);
683
684  /**
685   * HRegion constructor. This constructor should only be used for testing and
686   * extensions.  Instances of HRegion should be instantiated with the
687   * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
688   *
689   * @param tableDir qualified path of directory where region should be located,
690   * usually the table directory.
691   * @param wal The WAL is the outbound log for any updates to the HRegion
692   * The wal file is a logfile from the previous execution that's
693   * custom-computed for this HRegion. The HRegionServer computes and sorts the
694   * appropriate wal info for this HRegion. If there is a previous wal file
695   * (implying that the HRegion has been written-to before), then read it from
696   * the supplied path.
697   * @param fs is the filesystem.
698   * @param confParam is global configuration settings.
   * @param regionInfo RegionInfo that describes the region
701   * @param htd the table descriptor
702   * @param rsServices reference to {@link RegionServerServices} or null
703   * @deprecated Use other constructors.
704   */
705  @Deprecated
706  @VisibleForTesting
707  public HRegion(final Path tableDir, final WAL wal, final FileSystem fs,
708      final Configuration confParam, final RegionInfo regionInfo,
709      final TableDescriptor htd, final RegionServerServices rsServices) {
710    this(new HRegionFileSystem(confParam, fs, tableDir, regionInfo),
711      wal, confParam, htd, rsServices);
712  }
713
714  /**
715   * HRegion constructor. This constructor should only be used for testing and
716   * extensions.  Instances of HRegion should be instantiated with the
717   * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
718   *
719   * @param fs is the filesystem.
720   * @param wal The WAL is the outbound log for any updates to the HRegion
721   * The wal file is a logfile from the previous execution that's
722   * custom-computed for this HRegion. The HRegionServer computes and sorts the
723   * appropriate wal info for this HRegion. If there is a previous wal file
724   * (implying that the HRegion has been written-to before), then read it from
725   * the supplied path.
726   * @param confParam is global configuration settings.
727   * @param htd the table descriptor
728   * @param rsServices reference to {@link RegionServerServices} or null
729   */
730  public HRegion(final HRegionFileSystem fs, final WAL wal, final Configuration confParam,
731      final TableDescriptor htd, final RegionServerServices rsServices) {
732    if (htd == null) {
733      throw new IllegalArgumentException("Need table descriptor");
734    }
735
736    if (confParam instanceof CompoundConfiguration) {
737      throw new IllegalArgumentException("Need original base configuration");
738    }
739
740    this.wal = wal;
741    this.fs = fs;
742
743    // 'conf' renamed to 'confParam' b/c we use this.conf in the constructor
744    this.baseConf = confParam;
745    this.conf = new CompoundConfiguration()
746      .add(confParam)
747      .addBytesMap(htd.getValues());
748    this.flushCheckInterval = conf.getInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL,
749        DEFAULT_CACHE_FLUSH_INTERVAL);
750    this.flushPerChanges = conf.getLong(MEMSTORE_FLUSH_PER_CHANGES, DEFAULT_FLUSH_PER_CHANGES);
751    if (this.flushPerChanges > MAX_FLUSH_PER_CHANGES) {
752      throw new IllegalArgumentException(MEMSTORE_FLUSH_PER_CHANGES + " can not exceed "
753          + MAX_FLUSH_PER_CHANGES);
754    }
755    this.rowLockWaitDuration = conf.getInt("hbase.rowlock.wait.duration",
756                    DEFAULT_ROWLOCK_WAIT_DURATION);
757
758    this.isLoadingCfsOnDemandDefault = conf.getBoolean(LOAD_CFS_ON_DEMAND_CONFIG_KEY, true);
759    this.htableDescriptor = htd;
760    Set<byte[]> families = this.htableDescriptor.getColumnFamilyNames();
761    for (byte[] family : families) {
762      if (!replicationScope.containsKey(family)) {
763        int scope = htd.getColumnFamily(family).getScope();
        // Only store those families that have a NON-DEFAULT scope
765        if (scope != REPLICATION_SCOPE_LOCAL) {
766          // Do a copy before storing it here.
767          replicationScope.put(Bytes.copy(family), scope);
768        }
769      }
770    }
771    this.rsServices = rsServices;
772    this.regionServicesForStores = new RegionServicesForStores(this, rsServices);
773    setHTableSpecificConf();
774    this.scannerReadPoints = new ConcurrentHashMap<>();
775
776    this.busyWaitDuration = conf.getLong(
777      "hbase.busy.wait.duration", DEFAULT_BUSY_WAIT_DURATION);
778    this.maxBusyWaitMultiplier = conf.getInt("hbase.busy.wait.multiplier.max", 2);
779    if (busyWaitDuration * maxBusyWaitMultiplier <= 0L) {
780      throw new IllegalArgumentException("Invalid hbase.busy.wait.duration ("
781        + busyWaitDuration + ") or hbase.busy.wait.multiplier.max ("
782        + maxBusyWaitMultiplier + "). Their product should be positive");
783    }
784    this.maxBusyWaitDuration = conf.getLong("hbase.ipc.client.call.purge.timeout",
785      2 * HConstants.DEFAULT_HBASE_RPC_TIMEOUT);
786
787    /*
788     * timestamp.slop provides a server-side constraint on the timestamp. This
789     * assumes that you base your TS around currentTimeMillis(). In this case,
790     * throw an error to the user if the user-specified TS is newer than now +
     * slop. A value of LATEST_TIMESTAMP disables this check.
792     */
793    this.timestampSlop = conf.getLong(
794        "hbase.hregion.keyvalue.timestamp.slop.millisecs",
795        HConstants.LATEST_TIMESTAMP);
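    // Worked example (illustrative): with a slop of 60000 ms, a mutation carrying a timestamp
    // more than one minute ahead of the server clock is rejected; with the default of
    // LATEST_TIMESTAMP the check is effectively disabled.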
796
797    /**
798     * Timeout for the process time in processRowsWithLocks().
799     * Use -1 to switch off time bound.
800     */
801    this.rowProcessorTimeout = conf.getLong(
802        "hbase.hregion.row.processor.timeout", DEFAULT_ROW_PROCESSOR_TIMEOUT);
803    this.regionDurability = htd.getDurability() == Durability.USE_DEFAULT ?
804        DEFAULT_DURABILITY : htd.getDurability();
805
806    decorateRegionConfiguration(conf);
807    if (rsServices != null) {
808      this.rsAccounting = this.rsServices.getRegionServerAccounting();
809      // don't initialize coprocessors if not running within a regionserver
810      // TODO: revisit if coprocessors should load in other cases
811      this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf);
812      this.metricsRegionWrapper = new MetricsRegionWrapperImpl(this);
813      this.metricsRegion = new MetricsRegion(this.metricsRegionWrapper);
814    } else {
815      this.metricsRegionWrapper = null;
816      this.metricsRegion = null;
817    }
818    if (LOG.isDebugEnabled()) {
819      // Write out region name as string and its encoded name.
820      LOG.debug("Instantiated " + this);
821    }
822
823    configurationManager = Optional.empty();
824
    // disable stats tracking for system tables, but check the config for everything else
826    this.regionStatsEnabled = htd.getTableName().getNamespaceAsString().equals(
827        NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR) ?
828          false :
829          conf.getBoolean(HConstants.ENABLE_CLIENT_BACKPRESSURE,
830              HConstants.DEFAULT_ENABLE_CLIENT_BACKPRESSURE);
831
832    this.maxCellSize = conf.getLong(HBASE_MAX_CELL_SIZE_KEY, DEFAULT_MAX_CELL_SIZE);
833    this.miniBatchSize = conf.getInt(HBASE_REGIONSERVER_MINIBATCH_SIZE,
834        DEFAULT_HBASE_REGIONSERVER_MINIBATCH_SIZE);
835  }
836
837  void setHTableSpecificConf() {
838    if (this.htableDescriptor == null) return;
839    long flushSize = this.htableDescriptor.getMemStoreFlushSize();
840
841    if (flushSize <= 0) {
842      flushSize = conf.getLong(HConstants.HREGION_MEMSTORE_FLUSH_SIZE,
843          TableDescriptorBuilder.DEFAULT_MEMSTORE_FLUSH_SIZE);
844    }
845    this.memstoreFlushSize = flushSize;
846    long mult = conf.getLong(HConstants.HREGION_MEMSTORE_BLOCK_MULTIPLIER,
847        HConstants.DEFAULT_HREGION_MEMSTORE_BLOCK_MULTIPLIER);
848    this.blockingMemStoreSize = this.memstoreFlushSize * mult;
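    // Worked example (illustrative): with a 128 MB flush size and a block multiplier of 4,
    // updates to this region are blocked once its memstore reaches 512 MB.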
849  }
850
851  /**
852   * Initialize this region.
853   * Used only by tests and SplitTransaction to reopen the region.
854   * You should use createHRegion() or openHRegion()
855   * @return What the next sequence (edit) id should be.
856   * @throws IOException e
857   * @deprecated use HRegion.createHRegion() or HRegion.openHRegion()
858   */
859  @Deprecated
860  public long initialize() throws IOException {
861    return initialize(null);
862  }
863
864  /**
865   * Initialize this region.
866   *
867   * @param reporter Tickle every so often if initialize is taking a while.
868   * @return What the next sequence (edit) id should be.
869   * @throws IOException e
870   */
871  @VisibleForTesting
872  long initialize(final CancelableProgressable reporter) throws IOException {
873
874    //Refuse to open the region if there is no column family in the table
875    if (htableDescriptor.getColumnFamilyCount() == 0) {
876      throw new DoNotRetryIOException("Table " + htableDescriptor.getTableName().getNameAsString()+
877          " should have at least one column family.");
878    }
879
880    MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this);
881    long nextSeqId = -1;
882    try {
883      nextSeqId = initializeRegionInternals(reporter, status);
884      return nextSeqId;
885    } catch (IOException e) {
886      LOG.warn("Failed initialize of region= {}, starting to roll back memstore",
887          getRegionInfo().getRegionNameAsString(), e);
888      // global memstore size will be decreased when dropping memstore
889      try {
890        //drop the memory used by memstore if open region fails
891        dropMemStoreContents();
892      } catch (IOException ioE) {
893        if (conf.getBoolean(MemStoreLAB.USEMSLAB_KEY, MemStoreLAB.USEMSLAB_DEFAULT)) {
894          LOG.warn("Failed drop memstore of region= {}, "
895                  + "some chunks may not released forever since MSLAB is enabled",
896              getRegionInfo().getRegionNameAsString());
897        }
898
899      }
900      throw e;
901    } finally {
      // nextSeqId will be -1 if the initialization fails.
      // Otherwise it will be at least 0.
904      if (nextSeqId == -1) {
905        status.abort("Exception during region " + getRegionInfo().getRegionNameAsString() +
906          " initialization.");
907      }
908    }
909  }
910
911  private long initializeRegionInternals(final CancelableProgressable reporter,
912      final MonitoredTask status) throws IOException {
913    if (coprocessorHost != null) {
914      status.setStatus("Running coprocessor pre-open hook");
915      coprocessorHost.preOpen();
916    }
917
918    // Write HRI to a file in case we need to recover hbase:meta
919    // Only the primary replica should write .regioninfo
920    if (this.getRegionInfo().getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID) {
921      status.setStatus("Writing region info on filesystem");
922      fs.checkRegionInfoOnFilesystem();
923    }
924
925    // Initialize all the HStores
926    status.setStatus("Initializing all the Stores");
927    long maxSeqId = initializeStores(reporter, status);
928    this.mvcc.advanceTo(maxSeqId);
929    if (ServerRegionReplicaUtil.shouldReplayRecoveredEdits(this)) {
930      Collection<HStore> stores = this.stores.values();
931      try {
932        // update the stores that we are replaying
933        stores.forEach(HStore::startReplayingFromWAL);
934        // Recover any edits if available.
935        maxSeqId = Math.max(maxSeqId,
936          replayRecoveredEditsIfAny(maxSeqIdInStores, reporter, status));
937        // Make sure mvcc is up to max.
938        this.mvcc.advanceTo(maxSeqId);
939      } finally {
940        // update the stores that we are done replaying
941        stores.forEach(HStore::stopReplayingFromWAL);
942      }
943    }
944    this.lastReplayedOpenRegionSeqId = maxSeqId;
945
946    this.writestate.setReadOnly(ServerRegionReplicaUtil.isReadOnly(this));
947    this.writestate.flushRequested = false;
948    this.writestate.compacting.set(0);
949
950    if (this.writestate.writesEnabled) {
951      // Remove temporary data left over from old regions
952      status.setStatus("Cleaning up temporary data from old regions");
953      fs.cleanupTempDir();
954    }
955
956    if (this.writestate.writesEnabled) {
957      status.setStatus("Cleaning up detritus from prior splits");
958      // Get rid of any splits or merges that were lost in-progress.  Clean out
959      // these directories here on open.  We may be opening a region that was
960      // being split but we crashed in the middle of it all.
961      fs.cleanupAnySplitDetritus();
962      fs.cleanupMergesDir();
963    }
964
965    // Initialize split policy
966    this.splitPolicy = RegionSplitPolicy.create(this, conf);
967
968    // Initialize flush policy
969    this.flushPolicy = FlushPolicyFactory.create(this, conf);
970
971    long lastFlushTime = EnvironmentEdgeManager.currentTime();
972    for (HStore store: stores.values()) {
973      this.lastStoreFlushTimeMap.put(store, lastFlushTime);
974    }
975
976    // Use maximum of log sequenceid or that which was found in stores
977    // (particularly if no recovered edits, seqid will be -1).
978    // always get openSeqNum from the default replica, even if we are secondary replicas
979    long maxSeqIdFromFile = WALSplitter.getMaxRegionSequenceId(conf,
980      RegionReplicaUtil.getRegionInfoForDefaultReplica(getRegionInfo()), this::getFilesystem,
981      this::getWalFileSystem);
982    long nextSeqId = Math.max(maxSeqId, maxSeqIdFromFile) + 1;
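    // Worked example (illustrative): if the stores and recovered edits yield maxSeqId=120 while
    // the max sequence id file on the WAL filesystem says 150, the region opens with
    // nextSeqId=151.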
    // The openSeqNum will always increase even for a read only region, as we rely on it to
    // determine whether a region has been successfully reopened, so here we always need to update
    // the max sequence id file.
986    if (RegionReplicaUtil.isDefaultReplica(getRegionInfo())) {
987      LOG.debug("writing seq id for {}", this.getRegionInfo().getEncodedName());
988      WALSplitter.writeRegionSequenceIdFile(fs.getFileSystem(), getWALRegionDir(), nextSeqId);
989      // This means we have replayed all the recovered edits and also written out the max sequence
990      // id file, let's delete the wrong directories introduced in HBASE-20734, see HBASE-22617
991      // for more details.
992      Path wrongRegionWALDir = FSUtils.getWrongWALRegionDir(conf, getRegionInfo().getTable(),
993        getRegionInfo().getEncodedName());
994      FileSystem walFs = getWalFileSystem();
995      if (walFs.exists(wrongRegionWALDir)) {
996        if (!walFs.delete(wrongRegionWALDir, true)) {
997          LOG.debug("Failed to clean up wrong region WAL directory {}", wrongRegionWALDir);
998        }
999      }
1000    }
1001
1002    LOG.info("Opened {}; next sequenceid={}", this.getRegionInfo().getShortNameToLog(), nextSeqId);
1003
1004    // A region can be reopened if failed a split; reset flags
1005    this.closing.set(false);
1006    this.closed.set(false);
1007
1008    if (coprocessorHost != null) {
1009      status.setStatus("Running coprocessor post-open hooks");
1010      coprocessorHost.postOpen();
1011    }
1012
1013    status.markComplete("Region opened successfully");
1014    return nextSeqId;
1015  }
1016
  /**
   * Open all Stores.
   * @param reporter Tickle every so often if store opening is taking a while.
   * @param status MonitoredTask used to report progress.
   * @return Highest sequence id found in any Store.
   * @throws IOException if a store cannot be opened
   */
1024  private long initializeStores(CancelableProgressable reporter, MonitoredTask status)
1025      throws IOException {
1026    // Load in all the HStores.
1027    long maxSeqId = -1;
1028    // initialized to -1 so that we pick up MemstoreTS from column families
1029    long maxMemstoreTS = -1;
1030
1031    if (htableDescriptor.getColumnFamilyCount() != 0) {
1032      // initialize the thread pool for opening stores in parallel.
1033      ThreadPoolExecutor storeOpenerThreadPool =
1034        getStoreOpenAndCloseThreadPool("StoreOpener-" + this.getRegionInfo().getShortNameToLog());
1035      CompletionService<HStore> completionService = new ExecutorCompletionService<>(storeOpenerThreadPool);
1036
1037      // initialize each store in parallel
1038      for (final ColumnFamilyDescriptor family : htableDescriptor.getColumnFamilies()) {
1039        status.setStatus("Instantiating store for column family " + family);
1040        completionService.submit(new Callable<HStore>() {
1041          @Override
1042          public HStore call() throws IOException {
1043            return instantiateHStore(family);
1044          }
1045        });
1046      }
1047      boolean allStoresOpened = false;
1048      boolean hasSloppyStores = false;
1049      try {
1050        for (int i = 0; i < htableDescriptor.getColumnFamilyCount(); i++) {
1051          Future<HStore> future = completionService.take();
1052          HStore store = future.get();
1053          this.stores.put(store.getColumnFamilyDescriptor().getName(), store);
1054          if (store.isSloppyMemStore()) {
1055            hasSloppyStores = true;
1056          }
1057
1058          long storeMaxSequenceId = store.getMaxSequenceId().orElse(0L);
1059          maxSeqIdInStores.put(Bytes.toBytes(store.getColumnFamilyName()),
1060              storeMaxSequenceId);
1061          if (maxSeqId == -1 || storeMaxSequenceId > maxSeqId) {
1062            maxSeqId = storeMaxSequenceId;
1063          }
1064          long maxStoreMemstoreTS = store.getMaxMemStoreTS().orElse(0L);
1065          if (maxStoreMemstoreTS > maxMemstoreTS) {
1066            maxMemstoreTS = maxStoreMemstoreTS;
1067          }
1068        }
1069        allStoresOpened = true;
1070        if(hasSloppyStores) {
1071          htableDescriptor = TableDescriptorBuilder.newBuilder(htableDescriptor)
1072                  .setFlushPolicyClassName(FlushNonSloppyStoresFirstPolicy.class.getName())
1073                  .build();
1074          LOG.info("Setting FlushNonSloppyStoresFirstPolicy for the region=" + this);
1075        }
1076      } catch (InterruptedException e) {
1077        throw (InterruptedIOException)new InterruptedIOException().initCause(e);
1078      } catch (ExecutionException e) {
1079        throw new IOException(e.getCause());
1080      } finally {
1081        storeOpenerThreadPool.shutdownNow();
1082        if (!allStoresOpened) {
1083          // something went wrong, close all opened stores
1084          LOG.error("Could not initialize all stores for the region=" + this);
1085          for (HStore store : this.stores.values()) {
1086            try {
1087              store.close();
1088            } catch (IOException e) {
1089              LOG.warn("close store failed", e);
1090            }
1091          }
1092        }
1093      }
1094    }
1095    return Math.max(maxSeqId, maxMemstoreTS + 1);
1096  }
1097
1098  private void initializeWarmup(final CancelableProgressable reporter) throws IOException {
1099    MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this);
1100    // Initialize all the HStores
1101    status.setStatus("Warming up all the Stores");
1102    try {
1103      initializeStores(reporter, status);
1104    } finally {
1105      status.markComplete("Done warming up.");
1106    }
1107  }
1108
1109  /**
1110   * @return Map of StoreFiles by column family
1111   */
1112  private NavigableMap<byte[], List<Path>> getStoreFiles() {
1113    NavigableMap<byte[], List<Path>> allStoreFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR);
1114    for (HStore store : stores.values()) {
1115      Collection<HStoreFile> storeFiles = store.getStorefiles();
1116      if (storeFiles == null) {
1117        continue;
1118      }
1119      List<Path> storeFileNames = new ArrayList<>();
1120      for (HStoreFile storeFile : storeFiles) {
1121        storeFileNames.add(storeFile.getPath());
1122      }
1123      allStoreFiles.put(store.getColumnFamilyDescriptor().getName(), storeFileNames);
1124    }
1125    return allStoreFiles;
1126  }
1127
1128  private void writeRegionOpenMarker(WAL wal, long openSeqId) throws IOException {
1129    Map<byte[], List<Path>> storeFiles = getStoreFiles();
1130    RegionEventDescriptor regionOpenDesc = ProtobufUtil.toRegionEventDescriptor(
1131      RegionEventDescriptor.EventType.REGION_OPEN, getRegionInfo(), openSeqId,
1132      getRegionServerServices().getServerName(), storeFiles);
1133    WALUtil.writeRegionEventMarker(wal, getReplicationScope(), getRegionInfo(), regionOpenDesc,
1134        mvcc);
1135  }
1136
1137  private void writeRegionCloseMarker(WAL wal) throws IOException {
1138    Map<byte[], List<Path>> storeFiles = getStoreFiles();
1139    RegionEventDescriptor regionEventDesc = ProtobufUtil.toRegionEventDescriptor(
1140      RegionEventDescriptor.EventType.REGION_CLOSE, getRegionInfo(), mvcc.getReadPoint(),
1141      getRegionServerServices().getServerName(), storeFiles);
1142    WALUtil.writeRegionEventMarker(wal, getReplicationScope(), getRegionInfo(), regionEventDesc,
1143        mvcc);
1144
1145    // Store SeqId in WAL FileSystem when a region closes
    // Checking that the region folder exists is needed because many tests delete the table
    // folder while the table is still online
1148    if (getWalFileSystem().exists(getWALRegionDir())) {
1149      WALSplitter.writeRegionSequenceIdFile(getWalFileSystem(), getWALRegionDir(),
1150        mvcc.getReadPoint());
1151    }
1152  }
1153
1154  /**
1155   * @return True if this region has references.
1156   */
1157  public boolean hasReferences() {
1158    return stores.values().stream().anyMatch(HStore::hasReferences);
1159  }
1160
1161  public void blockUpdates() {
1162    this.updatesLock.writeLock().lock();
1163  }
1164
1165  public void unblockUpdates() {
1166    this.updatesLock.writeLock().unlock();
1167  }
1168
1169  public HDFSBlocksDistribution getHDFSBlocksDistribution() {
1170    HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution();
1171    stores.values().stream().filter(s -> s.getStorefiles() != null)
1172        .flatMap(s -> s.getStorefiles().stream()).map(HStoreFile::getHDFSBlockDistribution)
1173        .forEachOrdered(hdfsBlocksDistribution::add);
1174    return hdfsBlocksDistribution;
1175  }
1176
1177  /**
1178   * This is a helper function to compute HDFS block distribution on demand
1179   * @param conf configuration
1180   * @param tableDescriptor TableDescriptor of the table
   * @param regionInfo RegionInfo of the region
1182   * @return The HDFS blocks distribution for the given region.
1183   * @throws IOException
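   *
   * <p>Illustrative usage sketch ({@code getBlockLocalityIndex} is assumed here for reading the
   * result):
   * <pre>{@code
   * HDFSBlocksDistribution dist =
   *     HRegion.computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo);
   * float locality = dist.getBlockLocalityIndex(localHostName);
   * }</pre>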
1184   */
1185  public static HDFSBlocksDistribution computeHDFSBlocksDistribution(Configuration conf,
1186      TableDescriptor tableDescriptor, RegionInfo regionInfo) throws IOException {
1187    Path tablePath = FSUtils.getTableDir(FSUtils.getRootDir(conf), tableDescriptor.getTableName());
1188    return computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo, tablePath);
1189  }
1190
1191  /**
1192   * This is a helper function to compute HDFS block distribution on demand
1193   * @param conf configuration
1194   * @param tableDescriptor TableDescriptor of the table
   * @param regionInfo RegionInfo of the region
1196   * @param tablePath the table directory
1197   * @return The HDFS blocks distribution for the given region.
1198   * @throws IOException
1199   */
1200  public static HDFSBlocksDistribution computeHDFSBlocksDistribution(Configuration conf,
1201      TableDescriptor tableDescriptor, RegionInfo regionInfo, Path tablePath) throws IOException {
1202    HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution();
1203    FileSystem fs = tablePath.getFileSystem(conf);
1204
1205    HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tablePath, regionInfo);
1206    for (ColumnFamilyDescriptor family : tableDescriptor.getColumnFamilies()) {
1207      List<LocatedFileStatus> locatedFileStatusList = HRegionFileSystem
1208          .getStoreFilesLocatedStatus(regionFs, family.getNameAsString(), true);
1209      if (locatedFileStatusList == null) {
1210        continue;
1211      }
1212
1213      for (LocatedFileStatus status : locatedFileStatusList) {
1214        Path p = status.getPath();
1215        if (StoreFileInfo.isReference(p) || HFileLink.isHFileLink(p)) {
          // Only construct a StoreFileInfo object if it is not a plain HFile (i.e. a reference
          // or an HFileLink), to save object creation
1218          StoreFileInfo storeFileInfo = new StoreFileInfo(conf, fs, status);
1219          hdfsBlocksDistribution.add(storeFileInfo
1220              .computeHDFSBlocksDistribution(fs));
1221        } else if (StoreFileInfo.isHFile(p)) {
          // If it's an HFile, just add its block locations to the distribution;
          // let's not create more objects here, not even another HDFSBlocksDistribution
1224          FSUtils.addToHDFSBlocksDistribution(hdfsBlocksDistribution,
1225              status.getBlockLocations());
1226        } else {
1227          throw new IOException("path=" + p
1228              + " doesn't look like a valid StoreFile");
1229        }
1230      }
1231    }
1232    return hdfsBlocksDistribution;
1233  }
1234
  /**
   * Increase the size of the memstore in this region and the size of the global memstore.
   */
1239  void incMemStoreSize(MemStoreSize mss) {
1240    incMemStoreSize(mss.getDataSize(), mss.getHeapSize(), mss.getOffHeapSize(),
1241      mss.getCellsCount());
1242  }
1243
1244  void incMemStoreSize(long dataSizeDelta, long heapSizeDelta, long offHeapSizeDelta,
1245      int cellsCountDelta) {
1246    if (this.rsAccounting != null) {
1247      rsAccounting.incGlobalMemStoreSize(dataSizeDelta, heapSizeDelta, offHeapSizeDelta);
1248    }
1249    long dataSize = this.memStoreSizing.incMemStoreSize(dataSizeDelta, heapSizeDelta,
1250      offHeapSizeDelta, cellsCountDelta);
1251    checkNegativeMemStoreDataSize(dataSize, dataSizeDelta);
1252  }
1253
1254  void decrMemStoreSize(MemStoreSize mss) {
1255    decrMemStoreSize(mss.getDataSize(), mss.getHeapSize(), mss.getOffHeapSize(),
1256      mss.getCellsCount());
1257  }
1258
1259  void decrMemStoreSize(long dataSizeDelta, long heapSizeDelta, long offHeapSizeDelta,
1260      int cellsCountDelta) {
1261    if (this.rsAccounting != null) {
1262      rsAccounting.decGlobalMemStoreSize(dataSizeDelta, heapSizeDelta, offHeapSizeDelta);
1263    }
1264    long dataSize = this.memStoreSizing.decMemStoreSize(dataSizeDelta, heapSizeDelta,
1265      offHeapSizeDelta, cellsCountDelta);
1266    checkNegativeMemStoreDataSize(dataSize, -dataSizeDelta);
1267  }
1268
1269  private void checkNegativeMemStoreDataSize(long memStoreDataSize, long delta) {
1270    // This is extremely bad if we make memStoreSizing negative. Log as much info on the offending
1271    // caller as possible. (memStoreSizing might be a negative value already -- freeing memory)
1272    if (memStoreDataSize < 0) {
1273      LOG.error("Asked to modify this region's (" + this.toString()
1274          + ") memStoreSizing to a negative value which is incorrect. Current memStoreSizing="
1275          + (memStoreDataSize - delta) + ", delta=" + delta, new Exception());
1276    }
1277  }
1278
1279  @Override
1280  public RegionInfo getRegionInfo() {
1281    return this.fs.getRegionInfo();
1282  }
1283
1284  /**
1285   * @return Instance of {@link RegionServerServices} used by this HRegion.
1286   * Can be null.
1287   */
1288  RegionServerServices getRegionServerServices() {
1289    return this.rsServices;
1290  }
1291
1292  @Override
1293  public long getReadRequestsCount() {
1294    return readRequestsCount.sum();
1295  }
1296
1297  @Override
1298  public long getFilteredReadRequestsCount() {
1299    return filteredReadRequestsCount.sum();
1300  }
1301
1302  @Override
1303  public long getWriteRequestsCount() {
1304    return writeRequestsCount.sum();
1305  }
1306
1307  @Override
1308  public long getMemStoreDataSize() {
1309    return memStoreSizing.getDataSize();
1310  }
1311
1312  @Override
1313  public long getMemStoreHeapSize() {
1314    return memStoreSizing.getHeapSize();
1315  }
1316
1317  @Override
1318  public long getMemStoreOffHeapSize() {
1319    return memStoreSizing.getOffHeapSize();
1320  }
1321
1322  /** @return store services for this region, used to access services required at the store level */
1323  public RegionServicesForStores getRegionServicesForStores() {
1324    return regionServicesForStores;
1325  }
1326
1327  @Override
1328  public long getNumMutationsWithoutWAL() {
1329    return numMutationsWithoutWAL.sum();
1330  }
1331
1332  @Override
1333  public long getDataInMemoryWithoutWAL() {
1334    return dataInMemoryWithoutWAL.sum();
1335  }
1336
1337  @Override
1338  public long getBlockedRequestsCount() {
1339    return blockedRequestsCount.sum();
1340  }
1341
1342  @Override
1343  public long getCheckAndMutateChecksPassed() {
1344    return checkAndMutateChecksPassed.sum();
1345  }
1346
1347  @Override
1348  public long getCheckAndMutateChecksFailed() {
1349    return checkAndMutateChecksFailed.sum();
1350  }
1351
1352  // TODO: Need to check whether we should expose our metrics system to CPs. If CPs themselves are
1353  // doing the op and bypassing the core, this might be needed? Should we stop supporting the
1354  // bypass feature?
1355  public MetricsRegion getMetrics() {
1356    return metricsRegion;
1357  }
1358
1359  @Override
1360  public boolean isClosed() {
1361    return this.closed.get();
1362  }
1363
1364  @Override
1365  public boolean isClosing() {
1366    return this.closing.get();
1367  }
1368
1369  @Override
1370  public boolean isReadOnly() {
1371    return this.writestate.isReadOnly();
1372  }
1373
1374  @Override
1375  public boolean isAvailable() {
1376    return !isClosed() && !isClosing();
1377  }
1378
1379  @Override
1380  public boolean isSplittable() {
1381    return isAvailable() && !hasReferences();
1382  }
1383
1384  @Override
1385  public boolean isMergeable() {
1386    if (!isAvailable()) {
1387      LOG.debug("Region " + this
1388          + " is not mergeable because it is closing or closed");
1389      return false;
1390    }
1391    if (hasReferences()) {
1392      LOG.debug("Region " + this
1393          + " is not mergeable because it has references");
1394      return false;
1395    }
1396
1397    return true;
1398  }
1399
1400  public boolean areWritesEnabled() {
1401    synchronized(this.writestate) {
1402      return this.writestate.writesEnabled;
1403    }
1404  }
1405
1406  @VisibleForTesting
1407  public MultiVersionConcurrencyControl getMVCC() {
1408    return mvcc;
1409  }
1410
1411  @Override
1412  public long getMaxFlushedSeqId() {
1413    return maxFlushedSeqId;
1414  }
1415
1416  /**
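       * For example (illustrative only; {@code region} is assumed to be an open HRegion), a reader
       * that only wants committed data could use:
       * <pre>{@code
       * long readPoint = region.getReadPoint(IsolationLevel.READ_COMMITTED);
       * }</pre>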
1417   * @return readpoint considering given IsolationLevel. Pass {@code null} for default
1418   */
1419  public long getReadPoint(IsolationLevel isolationLevel) {
1420    if (isolationLevel != null && isolationLevel == IsolationLevel.READ_UNCOMMITTED) {
1421      // This scan can read even uncommitted transactions
1422      return Long.MAX_VALUE;
1423    }
1424    return mvcc.getReadPoint();
1425  }
1426
1427  public boolean isLoadingCfsOnDemandDefault() {
1428    return this.isLoadingCfsOnDemandDefault;
1429  }
1430
1431  /**
1432   * Close down this HRegion.  Flush the cache, shut down each HStore, don't
1433   * service any more calls.
1434   *
1435   * <p>This method could take some time to execute, so don't call it from a
1436   * time-sensitive thread.
1437   *
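       * <p>A minimal caller-side sketch (illustrative only; {@code region} is assumed to be an
       * open HRegion):
       * <pre>{@code
       * Map<byte[], List<HStoreFile>> storeFiles = region.close();
       * if (storeFiles != null) {
       *   // closed; storeFiles maps each column family to the files its HStore was using
       * }
       * }</pre>
       *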
1438   * @return Map of column family name to the list of storage files that the HRegion's
1439   * component HStores make use of. Can be null if the region is already closed or if it
1440   * is judged that it should not close.
1441   *
1442   * @throws IOException e
1443   * @throws DroppedSnapshotException Thrown when replay of wal is required
1444   * because a Snapshot was not properly persisted. The region is put in closing mode, and the
1445   * caller MUST abort after this.
1446   */
1447  public Map<byte[], List<HStoreFile>> close() throws IOException {
1448    return close(false);
1449  }
1450
1451  private final Object closeLock = new Object();
1452
1453  /** Conf key for the periodic flush interval */
1454  public static final String MEMSTORE_PERIODIC_FLUSH_INTERVAL =
1455      "hbase.regionserver.optionalcacheflushinterval";
1456  /** Default interval for the memstore flush */
1457  public static final int DEFAULT_CACHE_FLUSH_INTERVAL = 3600000;
1458  /** Default interval for System tables memstore flush */
1459  public static final int SYSTEM_CACHE_FLUSH_INTERVAL = 300000; // 5 minutes
1460
1461  /** Conf key to force a flush if there are already enough changes for one region in memstore */
1462  public static final String MEMSTORE_FLUSH_PER_CHANGES =
1463      "hbase.regionserver.flush.per.changes";
1464  public static final long DEFAULT_FLUSH_PER_CHANGES = 30000000; // 30 million
1465  /**
1466   * The following MAX_FLUSH_PER_CHANGES is large enough because each KeyValue has 20+ bytes
1467   * overhead. Therefore, even 1G empty KVs occupy at least 20GB memstore size for a single region
1468   */
1469  public static final long MAX_FLUSH_PER_CHANGES = 1000000000; // 1G
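
      // Illustrative only: how the flush-tuning settings above might be applied programmatically.
      // The values below are arbitrary examples, not recommendations.
      //
      //   Configuration conf = HBaseConfiguration.create();
      //   // flush a region's memstore at least every 30 minutes, even when it is not full
      //   conf.setInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL, 30 * 60 * 1000);
      //   // force a flush once 10 million changes have accumulated in a region's memstore
      //   conf.setLong(MEMSTORE_FLUSH_PER_CHANGES, 10_000_000L);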
1470
1471  /**
1472   * Close down this HRegion.  Flush the cache unless the abort parameter is true,
1473   * shut down each HStore, and don't service any more calls.
1474   *
1475   * This method could take some time to execute, so don't call it from a
1476   * time-sensitive thread.
1477   *
1478   * @param abort true if server is aborting (only during testing)
1479   * @return Map of column family name to the list of storage files that the HRegion's
1480   * component HStores make use of. Can be null if we are not to close at this
1481   * time or we are already closed.
1482   *
1483   * @throws IOException e
1484   * @throws DroppedSnapshotException Thrown when replay of wal is required
1485   * because a Snapshot was not properly persisted. The region is put in closing mode, and the
1486   * caller MUST abort after this.
1487   */
1488  public Map<byte[], List<HStoreFile>> close(boolean abort) throws IOException {
1489    // Only allow one thread to close at a time. Serialize them so dual
1490    // threads attempting to close will run up against each other.
1491    MonitoredTask status = TaskMonitor.get().createStatus(
1492        "Closing region " + this.getRegionInfo().getEncodedName() +
1493        (abort ? " due to abort" : ""));
1494
1495    status.setStatus("Waiting for close lock");
1496    try {
1497      synchronized (closeLock) {
1498        return doClose(abort, status);
1499      }
1500    } finally {
1501      status.cleanup();
1502    }
1503  }
1504
1505  /**
1506   * Exposed for some very specific unit tests.
1507   */
1508  @VisibleForTesting
1509  public void setClosing(boolean closing) {
1510    this.closing.set(closing);
1511  }
1512
1513  /**
1514   * {@link HRegion#doClose} will block forever on the write lock, which can deadlock a unit test.
1515   * If a timeout is set, {@link HRegion#doClose} will throw an exception instead of blocking.
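       * <p>A minimal test-side sketch (illustrative only; {@code region} is assumed to be an
       * already opened test HRegion):
       * <pre>{@code
       * region.setTimeoutForWriteLock(10); // give up after 10 seconds instead of blocking forever
       * region.close();                    // now fails with an IOException if the lock is stuck
       * }</pre>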
1516   * @param timeoutForWriteLock the time, in seconds, to wait for the write lock in {@link HRegion#doClose}
1517   */
1518  @VisibleForTesting
1519  public void setTimeoutForWriteLock(long timeoutForWriteLock) {
1520    assert timeoutForWriteLock >= 0;
1521    this.timeoutForWriteLock = timeoutForWriteLock;
1522  }
1523
1524  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UL_UNRELEASED_LOCK_EXCEPTION_PATH",
1525      justification="I think FindBugs is confused")
1526  private Map<byte[], List<HStoreFile>> doClose(boolean abort, MonitoredTask status)
1527      throws IOException {
1528    if (isClosed()) {
1529      LOG.warn("Region " + this + " already closed");
1530      return null;
1531    }
1532
1533    if (coprocessorHost != null) {
1534      status.setStatus("Running coprocessor pre-close hooks");
1535      this.coprocessorHost.preClose(abort);
1536    }
1537    status.setStatus("Disabling compacts and flushes for region");
1538    boolean canFlush = true;
1539    synchronized (writestate) {
1540      // Disable compacting and flushing by background threads for this
1541      // region.
1542      canFlush = !writestate.readOnly;
1543      writestate.writesEnabled = false;
1544      LOG.debug("Closing {}, disabling compactions & flushes",
1545          this.getRegionInfo().getEncodedName());
1546      waitForFlushesAndCompactions();
1547    }
1548    // If we were not just flushing, is it worth doing a preflush...one
1549    // that will clear out the bulk of the memstore before we put up
1550    // the close flag?
1551    if (!abort && worthPreFlushing() && canFlush) {
1552      status.setStatus("Pre-flushing region before close");
1553      LOG.info("Running close preflush of {}", this.getRegionInfo().getEncodedName());
1554      try {
1555        internalFlushcache(status);
1556      } catch (IOException ioe) {
1557        // Failed to flush the region. Keep going.
1558        status.setStatus("Failed pre-flush " + this + "; " + ioe.getMessage());
1559      }
1560    }
1561
1562    if (timeoutForWriteLock == null
1563        || timeoutForWriteLock == Long.MAX_VALUE) {
1564      // block waiting for the lock for closing
1565      lock.writeLock().lock(); // FindBugs: Complains UL_UNRELEASED_LOCK_EXCEPTION_PATH but seems fine
1566    } else {
1567      try {
1568        boolean succeed = lock.writeLock().tryLock(timeoutForWriteLock, TimeUnit.SECONDS);
1569        if (!succeed) {
1570          throw new IOException("Failed to get write lock when closing region");
1571        }
1572      } catch (InterruptedException e) {
1573        throw (InterruptedIOException) new InterruptedIOException().initCause(e);
1574      }
1575    }
1576    this.closing.set(true);
1577    status.setStatus("Disabling writes for close");
1578    try {
1579      if (this.isClosed()) {
1580        status.abort("Already got closed by another process");
1581        // SplitTransaction handles the null
1582        return null;
1583      }
1584      LOG.debug("Updates disabled for region " + this);
1585      // Don't flush the cache if we are aborting
1586      if (!abort && canFlush) {
1587        int failedFlushCount = 0;
1588        int flushCount = 0;
1589        long tmp = 0;
1590        long remainingSize = this.memStoreSizing.getDataSize();
1591        while (remainingSize > 0) {
1592          try {
1593            internalFlushcache(status);
1594            if (flushCount > 0) {
1595              LOG.info("Running extra flush, " + flushCount +
1596                  " (carrying snapshot?) " + this);
1597            }
1598            flushCount++;
1599            tmp = this.memStoreSizing.getDataSize();
1600            if (tmp >= remainingSize) {
1601              failedFlushCount++;
1602            }
1603            remainingSize = tmp;
1604            if (failedFlushCount > 5) {
1605              // If we failed 5 times and are unable to clear memory, abort
1606              // so we do not lose data
1607              throw new DroppedSnapshotException("Failed clearing memory after " +
1608                  flushCount + " attempts on region: " +
1609                  Bytes.toStringBinary(getRegionInfo().getRegionName()));
1610            }
1611          } catch (IOException ioe) {
1612            status.setStatus("Failed flush " + this + ", putting online again");
1613            synchronized (writestate) {
1614              writestate.writesEnabled = true;
1615            }
1616            // Have to throw to upper layers.  I can't abort server from here.
1617            throw ioe;
1618          }
1619        }
1620      }
1621
1622      Map<byte[], List<HStoreFile>> result = new TreeMap<>(Bytes.BYTES_COMPARATOR);
1623      if (!stores.isEmpty()) {
1624        // initialize the thread pool for closing stores in parallel.
1625        ThreadPoolExecutor storeCloserThreadPool =
1626          getStoreOpenAndCloseThreadPool("StoreCloserThread-" +
1627            getRegionInfo().getRegionNameAsString());
1628        CompletionService<Pair<byte[], Collection<HStoreFile>>> completionService =
1629          new ExecutorCompletionService<>(storeCloserThreadPool);
1630
1631        // close each store in parallel
1632        for (HStore store : stores.values()) {
1633          MemStoreSize mss = store.getFlushableSize();
1634          if (!(abort || mss.getDataSize() == 0 || writestate.readOnly)) {
1635            if (getRegionServerServices() != null) {
1636              getRegionServerServices().abort("Assertion failed while closing store "
1637                + getRegionInfo().getRegionNameAsString() + " " + store
1638                + ". flushableSize expected=0, actual={" + mss
1639                + "}. Current memStoreSize=" + this.memStoreSizing.getMemStoreSize() +
1640                  ". Maybe a coprocessor "
1641                + "operation failed and left the memstore in a partially updated state.", null);
1642            }
1643          }
1644          completionService
1645              .submit(new Callable<Pair<byte[], Collection<HStoreFile>>>() {
1646                @Override
1647                public Pair<byte[], Collection<HStoreFile>> call() throws IOException {
1648                  return new Pair<>(store.getColumnFamilyDescriptor().getName(), store.close());
1649                }
1650              });
1651        }
1652        try {
1653          for (int i = 0; i < stores.size(); i++) {
1654            Future<Pair<byte[], Collection<HStoreFile>>> future = completionService.take();
1655            Pair<byte[], Collection<HStoreFile>> storeFiles = future.get();
1656            List<HStoreFile> familyFiles = result.get(storeFiles.getFirst());
1657            if (familyFiles == null) {
1658              familyFiles = new ArrayList<>();
1659              result.put(storeFiles.getFirst(), familyFiles);
1660            }
1661            familyFiles.addAll(storeFiles.getSecond());
1662          }
1663        } catch (InterruptedException e) {
1664          throw (InterruptedIOException)new InterruptedIOException().initCause(e);
1665        } catch (ExecutionException e) {
1666          Throwable cause = e.getCause();
1667          if (cause instanceof IOException) {
1668            throw (IOException) cause;
1669          }
1670          throw new IOException(cause);
1671        } finally {
1672          storeCloserThreadPool.shutdownNow();
1673        }
1674      }
1675
1676      status.setStatus("Writing region close event to WAL");
1677      // Always write close marker to wal even for read only table. This is not a big problem as we
1678      // do not write any data into the region.
1679      if (!abort && wal != null && getRegionServerServices() != null &&
1680        RegionReplicaUtil.isDefaultReplica(getRegionInfo())) {
1681        writeRegionCloseMarker(wal);
1682      }
1683
1684      this.closed.set(true);
1685      if (!canFlush) {
1686        decrMemStoreSize(this.memStoreSizing.getMemStoreSize());
1687      } else if (this.memStoreSizing.getDataSize() != 0) {
1688        LOG.error("Memstore data size is {}", this.memStoreSizing.getDataSize());
1689      }
1690      if (coprocessorHost != null) {
1691        status.setStatus("Running coprocessor post-close hooks");
1692        this.coprocessorHost.postClose(abort);
1693      }
1694      if (this.metricsRegion != null) {
1695        this.metricsRegion.close();
1696      }
1697      if (this.metricsRegionWrapper != null) {
1698        Closeables.close(this.metricsRegionWrapper, true);
1699      }
1700      status.markComplete("Closed");
1701      LOG.info("Closed " + this);
1702      return result;
1703    } finally {
1704      lock.writeLock().unlock();
1705    }
1706  }
1707
1708  /** Wait for all current flushes and compactions of the region to complete */
1709  // TODO HBASE-18906. Check the usage (if any) in Phoenix and expose this or give alternate way for
1710  // Phoenix needs.
1711  public void waitForFlushesAndCompactions() {
1712    synchronized (writestate) {
1713      if (this.writestate.readOnly) {
1714        // we should not wait for replayed flushes if we are read only (for example in case the
1715        // region is a secondary replica).
1716        return;
1717      }
1718      boolean interrupted = false;
1719      try {
1720        while (writestate.compacting.get() > 0 || writestate.flushing) {
1721          LOG.debug("waiting for " + writestate.compacting + " compactions"
1722            + (writestate.flushing ? " & cache flush" : "") + " to complete for region " + this);
1723          try {
1724            writestate.wait();
1725          } catch (InterruptedException iex) {
1726            // essentially ignore and propagate the interrupt back up
1727            LOG.warn("Interrupted while waiting");
1728            interrupted = true;
1729            break;
1730          }
1731        }
1732      } finally {
1733        if (interrupted) {
1734          Thread.currentThread().interrupt();
1735        }
1736      }
1737    }
1738  }
1739
1740  /**
1741   * Wait for all current flushes of the region to complete
1742   */
1743  public void waitForFlushes() {
1744    waitForFlushes(0); // Unbounded wait
1745  }
1746
1747  @Override
1748  public boolean waitForFlushes(long timeout) {
1749    synchronized (writestate) {
1750      if (this.writestate.readOnly) {
1751        // we should not wait for replayed flushes if we are read only (for example in case the
1752        // region is a secondary replica).
1753        return true;
1754      }
1755      if (!writestate.flushing) return true;
1756      long start = System.currentTimeMillis();
1757      long duration = 0;
1758      boolean interrupted = false;
1759      LOG.debug("waiting for cache flush to complete for region " + this);
1760      try {
1761        while (writestate.flushing) {
1762          if (timeout > 0 && duration >= timeout) break;
1763          try {
1764            long toWait = timeout == 0 ? 0 : (timeout - duration);
1765            writestate.wait(toWait);
1766          } catch (InterruptedException iex) {
1767            // essentially ignore and propagate the interrupt back up
1768            LOG.warn("Interrupted while waiting");
1769            interrupted = true;
1770            break;
1771          } finally {
1772            duration = System.currentTimeMillis() - start;
1773          }
1774        }
1775      } finally {
1776        if (interrupted) {
1777          Thread.currentThread().interrupt();
1778        }
1779      }
1780      LOG.debug("Waited " + duration + " ms for flush to complete");
1781      return !(writestate.flushing);
1782    }
1783  }
1784
1785  protected ThreadPoolExecutor getStoreOpenAndCloseThreadPool(
1786      final String threadNamePrefix) {
1787    int numStores = Math.max(1, this.htableDescriptor.getColumnFamilyCount());
1788    int maxThreads = Math.min(numStores,
1789        conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1790            HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX));
1791    return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1792  }
1793
1794  protected ThreadPoolExecutor getStoreFileOpenAndCloseThreadPool(
1795      final String threadNamePrefix) {
1796    int numStores = Math.max(1, this.htableDescriptor.getColumnFamilyCount());
1797    int maxThreads = Math.max(1,
1798        conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1799            HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX)
1800            / numStores);
1801    return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1802  }
1803
1804  static ThreadPoolExecutor getOpenAndCloseThreadPool(int maxThreads,
1805      final String threadNamePrefix) {
1806    return Threads.getBoundedCachedThreadPool(maxThreads, 30L, TimeUnit.SECONDS,
1807      new ThreadFactory() {
1808        private int count = 1;
1809
1810        @Override
1811        public Thread newThread(Runnable r) {
1812          return new Thread(r, threadNamePrefix + "-" + count++);
1813        }
1814      });
1815  }
1816
1817  /**
1818   * @return True if it's worth doing a flush before we put up the close flag.
1819   */
1820  private boolean worthPreFlushing() {
1821    return this.memStoreSizing.getDataSize() >
1822      this.conf.getLong("hbase.hregion.preclose.flush.size", 1024 * 1024 * 5);
1823  }
1824
1825  //////////////////////////////////////////////////////////////////////////////
1826  // HRegion accessors
1827  //////////////////////////////////////////////////////////////////////////////
1828
1829  @Override
1830  public TableDescriptor getTableDescriptor() {
1831    return this.htableDescriptor;
1832  }
1833
1834  @VisibleForTesting
1835  void setTableDescriptor(TableDescriptor desc) {
1836    htableDescriptor = desc;
1837  }
1838
1839  /** @return WAL in use for this region */
1840  public WAL getWAL() {
1841    return this.wal;
1842  }
1843
1844  /**
1845   * @return split policy for this region.
1846   */
1847  public RegionSplitPolicy getSplitPolicy() {
1848    return this.splitPolicy;
1849  }
1850
1851  /**
1852   * A split takes the config from the parent region & passes it to the daughter
1853   * region's constructor. If 'conf' was passed, you would end up using the HTD
1854   * of the parent region in addition to the new daughter HTD. Pass 'baseConf'
1855   * to the daughter regions to avoid this tricky dedupe problem.
1856   * @return Configuration object
1857   */
1858  Configuration getBaseConf() {
1859    return this.baseConf;
1860  }
1861
1862  /** @return {@link FileSystem} being used by this region */
1863  public FileSystem getFilesystem() {
1864    return fs.getFileSystem();
1865  }
1866
1867  /** @return the {@link HRegionFileSystem} used by this region */
1868  public HRegionFileSystem getRegionFileSystem() {
1869    return this.fs;
1870  }
1871
1872  /** @return the WAL {@link HRegionFileSystem} used by this region */
1873  HRegionFileSystem getRegionWALFileSystem() throws IOException {
1874    return new HRegionFileSystem(conf, getWalFileSystem(),
1875        FSUtils.getWALTableDir(conf, htableDescriptor.getTableName()), fs.getRegionInfo());
1876  }
1877
1878  /** @return the WAL {@link FileSystem} being used by this region */
1879  FileSystem getWalFileSystem() throws IOException {
1880    if (walFS == null) {
1881      walFS = FSUtils.getWALFileSystem(conf);
1882    }
1883    return walFS;
1884  }
1885
1886  /**
1887   * @return the Region directory under WALRootDirectory
1888   * @throws IOException if there is an error getting WALRootDir
1889   */
1890  @VisibleForTesting
1891  public Path getWALRegionDir() throws IOException {
1892    if (regionDir == null) {
1893      regionDir = FSUtils.getWALRegionDir(conf, getRegionInfo().getTable(),
1894          getRegionInfo().getEncodedName());
1895    }
1896    return regionDir;
1897  }
1898
1899  @Override
1900  public long getEarliestFlushTimeForAllStores() {
1901    return Collections.min(lastStoreFlushTimeMap.values());
1902  }
1903
1904  @Override
1905  public long getOldestHfileTs(boolean majorCompactionOnly) throws IOException {
1906    long result = Long.MAX_VALUE;
1907    for (HStore store : stores.values()) {
1908      Collection<HStoreFile> storeFiles = store.getStorefiles();
1909      if (storeFiles == null) {
1910        continue;
1911      }
1912      for (HStoreFile file : storeFiles) {
1913        StoreFileReader sfReader = file.getReader();
1914        if (sfReader == null) {
1915          continue;
1916        }
1917        HFile.Reader reader = sfReader.getHFileReader();
1918        if (reader == null) {
1919          continue;
1920        }
1921        if (majorCompactionOnly) {
1922          byte[] val = reader.loadFileInfo().get(MAJOR_COMPACTION_KEY);
1923          if (val == null || !Bytes.toBoolean(val)) {
1924            continue;
1925          }
1926        }
1927        result = Math.min(result, reader.getFileContext().getFileCreateTime());
1928      }
1929    }
1930    return result == Long.MAX_VALUE ? 0 : result;
1931  }
1932
1933  RegionLoad.Builder setCompleteSequenceId(RegionLoad.Builder regionLoadBldr) {
1934    long lastFlushOpSeqIdLocal = this.lastFlushOpSeqId;
1935    byte[] encodedRegionName = this.getRegionInfo().getEncodedNameAsBytes();
1936    regionLoadBldr.clearStoreCompleteSequenceId();
1937    for (byte[] familyName : this.stores.keySet()) {
1938      long earliest = this.wal.getEarliestMemStoreSeqNum(encodedRegionName, familyName);
1939      // Subtract 1 to go earlier than the current oldest, unflushed edit in memstore; this will
1940      // give us a sequence id that is for sure flushed. We want edit replay to start after this
1941      // sequence id in this region. If NO_SEQNUM, use the region's maximum flush id.
1942      long csid = (earliest == HConstants.NO_SEQNUM)? lastFlushOpSeqIdLocal: earliest - 1;
1943      regionLoadBldr.addStoreCompleteSequenceId(StoreSequenceId.newBuilder()
1944          .setFamilyName(UnsafeByteOperations.unsafeWrap(familyName)).setSequenceId(csid).build());
1945    }
1946    return regionLoadBldr.setCompleteSequenceId(getMaxFlushedSeqId());
1947  }
1948
1949  //////////////////////////////////////////////////////////////////////////////
1950  // HRegion maintenance.
1951  //
1952  // These methods are meant to be called periodically by the HRegionServer for
1953  // upkeep.
1954  //////////////////////////////////////////////////////////////////////////////
1955  /**
1956   * Do preparation for pending compaction.
1957   * @throws IOException
1958   */
1959  protected void doRegionCompactionPrep() throws IOException {
1960  }
1961
1962  /**
1963   * Synchronously compact all stores in the region.
1964   * <p>This operation could block for a long time, so don't call it from a
1965   * time-sensitive thread.
1966   * <p>Note that no locks are taken to prevent possible conflicts between
1967   * compaction and splitting activities. The regionserver does not normally compact
1968   * and split in parallel. However by calling this method you may introduce
1969   * unexpected and unhandled concurrency. Don't do this unless you know what
1970   * you are doing.
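       * <p>A minimal caller-side sketch (illustrative only; {@code region} is assumed to be an
       * open HRegion driven by a utility or test):
       * <pre>{@code
       * region.flush(true);   // persist the memstores first
       * region.compact(true); // then force a major compaction of every store
       * }</pre>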
1971   *
1972   * @param majorCompaction True to force a major compaction regardless of thresholds
1973   * @throws IOException
1974   */
1975  public void compact(boolean majorCompaction) throws IOException {
1976    if (majorCompaction) {
1977      stores.values().forEach(HStore::triggerMajorCompaction);
1978    }
1979    for (HStore s : stores.values()) {
1980      Optional<CompactionContext> compaction = s.requestCompaction();
1981      if (compaction.isPresent()) {
1982        ThroughputController controller = null;
1983        if (rsServices != null) {
1984          controller = CompactionThroughputControllerFactory.create(rsServices, conf);
1985        }
1986        if (controller == null) {
1987          controller = NoLimitThroughputController.INSTANCE;
1988        }
1989        compact(compaction.get(), s, controller, null);
1990      }
1991    }
1992  }
1993
1994  /**
1995   * This is a helper function that compacts all the stores synchronously.
1996   * <p>
1997   * It is used by utilities and testing
1998   */
1999  @VisibleForTesting
2000  public void compactStores() throws IOException {
2001    for (HStore s : stores.values()) {
2002      Optional<CompactionContext> compaction = s.requestCompaction();
2003      if (compaction.isPresent()) {
2004        compact(compaction.get(), s, NoLimitThroughputController.INSTANCE, null);
2005      }
2006    }
2007  }
2008
2009  /**
2010   * This is a helper function that compacts the given store.
2011   * <p>
2012   * It is used by utilities and testing
2013   */
2014  @VisibleForTesting
2015  void compactStore(byte[] family, ThroughputController throughputController) throws IOException {
2016    HStore s = getStore(family);
2017    Optional<CompactionContext> compaction = s.requestCompaction();
2018    if (compaction.isPresent()) {
2019      compact(compaction.get(), s, throughputController, null);
2020    }
2021  }
2022
2023  /**
2024   * Called by compaction thread and after region is opened to compact the
2025   * HStores if necessary.
2026   *
2027   * <p>This operation could block for a long time, so don't call it from a
2028   * time-sensitive thread.
2029   *
2030   * Note that no locking is necessary at this level because compaction only
2031   * conflicts with a region split, and that cannot happen because the region
2032   * server does them sequentially and not in parallel.
2033   *
2034   * @param compaction Compaction details, obtained by requestCompaction()
2035   * @param throughputController
2036   * @return whether the compaction completed
2037   */
2038  public boolean compact(CompactionContext compaction, HStore store,
2039      ThroughputController throughputController) throws IOException {
2040    return compact(compaction, store, throughputController, null);
2041  }
2042
2043  public boolean compact(CompactionContext compaction, HStore store,
2044      ThroughputController throughputController, User user) throws IOException {
2045    assert compaction != null && compaction.hasSelection();
2046    assert !compaction.getRequest().getFiles().isEmpty();
2047    if (this.closing.get() || this.closed.get()) {
2048      LOG.debug("Skipping compaction on " + this + " because closing/closed");
2049      store.cancelRequestedCompaction(compaction);
2050      return false;
2051    }
2052    MonitoredTask status = null;
2053    boolean requestNeedsCancellation = true;
2054    /*
2055     * We are trying to remove / relax the region read lock for compaction.
2056     * Let's see what are the potential race conditions among the operations (user scan,
2057     * region split, region close and region bulk load).
2058     *
2059     *  user scan ---> region read lock
2060     *  region split --> region close first --> region write lock
2061     *  region close --> region write lock
2062     *  region bulk load --> region write lock
2063     *
2064     * read lock is compatible with read lock. ---> no problem with user scan/read
2065     * region bulk load does not cause a problem for compaction (no consistency problem, store lock
2066     *  will help the store file accounting).
2067     * They can run almost concurrently at the region level.
2068     *
2069     * The only remaining race condition is between the region close and compaction.
2070     * So we will evaluate, below, how region close interferes with compaction if compaction does
2071     * not acquire region read lock.
2072     *
2073     * Here are the steps for compaction:
2074     * 1. obtain list of StoreFile's
2075     * 2. create StoreFileScanner's based on list from #1
2076     * 3. perform compaction and save resulting files under tmp dir
2077     * 4. swap in compacted files
2078     *
2079     * #1 is guarded by store lock. This patch does not change this --> no worse or better
2080     * For #2, we obtain smallest read point (for region) across all the Scanners (for both default
2081     * compactor and stripe compactor).
2082     * The read points are for user scans. Region keeps the read points for all currently open
2083     * user scanners.
2084     * Compaction needs to know the smallest read point so that during re-write of the hfiles,
2085     * it can remove the mvcc points for the cells if their mvccs are older than the smallest
2086     * since they are not needed anymore.
2087     * This will not conflict with compaction.
2088     * For #3, it can be performed in parallel to other operations.
2089     * For #4 bulk load and compaction don't conflict with each other on the region level
2090     *   (for multi-family atomicity).
2091     * Region close and compaction are guarded pretty well by the 'writestate'.
2092     * In HRegion#doClose(), we have :
2093     * synchronized (writestate) {
2094     *   // Disable compacting and flushing by background threads for this
2095     *   // region.
2096     *   canFlush = !writestate.readOnly;
2097     *   writestate.writesEnabled = false;
2098     *   LOG.debug("Closing " + this + ": disabling compactions & flushes");
2099     *   waitForFlushesAndCompactions();
2100     * }
2101     * waitForFlushesAndCompactions() would wait for writestate.compacting to come down to 0,
2102     * and in HRegion.compact():
2103     *  try {
2104     *    synchronized (writestate) {
2105     *    if (writestate.writesEnabled) {
2106     *      wasStateSet = true;
2107     *      ++writestate.compacting;
2108     *    } else {
2109     *      String msg = "NOT compacting region " + this + ". Writes disabled.";
2110     *      LOG.info(msg);
2111     *      status.abort(msg);
2112     *      return false;
2113     *    }
2114     *  }
2115     * Also in compactor.performCompaction():
2116     * check periodically to see if a system stop is requested
2117     * if (closeCheckInterval > 0) {
2118     *   bytesWritten += len;
2119     *   if (bytesWritten > closeCheckInterval) {
2120     *     bytesWritten = 0;
2121     *     if (!store.areWritesEnabled()) {
2122     *       progress.cancel();
2123     *       return false;
2124     *     }
2125     *   }
2126     * }
2127     */
2128    try {
2129      byte[] cf = Bytes.toBytes(store.getColumnFamilyName());
2130      if (stores.get(cf) != store) {
2131        LOG.warn("Store " + store.getColumnFamilyName() + " on region " + this
2132            + " has been re-instantiated, cancel this compaction request. "
2133            + "It may be caused by the rollback of a split transaction");
2134        return false;
2135      }
2136
2137      status = TaskMonitor.get().createStatus("Compacting " + store + " in " + this);
2138      status.enableStatusJournal(false);
2139      if (this.closed.get()) {
2140        String msg = "Skipping compaction on " + this + " because closed";
2141        LOG.debug(msg);
2142        status.abort(msg);
2143        return false;
2144      }
2145      boolean wasStateSet = false;
2146      try {
2147        synchronized (writestate) {
2148          if (writestate.writesEnabled) {
2149            wasStateSet = true;
2150            writestate.compacting.incrementAndGet();
2151          } else {
2152            String msg = "NOT compacting region " + this + ". Writes disabled.";
2153            LOG.info(msg);
2154            status.abort(msg);
2155            return false;
2156          }
2157        }
2158        LOG.info("Starting compaction of {} in {}{}", store, this,
2159            (compaction.getRequest().isOffPeak()?" as an off-peak compaction":""));
2160        doRegionCompactionPrep();
2161        try {
2162          status.setStatus("Compacting store " + store);
2163          // We no longer need to cancel the request on the way out of this
2164          // method because Store#compact will clean up unconditionally
2165          requestNeedsCancellation = false;
2166          store.compact(compaction, throughputController, user);
2167        } catch (InterruptedIOException iioe) {
2168          String msg = "compaction interrupted";
2169          LOG.info(msg, iioe);
2170          status.abort(msg);
2171          return false;
2172        }
2173      } finally {
2174        if (wasStateSet) {
2175          synchronized (writestate) {
2176            writestate.compacting.decrementAndGet();
2177            if (writestate.compacting.get() <= 0) {
2178              writestate.notifyAll();
2179            }
2180          }
2181        }
2182      }
2183      status.markComplete("Compaction complete");
2184      return true;
2185    } finally {
2186      if (requestNeedsCancellation) store.cancelRequestedCompaction(compaction);
2187      if (status != null) {
2188        LOG.debug("Compaction status journal:\n\t" + status.prettyPrintJournal());
2189        status.cleanup();
2190      }
2191    }
2192  }
2193
2194  /**
2195   * Flush the cache.
2196   *
2197   * <p>When this method is called the cache will be flushed unless:
2198   * <ol>
2199   *   <li>the cache is empty</li>
2200   *   <li>the region is closed.</li>
2201   *   <li>a flush is already in progress</li>
2202   *   <li>writes are disabled</li>
2203   * </ol>
2204   *
2205   * <p>This method may block for some time, so it should not be called from a
2206   * time-sensitive thread.
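       * <p>A minimal caller-side sketch (illustrative only; {@code region} is assumed to be an
       * open HRegion and error handling is elided):
       * <pre>{@code
       * FlushResult result = region.flush(true); // force a flush of all stores
       * if (result.isFlushSucceeded() && result.isCompactionNeeded()) {
       *   region.compactStores(); // follow up with a synchronous compaction
       * }
       * }</pre>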
2207   * @param force whether we want to force a flush of all stores
2208   * @return FlushResult indicating whether the flush was successful or not and if
2209   * the region needs compacting
2210   *
2211   * @throws IOException general io exceptions
2212   * @throws DroppedSnapshotException Thrown when replay of wal is required because a Snapshot was not properly persisted.
2213   */
2214  // TODO HBASE-18905. We might have to expose a requestFlush API for CPs
2215  public FlushResult flush(boolean force) throws IOException {
2216    return flushcache(force, false, FlushLifeCycleTracker.DUMMY);
2217  }
2218
2219  public interface FlushResult {
2220    enum Result {
2221      FLUSHED_NO_COMPACTION_NEEDED,
2222      FLUSHED_COMPACTION_NEEDED,
2223      // Special case where a flush didn't run because there's nothing in the memstores. Used when
2224      // bulk loading to know when we can still load even if a flush didn't happen.
2225      CANNOT_FLUSH_MEMSTORE_EMPTY,
2226      CANNOT_FLUSH
2227    }
2228
2229    /** @return the detailed result code */
2230    Result getResult();
2231
2232    /** @return true if the memstores were flushed, else false */
2233    boolean isFlushSucceeded();
2234
2235    /** @return True if the flush requested a compaction, else false */
2236    boolean isCompactionNeeded();
2237  }
2238
2239  /**
2240   * Flush the cache.
2241   *
2242   * When this method is called the cache will be flushed unless:
2243   * <ol>
2244   *   <li>the cache is empty</li>
2245   *   <li>the region is closed.</li>
2246   *   <li>a flush is already in progress</li>
2247   *   <li>writes are disabled</li>
2248   * </ol>
2249   *
2250   * <p>This method may block for some time, so it should not be called from a
2251   * time-sensitive thread.
2252   * @param forceFlushAllStores whether we want to flush all stores
2253   * @param writeFlushRequestWalMarker whether to write the flush request marker to WAL
2254   * @param tracker used to track the life cycle of this flush
2255   * @return whether the flush succeeded and whether the region needs compacting
2256   *
2257   * @throws IOException general io exceptions
2258   * @throws DroppedSnapshotException Thrown when replay of wal is required
2259   * because a Snapshot was not properly persisted. The region is put in closing mode, and the
2260   * caller MUST abort after this.
2261   */
2262  public FlushResultImpl flushcache(boolean forceFlushAllStores, boolean writeFlushRequestWalMarker,
2263      FlushLifeCycleTracker tracker) throws IOException {
2264    // fail-fast instead of waiting on the lock
2265    if (this.closing.get()) {
2266      String msg = "Skipping flush on " + this + " because closing";
2267      LOG.debug(msg);
2268      return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
2269    }
2270    MonitoredTask status = TaskMonitor.get().createStatus("Flushing " + this);
2271    status.enableStatusJournal(false);
2272    status.setStatus("Acquiring readlock on region");
2273    // block waiting for the lock for flushing cache
2274    lock.readLock().lock();
2275    try {
2276      if (this.closed.get()) {
2277        String msg = "Skipping flush on " + this + " because closed";
2278        LOG.debug(msg);
2279        status.abort(msg);
2280        return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
2281      }
2282      if (coprocessorHost != null) {
2283        status.setStatus("Running coprocessor pre-flush hooks");
2284        coprocessorHost.preFlush(tracker);
2285      }
2286      // TODO: this should be managed within memstore with the snapshot, updated only after flush
2287      // successful
2288      if (numMutationsWithoutWAL.sum() > 0) {
2289        numMutationsWithoutWAL.reset();
2290        dataInMemoryWithoutWAL.reset();
2291      }
2292      synchronized (writestate) {
2293        if (!writestate.flushing && writestate.writesEnabled) {
2294          this.writestate.flushing = true;
2295        } else {
2296          if (LOG.isDebugEnabled()) {
2297            LOG.debug("NOT flushing memstore for region " + this
2298                + ", flushing=" + writestate.flushing + ", writesEnabled="
2299                + writestate.writesEnabled);
2300          }
2301          String msg = "Not flushing since "
2302              + (writestate.flushing ? "already flushing"
2303              : "writes not enabled");
2304          status.abort(msg);
2305          return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
2306        }
2307      }
2308
2309      try {
2310        Collection<HStore> specificStoresToFlush =
2311            forceFlushAllStores ? stores.values() : flushPolicy.selectStoresToFlush();
2312        FlushResultImpl fs =
2313            internalFlushcache(specificStoresToFlush, status, writeFlushRequestWalMarker, tracker);
2314
2315        if (coprocessorHost != null) {
2316          status.setStatus("Running post-flush coprocessor hooks");
2317          coprocessorHost.postFlush(tracker);
2318        }
2319
2320        if(fs.isFlushSucceeded()) {
2321          flushesQueued.reset();
2322        }
2323
2324        status.markComplete("Flush successful");
2325        return fs;
2326      } finally {
2327        synchronized (writestate) {
2328          writestate.flushing = false;
2329          this.writestate.flushRequested = false;
2330          writestate.notifyAll();
2331        }
2332      }
2333    } finally {
2334      lock.readLock().unlock();
2335      LOG.debug("Flush status journal:\n\t" + status.prettyPrintJournal());
2336      status.cleanup();
2337    }
2338  }
2339
2340  /**
2341   * Should the store be flushed because it is old enough.
2342   * <p>
2343   * Every FlushPolicy should call this to determine whether a store is old enough to flush (except
2344   * that you always flush all stores). Otherwise the method will always
2345   * return true, which will generate a lot of flush requests.
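       * <p>A hypothetical policy sketch (illustrative only; it assumes a {@code FlushPolicy}
       * subclass in this package that holds the owning region in a {@code region} field):
       * <pre>{@code
       * public class OldEditsFlushPolicy extends FlushPolicy {
       *   @Override
       *   public Collection<HStore> selectStoresToFlush() {
       *     List<HStore> selected = new ArrayList<>();
       *     for (HStore store : region.getStores()) {
       *       if (region.shouldFlushStore(store)) {
       *         selected.add(store);
       *       }
       *     }
       *     return selected;
       *   }
       * }
       * }</pre>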
2346   */
2347  boolean shouldFlushStore(HStore store) {
2348    long earliest = this.wal.getEarliestMemStoreSeqNum(getRegionInfo().getEncodedNameAsBytes(),
2349      store.getColumnFamilyDescriptor().getName()) - 1;
2350    if (earliest > 0 && earliest + flushPerChanges < mvcc.getReadPoint()) {
2351      if (LOG.isDebugEnabled()) {
2352        LOG.debug("Flush column family " + store.getColumnFamilyName() + " of " +
2353          getRegionInfo().getEncodedName() + " because unflushed sequenceid=" + earliest +
2354          " is > " + this.flushPerChanges + " from current=" + mvcc.getReadPoint());
2355      }
2356      return true;
2357    }
2358    if (this.flushCheckInterval <= 0) {
2359      return false;
2360    }
2361    long now = EnvironmentEdgeManager.currentTime();
2362    if (store.timeOfOldestEdit() < now - this.flushCheckInterval) {
2363      if (LOG.isDebugEnabled()) {
2364        LOG.debug("Flush column family: " + store.getColumnFamilyName() + " of " +
2365          getRegionInfo().getEncodedName() + " because time of oldest edit=" +
2366            store.timeOfOldestEdit() + " is > " + this.flushCheckInterval + " from now =" + now);
2367      }
2368      return true;
2369    }
2370    return false;
2371  }
2372
2373  /**
2374   * Should the memstore be flushed now
2375   */
2376  boolean shouldFlush(final StringBuilder whyFlush) {
2377    whyFlush.setLength(0);
2378    // This is a rough measure.
2379    if (this.maxFlushedSeqId > 0
2380          && (this.maxFlushedSeqId + this.flushPerChanges < this.mvcc.getReadPoint())) {
2381      whyFlush.append("more than max edits, " + this.flushPerChanges + ", since last flush");
2382      return true;
2383    }
2384    long modifiedFlushCheckInterval = flushCheckInterval;
2385    if (getRegionInfo().getTable().isSystemTable() &&
2386        getRegionInfo().getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID) {
2387      modifiedFlushCheckInterval = SYSTEM_CACHE_FLUSH_INTERVAL;
2388    }
2389    if (modifiedFlushCheckInterval <= 0) { //disabled
2390      return false;
2391    }
2392    long now = EnvironmentEdgeManager.currentTime();
2393    //if we flushed in the recent past, we don't need to do again now
2394    if ((now - getEarliestFlushTimeForAllStores() < modifiedFlushCheckInterval)) {
2395      return false;
2396    }
2397    //since we didn't flush in the recent past, flush now if certain conditions
2398    //are met. Return true on first such memstore hit.
2399    for (HStore s : stores.values()) {
2400      if (s.timeOfOldestEdit() < now - modifiedFlushCheckInterval) {
2401        // we have an old enough edit in the memstore, flush
2402        whyFlush.append(s.toString() + " has an old edit so flush to free WALs");
2403        return true;
2404      }
2405    }
2406    return false;
2407  }
2408
2409  /**
2410   * Flushing all stores.
2411   * @see #internalFlushcache(Collection, MonitoredTask, boolean, FlushLifeCycleTracker)
2412   */
2413  private FlushResult internalFlushcache(MonitoredTask status) throws IOException {
2414    return internalFlushcache(stores.values(), status, false, FlushLifeCycleTracker.DUMMY);
2415  }
2416
2417  /**
2418   * Flushing given stores.
2419   * @see #internalFlushcache(WAL, long, Collection, MonitoredTask, boolean, FlushLifeCycleTracker)
2420   */
2421  private FlushResultImpl internalFlushcache(Collection<HStore> storesToFlush, MonitoredTask status,
2422      boolean writeFlushWalMarker, FlushLifeCycleTracker tracker) throws IOException {
2423    return internalFlushcache(this.wal, HConstants.NO_SEQNUM, storesToFlush, status,
2424      writeFlushWalMarker, tracker);
2425  }
2426
2427  /**
2428   * Flush the memstore. Flushing the memstore is a little tricky. We have a lot of updates in the
2429   * memstore, all of which have also been written to the wal. We need to write those updates in the
2430   * memstore out to disk, while being able to process reads/writes as much as possible during the
2431   * flush operation.
2432   * <p>
2433   * This method may block for some time. Every time you call it, we up the region's sequence id
2434   * even if we don't flush; i.e. the returned sequence id will be at least one larger than the
2435   * last edit applied to this region. The returned id does not refer to an actual edit. The
2436   * returned id can be used for, say, installing a bulk loaded file just ahead of the last hfile
2437   * that was the result of this flush, etc.
2438   * @param wal Null if we're NOT to go via wal.
2439   * @param myseqid The seqid to use if <code>wal</code> is null writing out flush file.
2440   * @param storesToFlush The list of stores to flush.
2441   * @return object describing the flush's state
2442   * @throws IOException general io exceptions
2443   * @throws DroppedSnapshotException Thrown when replay of WAL is required.
2444   */
2445  protected FlushResultImpl internalFlushcache(WAL wal, long myseqid,
2446      Collection<HStore> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker,
2447      FlushLifeCycleTracker tracker) throws IOException {
2448    PrepareFlushResult result =
2449        internalPrepareFlushCache(wal, myseqid, storesToFlush, status, writeFlushWalMarker, tracker);
2450    if (result.result == null) {
2451      return internalFlushCacheAndCommit(wal, status, result, storesToFlush);
2452    } else {
2453      return result.result; // early exit due to failure from prepare stage
2454    }
2455  }
2456
2457  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="DLS_DEAD_LOCAL_STORE",
2458      justification="FindBugs seems confused about trxId")
2459  protected PrepareFlushResult internalPrepareFlushCache(WAL wal, long myseqid,
2460      Collection<HStore> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker,
2461      FlushLifeCycleTracker tracker) throws IOException {
2462    if (this.rsServices != null && this.rsServices.isAborted()) {
2463      // Don't flush when server aborting, it's unsafe
2464      throw new IOException("Aborting flush because server is aborted...");
2465    }
2466    final long startTime = EnvironmentEdgeManager.currentTime();
2467    // If nothing to flush, return, but return with a valid unused sequenceId.
2468    // It's needed by bulk upload IIRC. It flushes until there are no edits in memory so it can
2469    // insert a bulk loaded file between memory and existing hfiles. It wants a good sequenceId
2470    // that belongs to no other edit, one it can use to associate with the bulk load. Hence this
2471    // little dance below to go get one.
2472    if (this.memStoreSizing.getDataSize() <= 0) {
2473      // Take an update lock so no edits can come into memory just yet.
2474      this.updatesLock.writeLock().lock();
2475      WriteEntry writeEntry = null;
2476      try {
2477        if (this.memStoreSizing.getDataSize() <= 0) {
2478          // Presume that if there are still no edits in the memstore, then there are no edits for
2479          // this region out in the WAL subsystem so no need to do any trickery clearing out
2480          // edits in the WAL sub-system. Up the sequence number so the resulting flush id is for
2481          // sure just beyond the last appended region edit and not associated with any edit
2482          // (useful as marker when bulk loading, etc.).
2483          if (wal != null) {
2484            writeEntry = mvcc.begin();
2485            long flushOpSeqId = writeEntry.getWriteNumber();
2486            FlushResultImpl flushResult =
2487                new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, flushOpSeqId,
2488                    "Nothing to flush", writeFlushRequestMarkerToWAL(wal, writeFlushWalMarker));
2489            mvcc.completeAndWait(writeEntry);
2490            // Set to null so we don't complete it again down in finally block.
2491            writeEntry = null;
2492            return new PrepareFlushResult(flushResult, myseqid);
2493          } else {
2494            return new PrepareFlushResult(new FlushResultImpl(
2495              FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, "Nothing to flush", false), myseqid);
2496          }
2497        }
2498      } finally {
2499        if (writeEntry != null) {
2500          // If writeEntry is non-null, this operation failed; the mvcc transaction failed...
2501          // but complete it anyway so it doesn't block the mvcc queue.
2502          mvcc.complete(writeEntry);
2503        }
2504        this.updatesLock.writeLock().unlock();
2505      }
2506    }
2507    logFatLineOnFlush(storesToFlush, myseqid);
2508    // Stop updates while we snapshot the memstore of all of this region's stores. We only have
2509    // to do this for a moment.  It is quick. We also set the memstore size to zero here before we
2510    // allow updates again so its value will represent the size of the updates received
2511    // during flush
2512
2513    // We have to take an update lock during snapshot, or else a write could end up in both snapshot
2514    // and memstore (makes it difficult to do atomic rows then)
2515    status.setStatus("Obtaining lock to block concurrent updates");
2516    // block waiting for the lock for internal flush
2517    this.updatesLock.writeLock().lock();
2518    status.setStatus("Preparing flush snapshotting stores in " + getRegionInfo().getEncodedName());
2519    MemStoreSizing totalSizeOfFlushableStores = new NonThreadSafeMemStoreSizing();
2520
2521    Map<byte[], Long> flushedFamilyNamesToSeq = new HashMap<>();
2522    for (HStore store : storesToFlush) {
2523      flushedFamilyNamesToSeq.put(store.getColumnFamilyDescriptor().getName(),
2524        store.preFlushSeqIDEstimation());
2525    }
2526
2527    TreeMap<byte[], StoreFlushContext> storeFlushCtxs = new TreeMap<>(Bytes.BYTES_COMPARATOR);
2528    TreeMap<byte[], List<Path>> committedFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR);
2529    TreeMap<byte[], MemStoreSize> storeFlushableSize = new TreeMap<>(Bytes.BYTES_COMPARATOR);
2530    // The sequence id of this flush operation which is used to log FlushMarker and pass to
2531    // createFlushContext to use as the store file's sequence id. It can be in advance of edits
2532    // still in the memstore, edits that are in other column families yet to be flushed.
2533    long flushOpSeqId = HConstants.NO_SEQNUM;
2534    // The max flushed sequence id after this flush operation completes. All edits in memstore
2535    // will be in advance of this sequence id.
2536    long flushedSeqId = HConstants.NO_SEQNUM;
2537    byte[] encodedRegionName = getRegionInfo().getEncodedNameAsBytes();
2538    try {
2539      if (wal != null) {
2540        Long earliestUnflushedSequenceIdForTheRegion =
2541            wal.startCacheFlush(encodedRegionName, flushedFamilyNamesToSeq);
2542        if (earliestUnflushedSequenceIdForTheRegion == null) {
2543          // This should never happen. This is how startCacheFlush signals flush cannot proceed.
2544          String msg = this.getRegionInfo().getEncodedName() + " flush aborted; WAL closing.";
2545          status.setStatus(msg);
2546          return new PrepareFlushResult(
2547              new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false),
2548              myseqid);
2549        }
2550        flushOpSeqId = getNextSequenceId(wal);
2551        // Back up 1, minus 1 from oldest sequence id in memstore to get last 'flushed' edit
2552        flushedSeqId =
2553            earliestUnflushedSequenceIdForTheRegion.longValue() == HConstants.NO_SEQNUM?
2554                flushOpSeqId: earliestUnflushedSequenceIdForTheRegion.longValue() - 1;
2555      } else {
2556        // use the provided sequence Id as WAL is not being used for this flush.
2557        flushedSeqId = flushOpSeqId = myseqid;
2558      }
2559
2560      for (HStore s : storesToFlush) {
2561        storeFlushCtxs.put(s.getColumnFamilyDescriptor().getName(),
2562          s.createFlushContext(flushOpSeqId, tracker));
2563        // for writing stores to WAL
2564        committedFiles.put(s.getColumnFamilyDescriptor().getName(), null);
2565      }
2566
2567      // write the snapshot start to WAL
2568      if (wal != null && !writestate.readOnly) {
2569        FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.START_FLUSH,
2570            getRegionInfo(), flushOpSeqId, committedFiles);
2571        // No sync. Sync is below, where we do not hold the updates lock and do FlushAction.COMMIT_FLUSH
2572        WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, false,
2573            mvcc);
2574      }
2575
2576      // Prepare flush (take a snapshot)
2577      storeFlushCtxs.forEach((name, flush) -> {
2578        MemStoreSize snapshotSize = flush.prepare();
2579        totalSizeOfFlushableStores.incMemStoreSize(snapshotSize);
2580        storeFlushableSize.put(name, snapshotSize);
2581      });
2582    } catch (IOException ex) {
2583      doAbortFlushToWAL(wal, flushOpSeqId, committedFiles);
2584      throw ex;
2585    } finally {
2586      this.updatesLock.writeLock().unlock();
2587    }
2588    String s = "Finished memstore snapshotting " + this + ", syncing WAL and waiting on mvcc, " +
2589        "flushsize=" + totalSizeOfFlushableStores;
2590    status.setStatus(s);
2591    doSyncOfUnflushedWALChanges(wal, getRegionInfo());
2592    return new PrepareFlushResult(storeFlushCtxs, committedFiles, storeFlushableSize, startTime,
2593        flushOpSeqId, flushedSeqId, totalSizeOfFlushableStores);
2594  }
2595
2596  /**
2597   * Utility method broken out of internalPrepareFlushCache so that method is smaller.
2598   */
2599  private void logFatLineOnFlush(Collection<HStore> storesToFlush, long sequenceId) {
2600    if (!LOG.isInfoEnabled()) {
2601      return;
2602    }
2603    // Log a fat line detailing what is being flushed.
2604    StringBuilder perCfExtras = null;
2605    if (!isAllFamilies(storesToFlush)) {
2606      perCfExtras = new StringBuilder();
2607      for (HStore store: storesToFlush) {
2608        MemStoreSize mss = store.getFlushableSize();
2609        perCfExtras.append("; ").append(store.getColumnFamilyName());
2610        perCfExtras.append("={dataSize=")
2611            .append(StringUtils.byteDesc(mss.getDataSize()));
2612        perCfExtras.append(", heapSize=")
2613            .append(StringUtils.byteDesc(mss.getHeapSize()));
2614        perCfExtras.append(", offHeapSize=")
2615            .append(StringUtils.byteDesc(mss.getOffHeapSize()));
2616        perCfExtras.append("}");
2617      }
2618    }
2619    MemStoreSize mss = this.memStoreSizing.getMemStoreSize();
2620    LOG.info("Flushing " + storesToFlush.size() + "/" + stores.size() + " column families," +
2621        " dataSize=" + StringUtils.byteDesc(mss.getDataSize()) +
2622        " heapSize=" + StringUtils.byteDesc(mss.getHeapSize()) +
2623        ((perCfExtras != null && perCfExtras.length() > 0)? perCfExtras.toString(): "") +
2624        ((wal != null) ? "" : "; WAL is null, using passed sequenceid=" + sequenceId));
2625  }
2626
2627  private void doAbortFlushToWAL(final WAL wal, final long flushOpSeqId,
2628      final Map<byte[], List<Path>> committedFiles) {
2629    if (wal == null) return;
2630    try {
2631      FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH,
2632          getRegionInfo(), flushOpSeqId, committedFiles);
2633      WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, false,
2634          mvcc);
2635    } catch (Throwable t) {
2636      LOG.warn("Received unexpected exception trying to write ABORT_FLUSH marker to WAL:" +
2637          StringUtils.stringifyException(t));
2638      // ignore this since we will be aborting the RS with DSE.
2639    }
2640    // we have called wal.startCacheFlush(), now we have to abort it
2641    wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2642  }
2643
2644  /**
2645   * Sync unflushed WAL changes. See HBASE-8208 for details
2646   */
2647  private static void doSyncOfUnflushedWALChanges(final WAL wal, final RegionInfo hri)
2648  throws IOException {
2649    if (wal == null) {
2650      return;
2651    }
2652    try {
2653      wal.sync(); // ensure that flush marker is sync'ed
2654    } catch (IOException ioe) {
2655      wal.abortCacheFlush(hri.getEncodedNameAsBytes());
2656      throw ioe;
2657    }
2658  }
2659
2660  /**
2661   * @return True if the passed collection contains all families in the region.
2662   */
2663  private boolean isAllFamilies(Collection<HStore> families) {
2664    return families == null || this.stores.size() == families.size();
2665  }
2666
2667  /**
2668   * Writes a marker to the WAL indicating that a flush is requested but cannot be completed for
2669   * various reasons. Ignores exceptions from the WAL. Returns whether the write succeeded.
2670   * @param wal the WAL to write the CANNOT_FLUSH marker to; may be null
2671   * @return whether the WAL write was successful
2672   */
2673  private boolean writeFlushRequestMarkerToWAL(WAL wal, boolean writeFlushWalMarker) {
2674    if (writeFlushWalMarker && wal != null && !writestate.readOnly) {
2675      FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.CANNOT_FLUSH,
2676        getRegionInfo(), -1, new TreeMap<>(Bytes.BYTES_COMPARATOR));
2677      try {
2678        WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, true,
2679            mvcc);
2680        return true;
2681      } catch (IOException e) {
2682        LOG.warn(getRegionInfo().getEncodedName() + " : "
2683            + "Received exception while trying to write the flush request to wal", e);
2684      }
2685    }
2686    return false;
2687  }
2688
2689  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY",
2690      justification="Intentional; notify is about completed flush")
2691  protected FlushResultImpl internalFlushCacheAndCommit(WAL wal, MonitoredTask status,
2692      PrepareFlushResult prepareResult, Collection<HStore> storesToFlush) throws IOException {
2693    // prepare flush context is carried via PrepareFlushResult
2694    TreeMap<byte[], StoreFlushContext> storeFlushCtxs = prepareResult.storeFlushCtxs;
2695    TreeMap<byte[], List<Path>> committedFiles = prepareResult.committedFiles;
2696    long startTime = prepareResult.startTime;
2697    long flushOpSeqId = prepareResult.flushOpSeqId;
2698    long flushedSeqId = prepareResult.flushedSeqId;
2699
2700    String s = "Flushing stores of " + this;
2701    status.setStatus(s);
2702    if (LOG.isTraceEnabled()) LOG.trace(s);
2703
2704    // Any failure from here on out will be catastrophic, requiring a server
2705    // restart so the wal content can be replayed and put back into the memstore.
2706    // Otherwise, the snapshot content, while backed up in the wal, will not
2707    // be part of the running server's state.
2708    boolean compactionRequested = false;
2709    long flushedOutputFileSize = 0;
2710    try {
2711      // A.  Flush memstore to all the HStores.
2712      // Keep running vector of all store files that includes both old and the
2713      // just-made new flush store file. The new flushed file is still in the
2714      // tmp directory.
2715
2716      for (StoreFlushContext flush : storeFlushCtxs.values()) {
2717        flush.flushCache(status);
2718      }
2719
2720      // Switch snapshot (in memstore) -> new hfile (thus causing
2721      // all the store scanners to reset/reseek).
2722      Iterator<HStore> it = storesToFlush.iterator();
2723      // storesToFlush and storeFlushCtxs are assumed to iterate in the same (column family) order
2724      for (StoreFlushContext flush : storeFlushCtxs.values()) {
2725        boolean needsCompaction = flush.commit(status);
2726        if (needsCompaction) {
2727          compactionRequested = true;
2728        }
2729        byte[] storeName = it.next().getColumnFamilyDescriptor().getName();
2730        List<Path> storeCommittedFiles = flush.getCommittedFiles();
2731        committedFiles.put(storeName, storeCommittedFiles);
2732        // Flush committed no files, indicating flush is empty or flush was canceled
2733        if (storeCommittedFiles == null || storeCommittedFiles.isEmpty()) {
2734          MemStoreSize storeFlushableSize = prepareResult.storeFlushableSize.get(storeName);
2735          prepareResult.totalFlushableSize.decMemStoreSize(storeFlushableSize);
2736        }
2737        flushedOutputFileSize += flush.getOutputFileSize();
2738      }
2739      storeFlushCtxs.clear();
2740
2741      // Set down the memstore size by amount of flush.
2742      MemStoreSize mss = prepareResult.totalFlushableSize.getMemStoreSize();
2743      this.decrMemStoreSize(mss);
2744
2745      if (wal != null) {
2746        // write flush marker to WAL. If fail, we should throw DroppedSnapshotException
2747        FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.COMMIT_FLUSH,
2748          getRegionInfo(), flushOpSeqId, committedFiles);
2749        WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, true,
2750            mvcc);
2751      }
2752    } catch (Throwable t) {
2753      // An exception here means that the snapshot was not persisted.
2754      // The wal needs to be replayed so its content is restored to memstore.
2755      // Currently, only a server restart will do this.
2756      // We used to only catch IOEs but its possible that we'd get other
2757      // exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch
2758      // all and sundry.
2759      if (wal != null) {
2760        try {
2761          FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH,
2762            getRegionInfo(), flushOpSeqId, committedFiles);
2763          WALUtil.writeFlushMarker(wal, this.replicationScope, getRegionInfo(), desc, false, mvcc);
2764        } catch (Throwable ex) {
2765          LOG.warn(getRegionInfo().getEncodedName() + " : "
2766              + "failed writing ABORT_FLUSH marker to WAL", ex);
2767          // ignore this since we will be aborting the RS with DSE.
2768        }
2769        wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2770      }
2771      DroppedSnapshotException dse = new DroppedSnapshotException("region: " +
2772          Bytes.toStringBinary(getRegionInfo().getRegionName()));
2773      dse.initCause(t);
2774      status.abort("Flush failed: " + StringUtils.stringifyException(t));
2775
2776      // Callers for flushcache() should catch DroppedSnapshotException and abort the region server.
2777      // However, since we may have the region read lock, we cannot call close(true) here since
2778      // we cannot promote to a write lock. Instead we are setting closing so that all other region
2779      // operations except for close will be rejected.
2780      this.closing.set(true);
2781
2782      if (rsServices != null) {
2783        // This is a safeguard against the case where the caller fails to explicitly handle aborting
2784        rsServices.abort("Replay of WAL required. Forcing server shutdown", dse);
2785      }
2786
2787      throw dse;
2788    }
2789
2790    // If we get to here, the HStores have been written.
2791    if (wal != null) {
2792      wal.completeCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2793    }
2794
2795    // Record latest flush time
2796    for (HStore store: storesToFlush) {
2797      this.lastStoreFlushTimeMap.put(store, startTime);
2798    }
2799
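    // flushedSeqId: all remaining memstore edits have sequence ids greater than this.
    // flushOpSeqId: the sequence id that was assigned to this flush operation itself.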
2800    this.maxFlushedSeqId = flushedSeqId;
2801    this.lastFlushOpSeqId = flushOpSeqId;
2802
2803    // C. Finally notify anyone waiting on memstore to clear:
2804    // e.g. checkResources().
2805    synchronized (this) {
2806      notifyAll(); // FindBugs NN_NAKED_NOTIFY
2807    }
2808
2809    long time = EnvironmentEdgeManager.currentTime() - startTime;
2810    MemStoreSize mss = prepareResult.totalFlushableSize.getMemStoreSize();
2811    long memstoresize = this.memStoreSizing.getMemStoreSize().getDataSize();
2812    String msg = "Finished flush of"
2813        + " dataSize ~" + StringUtils.byteDesc(mss.getDataSize()) + "/" + mss.getDataSize()
2814        + ", heapSize ~" + StringUtils.byteDesc(mss.getHeapSize()) + "/" + mss.getHeapSize()
2815        + ", currentSize=" + StringUtils.byteDesc(memstoresize) + "/" + memstoresize
2816        + " for " + this.getRegionInfo().getEncodedName() + " in " + time + "ms, sequenceid="
2817        + flushOpSeqId +  ", compaction requested=" + compactionRequested
2818        + ((wal == null) ? "; wal=null" : "");
2819    LOG.info(msg);
2820    status.setStatus(msg);
2821
2822    if (rsServices != null && rsServices.getMetrics() != null) {
2823      rsServices.getMetrics().updateFlush(time,
2824          mss.getDataSize(), flushedOutputFileSize);
2825    }
2826
2827    return new FlushResultImpl(compactionRequested ?
2828        FlushResult.Result.FLUSHED_COMPACTION_NEEDED :
2829          FlushResult.Result.FLUSHED_NO_COMPACTION_NEEDED, flushOpSeqId);
2830  }
2831
2832  /**
2833   * Method to safely get the next sequence number.
2834   * @return Next sequence number unassociated with any actual edit.
2835   * @throws IOException
2836   */
2837  @VisibleForTesting
2838  protected long getNextSequenceId(final WAL wal) throws IOException {
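    // Begin and immediately complete an mvcc transaction; the write number it received is a
    // sequence id that is not tied to any actual edit.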
2839    WriteEntry we = mvcc.begin();
2840    mvcc.completeAndWait(we);
2841    return we.getWriteNumber();
2842  }
2843
2844  //////////////////////////////////////////////////////////////////////////////
2845  // get() methods for client use.
2846  //////////////////////////////////////////////////////////////////////////////
2847
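  // Illustrative internal usage of the scanner API below (a sketch; 'region', 'cf' and the Scan
  // setup are hypothetical, not taken from this file):
  //   try (RegionScannerImpl scanner = region.getScanner(new Scan().addFamily(cf))) {
  //     List<Cell> cells = new ArrayList<>();
  //     boolean moreRows = scanner.next(cells); // next() fills 'cells' with one row at a time
  //   }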
2848  @Override
2849  public RegionScannerImpl getScanner(Scan scan) throws IOException {
2850    return getScanner(scan, null);
2851  }
2852
2853  @Override
2854  public RegionScannerImpl getScanner(Scan scan, List<KeyValueScanner> additionalScanners)
2855      throws IOException {
2856    return getScanner(scan, additionalScanners, HConstants.NO_NONCE, HConstants.NO_NONCE);
2857  }
2858
2859  private RegionScannerImpl getScanner(Scan scan, List<KeyValueScanner> additionalScanners,
2860      long nonceGroup, long nonce) throws IOException {
2861    startRegionOperation(Operation.SCAN);
2862    try {
2863      // Verify families are all valid
2864      if (!scan.hasFamilies()) {
2865        // Adding all families to scanner
2866        for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) {
2867          scan.addFamily(family);
2868        }
2869      } else {
2870        for (byte[] family : scan.getFamilyMap().keySet()) {
2871          checkFamily(family);
2872        }
2873      }
2874      return instantiateRegionScanner(scan, additionalScanners, nonceGroup, nonce);
2875    } finally {
2876      closeRegionOperation(Operation.SCAN);
2877    }
2878  }
2879
2880  protected RegionScanner instantiateRegionScanner(Scan scan,
2881      List<KeyValueScanner> additionalScanners) throws IOException {
2882    return instantiateRegionScanner(scan, additionalScanners, HConstants.NO_NONCE,
2883      HConstants.NO_NONCE);
2884  }
2885
2886  protected RegionScannerImpl instantiateRegionScanner(Scan scan,
2887      List<KeyValueScanner> additionalScanners, long nonceGroup, long nonce) throws IOException {
2888    if (scan.isReversed()) {
2889      if (scan.getFilter() != null) {
2890        scan.getFilter().setReversed(true);
2891      }
2892      return new ReversedRegionScannerImpl(scan, additionalScanners, this);
2893    }
2894    return new RegionScannerImpl(scan, additionalScanners, this, nonceGroup, nonce);
2895  }
2896
2897  /**
2898   * Prepare a delete for a row mutation processor
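   * <p>Illustrative sketch ({@code row} and {@code region} are hypothetical, not from this file):</p>
   * <pre>{@code
   * Delete d = new Delete(row);   // no column families specified
   * region.prepareDelete(d);      // d now carries a family-level delete for every column family
   * }</pre>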
2899   * @param delete The passed delete is modified by this method. WARNING!
2900   * @throws IOException
2901   */
2902  public void prepareDelete(Delete delete) throws IOException {
2903    // Check to see if this is a delete of the entire row
2904    if (delete.getFamilyCellMap().isEmpty()) {
2905      for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) {
2906        // Don't eat the timestamp
2907        delete.addFamily(family, delete.getTimestamp());
2908      }
2909    } else {
2910      for (byte[] family : delete.getFamilyCellMap().keySet()) {
2911        if (family == null) {
2912          throw new NoSuchColumnFamilyException("Empty family is invalid");
2913        }
2914        checkFamily(family);
2915      }
2916    }
2917  }
2918
2919  @Override
2920  public void delete(Delete delete) throws IOException {
2921    checkReadOnly();
2922    checkResources();
2923    startRegionOperation(Operation.DELETE);
2924    try {
2925      // All edits for the given row (across all column families) must happen atomically.
2926      doBatchMutate(delete);
2927    } finally {
2928      closeRegionOperation(Operation.DELETE);
2929    }
2930  }
2931
2932  /**
2933   * Row needed by below method.
2934   */
2935  private static final byte [] FOR_UNIT_TESTS_ONLY = Bytes.toBytes("ForUnitTestsOnly");
2936
2937  /**
2938   * This is used only by unit tests. Not required to be a public API.
2939   * @param familyMap map of family to edits for the given family.
2940   * @throws IOException
2941   */
2942  void delete(NavigableMap<byte[], List<Cell>> familyMap,
2943      Durability durability) throws IOException {
2944    Delete delete = new Delete(FOR_UNIT_TESTS_ONLY);
2945    delete.setFamilyCellMap(familyMap);
2946    delete.setDurability(durability);
2947    doBatchMutate(delete);
2948  }
2949
2950  /**
2951   * Set up correct timestamps in the KVs in Delete object.
2952   * <p>Caller should have the row and region locks.
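   * <p>Illustrative sketch ({@code region}, {@code row}, {@code family}, {@code qual} and
   * {@code now} are hypothetical):</p>
   * <pre>{@code
   * Delete d = new Delete(row).addColumn(family, qual);  // delete latest version, no timestamp given
   * region.prepareDeleteTimestamps(d, d.getFamilyCellMap(), Bytes.toBytes(now));
   * // the delete cell now carries the timestamp of the newest existing version of family:qual,
   * // or 'now' if no such version exists
   * }</pre>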
2953   * @param mutation the mutation whose delete cells are being prepared
2954   * @param familyMap map of column family name to the cells of the mutation
2955   * @param byteNow the current time, as bytes, used as the fallback timestamp
2956   * @throws IOException if looking up the latest existing version of a cell fails
2957   */
2958  public void prepareDeleteTimestamps(Mutation mutation, Map<byte[], List<Cell>> familyMap,
2959      byte[] byteNow) throws IOException {
2960    for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
2961
2962      byte[] family = e.getKey();
2963      List<Cell> cells = e.getValue();
2964      assert cells instanceof RandomAccess;
2965
2966      Map<byte[], Integer> kvCount = new TreeMap<>(Bytes.BYTES_COMPARATOR);
2967      int listSize = cells.size();
2968      for (int i=0; i < listSize; i++) {
2969        Cell cell = cells.get(i);
2970        //  Check if time is LATEST, change to time of most recent addition if so
2971        //  This is expensive.
2972        if (cell.getTimestamp() == HConstants.LATEST_TIMESTAMP
2973            && PrivateCellUtil.isDeleteType(cell)) {
2974          byte[] qual = CellUtil.cloneQualifier(cell);
2975
2976          Integer count = kvCount.get(qual);
2977          if (count == null) {
2978            kvCount.put(qual, 1);
2979          } else {
2980            kvCount.put(qual, count + 1);
2981          }
2982          count = kvCount.get(qual);
2983
2984          Get get = new Get(CellUtil.cloneRow(cell));
2985          get.setMaxVersions(count);
2986          get.addColumn(family, qual);
2987          if (coprocessorHost != null) {
2988            if (!coprocessorHost.prePrepareTimeStampForDeleteVersion(mutation, cell,
2989                byteNow, get)) {
2990              updateDeleteLatestVersionTimestamp(cell, get, count, byteNow);
2991            }
2992          } else {
2993            updateDeleteLatestVersionTimestamp(cell, get, count, byteNow);
2994          }
2995        } else {
2996          PrivateCellUtil.updateLatestStamp(cell, byteNow);
2997        }
2998      }
2999    }
3000  }
3001
3002  void updateDeleteLatestVersionTimestamp(Cell cell, Get get, int count, byte[] byteNow)
3003      throws IOException {
3004    List<Cell> result = get(get, false);
3005
3006    if (result.size() < count) {
3007      // Nothing to delete
3008      PrivateCellUtil.updateLatestStamp(cell, byteNow);
3009      return;
3010    }
3011    if (result.size() > count) {
3012      throw new RuntimeException("Unexpected size: " + result.size());
3013    }
3014    Cell getCell = result.get(count - 1);
3015    PrivateCellUtil.setTimestamp(cell, getCell.getTimestamp());
3016  }
3017
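  // Single-row write sketch (illustrative; 'region', 'row', 'family', 'qual' and 'value' are hypothetical):
  //   Put p = new Put(row).addColumn(family, qual, value);
  //   region.put(p);   // all cells of p for this row are applied atomically via doBatchMutate()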
3018  @Override
3019  public void put(Put put) throws IOException {
3020    checkReadOnly();
3021
3022    // Do a rough check that we have resources to accept a write.  The check is
3023    // 'rough' in that between the resource check and the call to obtain a
3024    // read lock, resources may run out.  For now, the thought is that this
3025    // will be extremely rare; we'll deal with it when it happens.
3026    checkResources();
3027    startRegionOperation(Operation.PUT);
3028    try {
3029      // All edits for the given row (across all column families) must happen atomically.
3030      doBatchMutate(put);
3031    } finally {
3032      closeRegionOperation(Operation.PUT);
3033    }
3034  }
3035
3036  /**
3037   * Class that tracks the progress of a batch of operations, accumulating status codes and tracking
3038   * the index at which processing is proceeding. These batch operations may get split into
3039   * mini-batches for processing.
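   * <p>A rough sketch of the lifecycle, as driven by batchMutate(BatchOperation) and
   * doMiniBatchMutate(BatchOperation): checkAndPrepare() is called once, then mini-batches are
   * processed via lockRowsAndBuildMiniBatch(), prepareMiniBatchOperations(), buildWALEdits(),
   * writeMiniBatchOperationsToMemStore() and completeMiniBatchOperations() until isDone().</p>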
3040   */
3041  private abstract static class BatchOperation<T> {
3042    protected final T[] operations;
3043    protected final OperationStatus[] retCodeDetails;
3044    protected final WALEdit[] walEditsFromCoprocessors;
3045    // reference family cell maps directly so coprocessors can mutate them if desired
3046    protected final Map<byte[], List<Cell>>[] familyCellMaps;
3047
3048    protected final HRegion region;
3049    protected int nextIndexToProcess = 0;
3050    protected final ObservedExceptionsInBatch observedExceptions;
3051    // Durability of the batch (highest durability of all operations)
3052    protected Durability durability;
3053    protected boolean atomic = false;
3054
3055    public BatchOperation(final HRegion region, T[] operations) {
3056      this.operations = operations;
3057      this.retCodeDetails = new OperationStatus[operations.length];
3058      Arrays.fill(this.retCodeDetails, OperationStatus.NOT_RUN);
3059      this.walEditsFromCoprocessors = new WALEdit[operations.length];
3060      familyCellMaps = new Map[operations.length];
3061
3062      this.region = region;
3063      observedExceptions = new ObservedExceptionsInBatch();
3064      durability = Durability.USE_DEFAULT;
3065    }
3066
3067    /**
3068     * Visitor interface for batch operations
3069     */
3070    @FunctionalInterface
3071    public interface Visitor {
3072      /**
3073       * @param index operation index
3074       * @return true to continue visiting the remaining entries, false to stop
3075       */
3076      boolean visit(int index) throws IOException;
3077    }
3078
3079    /**
3080     * Helper method for visiting pending or all batch operations.
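     * <p>Illustrative sketch of a caller (hypothetical body):</p>
     * <pre>{@code
     * visitBatchOperations(true, size(), index -> {
     *   Mutation m = getMutation(index);
     *   // ... inspect or prepare m ...
     *   return true;   // keep visiting; return false to stop early
     * });
     * }</pre>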
3081     */
3082    public void visitBatchOperations(boolean pendingOnly, int lastIndexExclusive, Visitor visitor)
3083        throws IOException {
3084      assert lastIndexExclusive <= this.size();
3085      for (int i = nextIndexToProcess; i < lastIndexExclusive; i++) {
3086        if (!pendingOnly || isOperationPending(i)) {
3087          if (!visitor.visit(i)) {
3088            break;
3089          }
3090        }
3091      }
3092    }
3093
3094    public abstract Mutation getMutation(int index);
3095
3096    public abstract long getNonceGroup(int index);
3097
3098    public abstract long getNonce(int index);
3099
3100    /**
3101     * This method is potentially expensive and is useful mostly for the non-replay CP path.
3102     */
3103    public abstract Mutation[] getMutationsForCoprocs();
3104
3105    public abstract boolean isInReplay();
3106
3107    public abstract long getOrigLogSeqNum();
3108
3109    public abstract void startRegionOperation() throws IOException;
3110
3111    public abstract void closeRegionOperation() throws IOException;
3112
3113    /**
3114     * Validates each mutation and prepares the batch for write. If necessary (non-replay case), runs
3115     * the CP prePut()/preDelete() hooks for all mutations in the batch. This is intended to operate
3116     * on the entire batch and is called from outside of the class to check and prepare the batch.
3117     * This can be implemented by calling the helper method {@link #checkAndPrepareMutation(int, long)}
3118     * in a 'for' loop over mutations.
3119     */
3120    public abstract void checkAndPrepare() throws IOException;
3121
3122    /**
3123     * Implement any Put request specific check and prepare logic here. Please refer to
3124     * {@link #checkAndPrepareMutation(Mutation, long)} for how it is used.
3125     */
3126    protected abstract void checkAndPreparePut(final Put p) throws IOException;
3127
3128    /**
3129     * If necessary, calls preBatchMutate() CP hook for a mini-batch and updates metrics, cell
3130     * count, tags and timestamp for all cells of all operations in a mini-batch.
3131     */
3132    public abstract void prepareMiniBatchOperations(MiniBatchOperationInProgress<Mutation>
3133        miniBatchOp, long timestamp, final List<RowLock> acquiredRowLocks) throws IOException;
3134
3135    /**
3136     * Write mini-batch operations to MemStore
3137     */
3138    public abstract WriteEntry writeMiniBatchOperationsToMemStore(
3139        final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry)
3140        throws IOException;
3141
3142    protected void writeMiniBatchOperationsToMemStore(
3143        final MiniBatchOperationInProgress<Mutation> miniBatchOp, final long writeNumber)
3144        throws IOException {
3145      MemStoreSizing memStoreAccounting = new NonThreadSafeMemStoreSizing();
3146      visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> {
3147        // We need to update the sequence id here for the following reasons:
3148        // 1) If the op is in replay mode, FSWALEntry#stampRegionSequenceId won't stamp the sequence id.
3149        // 2) If there is no WAL (Durability.SKIP_WAL), FSWALEntry won't be used at all.
3150        // Note that we use the durability of the original mutation, even for mutations passed by a CP.
3151        if (isInReplay() || getMutation(index).getDurability() == Durability.SKIP_WAL) {
3152          region.updateSequenceId(familyCellMaps[index].values(), writeNumber);
3153        }
3154        applyFamilyMapToMemStore(familyCellMaps[index], memStoreAccounting);
3155        return true;
3156      });
3157      // update memStore size
3158      region.incMemStoreSize(memStoreAccounting.getDataSize(), memStoreAccounting.getHeapSize(),
3159        memStoreAccounting.getOffHeapSize(), memStoreAccounting.getCellsCount());
3160    }
3161
3162    public boolean isDone() {
3163      return nextIndexToProcess == operations.length;
3164    }
3165
3166    public int size() {
3167      return operations.length;
3168    }
3169
3170    public boolean isOperationPending(int index) {
3171      return retCodeDetails[index].getOperationStatusCode() == OperationStatusCode.NOT_RUN;
3172    }
3173
3174    public List<UUID> getClusterIds() {
3175      assert size() != 0;
3176      return getMutation(0).getClusterIds();
3177    }
3178
3179    boolean isAtomic() {
3180      return atomic;
3181    }
3182
3183    /**
3184     * Helper method that checks and prepares only one mutation. This can be used to implement
3185     * {@link #checkAndPrepare()} for entire Batch.
3186     * NOTE: As CP prePut()/ preDelete() hooks may modify mutations, this method should be called
3187     * after the prePut()/preDelete() CP hooks have been run for the mutation.
3188     */
3189    protected void checkAndPrepareMutation(Mutation mutation, final long timestamp)
3190        throws IOException {
3191      region.checkRow(mutation.getRow(), "batchMutate");
3192      if (mutation instanceof Put) {
3193        // Check the families in the put. If bad, skip this one.
3194        checkAndPreparePut((Put) mutation);
3195        region.checkTimestamps(mutation.getFamilyCellMap(), timestamp);
3196      } else {
3197        region.prepareDelete((Delete) mutation);
3198      }
3199    }
3200
3201    protected void checkAndPrepareMutation(int index, long timestamp) throws IOException {
3202      Mutation mutation = getMutation(index);
3203      try {
3204        this.checkAndPrepareMutation(mutation, timestamp);
3205
3206        // store the family map reference to allow for mutations
3207        familyCellMaps[index] = mutation.getFamilyCellMap();
3208        // store durability for the batch (highest durability of all operations in the batch)
3209        Durability tmpDur = region.getEffectiveDurability(mutation.getDurability());
3210        if (tmpDur.ordinal() > durability.ordinal()) {
3211          durability = tmpDur;
3212        }
3213      } catch (NoSuchColumnFamilyException nscfe) {
3214        final String msg = "No such column family in batch mutation. ";
3215        if (observedExceptions.hasSeenNoSuchFamily()) {
3216          LOG.warn(msg + nscfe.getMessage());
3217        } else {
3218          LOG.warn(msg, nscfe);
3219          observedExceptions.sawNoSuchFamily();
3220        }
3221        retCodeDetails[index] = new OperationStatus(
3222            OperationStatusCode.BAD_FAMILY, nscfe.getMessage());
3223        if (isAtomic()) { // fail, atomic means all or none
3224          throw nscfe;
3225        }
3226      } catch (FailedSanityCheckException fsce) {
3227        final String msg = "Batch Mutation did not pass sanity check. ";
3228        if (observedExceptions.hasSeenFailedSanityCheck()) {
3229          LOG.warn(msg + fsce.getMessage());
3230        } else {
3231          LOG.warn(msg, fsce);
3232          observedExceptions.sawFailedSanityCheck();
3233        }
3234        retCodeDetails[index] = new OperationStatus(
3235            OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage());
3236        if (isAtomic()) {
3237          throw fsce;
3238        }
3239      } catch (WrongRegionException we) {
3240        final String msg = "Batch mutation had a row that does not belong to this region. ";
3241        if (observedExceptions.hasSeenWrongRegion()) {
3242          LOG.warn(msg + we.getMessage());
3243        } else {
3244          LOG.warn(msg, we);
3245          observedExceptions.sawWrongRegion();
3246        }
3247        retCodeDetails[index] = new OperationStatus(
3248            OperationStatusCode.SANITY_CHECK_FAILURE, we.getMessage());
3249        if (isAtomic()) {
3250          throw we;
3251        }
3252      }
3253    }
3254
3255    /**
3256     * Creates a mini-batch of all operations in [nextIndexToProcess, lastIndexExclusive) for which
3257     * a row lock can be acquired. All mutations with locked rows are considered to be
3258     * in-progress operations and hence the name {@link MiniBatchOperationInProgress}. A mini-batch
3259     * is a window over {@link BatchOperation} and contains contiguous pending operations.
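     * <p>For atomic batches an exclusive row lock is taken and any failure to acquire it fails the
     * whole batch; for non-atomic batches a shared lock is taken and the mini-batch simply stops
     * growing at the first row that cannot be locked, or once {@code region.miniBatchSize}
     * operations are ready to write.</p>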
3260     *
3261     * @param acquiredRowLocks keeps track of rowLocks acquired.
3262     */
3263    public MiniBatchOperationInProgress<Mutation> lockRowsAndBuildMiniBatch(
3264        List<RowLock> acquiredRowLocks) throws IOException {
3265      int readyToWriteCount = 0;
3266      int lastIndexExclusive = 0;
3267      RowLock prevRowLock = null;
3268      for (; lastIndexExclusive < size(); lastIndexExclusive++) {
3269        // Once we reach the miniBatchSize, stop here and process the miniBatch.
3270        // This only applies to non-atomic batch operations.
3271        if (!isAtomic() && (readyToWriteCount == region.miniBatchSize)) {
3272          break;
3273        }
3274
3275        if (!isOperationPending(lastIndexExclusive)) {
3276          continue;
3277        }
3278        Mutation mutation = getMutation(lastIndexExclusive);
3279        // If we haven't got any rows in our batch, we should block to get the next one.
3280        RowLock rowLock = null;
3281        try {
3282          // if atomic then get exclusive lock, else shared lock
3283          rowLock = region.getRowLockInternal(mutation.getRow(), !isAtomic(), prevRowLock);
3284        } catch (TimeoutIOException | InterruptedIOException e) {
3285          // NOTE: We will retry on other exceptions, but we should stop if we receive a
3286          // TimeoutIOException or an InterruptedIOException, as the operation has timed out or
3287          // been interrupted, respectively.
3288          throw e;
3289        } catch (IOException ioe) {
3290          LOG.warn("Failed getting lock, row=" + Bytes.toStringBinary(mutation.getRow()), ioe);
3291          if (isAtomic()) { // fail, atomic means all or none
3292            throw ioe;
3293          }
3294        }
3295        if (rowLock == null) {
3296          // We failed to grab another lock
3297          if (isAtomic()) {
3298            throw new IOException("Can't apply all operations atomically!");
3299          }
3300          break; // Stop acquiring more rows for this batch
3301        } else {
3302          if (rowLock != prevRowLock) {
3303            // It is a different row now, add this to the acquiredRowLocks and
3304            // set prevRowLock to the new returned rowLock
3305            acquiredRowLocks.add(rowLock);
3306            prevRowLock = rowLock;
3307          }
3308        }
3309
3310        readyToWriteCount++;
3311      }
3312      return createMiniBatch(lastIndexExclusive, readyToWriteCount);
3313    }
3314
3315    protected MiniBatchOperationInProgress<Mutation> createMiniBatch(final int lastIndexExclusive,
3316        final int readyToWriteCount) {
3317      return new MiniBatchOperationInProgress<>(getMutationsForCoprocs(), retCodeDetails,
3318          walEditsFromCoprocessors, nextIndexToProcess, lastIndexExclusive, readyToWriteCount);
3319    }
3320
3321    /**
3322     * Builds a separate WALEdit per nonce by applying the input mutations. If WALEdits from CPs
3323     * are present, they are merged into the resulting WALEdit.
3324     */
3325    public List<Pair<NonceKey, WALEdit>> buildWALEdits(
3326        final MiniBatchOperationInProgress<Mutation> miniBatchOp) throws IOException {
3327      List<Pair<NonceKey, WALEdit>> walEdits = new ArrayList<>();
3328
3329      visitBatchOperations(true, nextIndexToProcess + miniBatchOp.size(), new Visitor() {
3330        private Pair<NonceKey, WALEdit> curWALEditForNonce;
3331
3332        @Override
3333        public boolean visit(int index) throws IOException {
3334          Mutation m = getMutation(index);
3335          // we use durability of the original mutation for the mutation passed by CP.
3336          if (region.getEffectiveDurability(m.getDurability()) == Durability.SKIP_WAL) {
3337            region.recordMutationWithoutWal(m.getFamilyCellMap());
3338            return true;
3339          }
3340
3341          // the batch may contain multiple nonce keys (replay case). If so, write WALEdit for each.
3342          // Given how nonce keys are originally written, these should be contiguous.
3343          // They don't have to be; it will still work, we will just write more WALEdits than needed.
3344          long nonceGroup = getNonceGroup(index);
3345          long nonce = getNonce(index);
3346          if (curWALEditForNonce == null ||
3347              curWALEditForNonce.getFirst().getNonceGroup() != nonceGroup ||
3348              curWALEditForNonce.getFirst().getNonce() != nonce) {
3349            curWALEditForNonce = new Pair<>(new NonceKey(nonceGroup, nonce),
3350                new WALEdit(miniBatchOp.getCellCount(), isInReplay()));
3351            walEdits.add(curWALEditForNonce);
3352          }
3353          WALEdit walEdit = curWALEditForNonce.getSecond();
3354
3355          // Add WAL edits from CPs.
3356          WALEdit fromCP = walEditsFromCoprocessors[index];
3357          if (fromCP != null) {
3358            for (Cell cell : fromCP.getCells()) {
3359              walEdit.add(cell);
3360            }
3361          }
3362          walEdit.add(familyCellMaps[index]);
3363
3364          return true;
3365        }
3366      });
3367      return walEdits;
3368    }
3369
3370    /**
3371     * This method completes mini-batch operations by calling postBatchMutate() CP hook (if
3372     * required) and completing mvcc.
3373     */
3374    public void completeMiniBatchOperations(
3375        final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry)
3376        throws IOException {
3377      if (writeEntry != null) {
3378        region.mvcc.completeAndWait(writeEntry);
3379      }
3380    }
3381
3382    public void doPostOpCleanupForMiniBatch(
3383        final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WALEdit walEdit,
3384        boolean success) throws IOException {}
3385
3386    /**
3387     * Atomically apply the given map of family->edits to the memstore.
3388     * This handles the consistency control on its own, but the caller
3389     * should already have locked updatesLock.readLock(). This also does
3390     * <b>not</b> check the families for validity.
3391     *
3392     * @param familyMap Map of Cells by family
3393     */
3394    protected void applyFamilyMapToMemStore(Map<byte[], List<Cell>> familyMap,
3395        MemStoreSizing memstoreAccounting) throws IOException {
3396      for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
3397        byte[] family = e.getKey();
3398        List<Cell> cells = e.getValue();
3399        assert cells instanceof RandomAccess;
3400        region.applyToMemStore(region.getStore(family), cells, false, memstoreAccounting);
3401      }
3402    }
3403  }
3404
3405
3406  /**
3407   * Batch of mutation operations. Base class is shared with {@link ReplayBatchOperation} as most
3408   * of the logic is the same.
3409   */
3410  static class MutationBatchOperation extends BatchOperation<Mutation> {
3411    private long nonceGroup;
3412    private long nonce;
3413    public MutationBatchOperation(final HRegion region, Mutation[] operations, boolean atomic,
3414        long nonceGroup, long nonce) {
3415      super(region, operations);
3416      this.atomic = atomic;
3417      this.nonceGroup = nonceGroup;
3418      this.nonce = nonce;
3419    }
3420
3421    @Override
3422    public Mutation getMutation(int index) {
3423      return this.operations[index];
3424    }
3425
3426    @Override
3427    public long getNonceGroup(int index) {
3428      return nonceGroup;
3429    }
3430
3431    @Override
3432    public long getNonce(int index) {
3433      return nonce;
3434    }
3435
3436    @Override
3437    public Mutation[] getMutationsForCoprocs() {
3438      return this.operations;
3439    }
3440
3441    @Override
3442    public boolean isInReplay() {
3443      return false;
3444    }
3445
3446    @Override
3447    public long getOrigLogSeqNum() {
3448      return SequenceId.NO_SEQUENCE_ID;
3449    }
3450
3451    @Override
3452    public void startRegionOperation() throws IOException {
3453      region.startRegionOperation(Operation.BATCH_MUTATE);
3454    }
3455
3456    @Override
3457    public void closeRegionOperation() throws IOException {
3458      region.closeRegionOperation(Operation.BATCH_MUTATE);
3459    }
3460
3461    @Override
3462    public void checkAndPreparePut(Put p) throws IOException {
3463      region.checkFamilies(p.getFamilyCellMap().keySet());
3464    }
3465
3466    @Override
3467    public void checkAndPrepare() throws IOException {
3468      final int[] metrics = {0, 0}; // index 0: puts, index 1: deletes
3469      visitBatchOperations(true, this.size(), new Visitor() {
3470        private long now = EnvironmentEdgeManager.currentTime();
3471        private WALEdit walEdit;
3472        @Override
3473        public boolean visit(int index) throws IOException {
3474          // Run coprocessor pre hook outside of locks to avoid deadlock
3475          if (region.coprocessorHost != null) {
3476            if (walEdit == null) {
3477              walEdit = new WALEdit();
3478            }
3479            callPreMutateCPHook(index, walEdit, metrics);
3480            if (!walEdit.isEmpty()) {
3481              walEditsFromCoprocessors[index] = walEdit;
3482              walEdit = null;
3483            }
3484          }
3485          if (isOperationPending(index)) {
3486            // TODO: Currently validation is done with current time before acquiring locks and
3487            // updates are done with different timestamps after acquiring locks. This behavior is
3488            // inherited from the code prior to this change. Can this be changed?
3489            checkAndPrepareMutation(index, now);
3490          }
3491          return true;
3492        }
3493      });
3494
3495      // FIXME: we may update metrics twice! here for all operations bypassed by CP and later in
3496      // normal processing.
3497      // Update metrics in same way as it is done when we go the normal processing route (we now
3498      // update general metrics though a Coprocessor did the work).
3499      if (region.metricsRegion != null) {
3500        if (metrics[0] > 0) {
3501          // There were some Puts in the batch.
3502          region.metricsRegion.updatePut();
3503        }
3504        if (metrics[1] > 0) {
3505          // There were some Deletes in the batch.
3506          region.metricsRegion.updateDelete();
3507        }
3508      }
3509    }
3510
3511    @Override
3512    public void prepareMiniBatchOperations(MiniBatchOperationInProgress<Mutation> miniBatchOp,
3513        long timestamp, final List<RowLock> acquiredRowLocks) throws IOException {
3514      byte[] byteTS = Bytes.toBytes(timestamp);
3515      visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> {
3516        Mutation mutation = getMutation(index);
3517        if (mutation instanceof Put) {
3518          region.updateCellTimestamps(familyCellMaps[index].values(), byteTS);
3519          miniBatchOp.incrementNumOfPuts();
3520        } else {
3521          region.prepareDeleteTimestamps(mutation, familyCellMaps[index], byteTS);
3522          miniBatchOp.incrementNumOfDeletes();
3523        }
3524        region.rewriteCellTags(familyCellMaps[index], mutation);
3525
3526        // update cell count
3527        if (region.getEffectiveDurability(mutation.getDurability()) != Durability.SKIP_WAL) {
3528          for (List<Cell> cells : mutation.getFamilyCellMap().values()) {
3529            miniBatchOp.addCellCount(cells.size());
3530          }
3531        }
3532
3533        WALEdit fromCP = walEditsFromCoprocessors[index];
3534        if (fromCP != null) {
3535          miniBatchOp.addCellCount(fromCP.size());
3536        }
3537        return true;
3538      });
3539
3540      if (region.coprocessorHost != null) {
3541        // calling the pre CP hook for batch mutation
3542        region.coprocessorHost.preBatchMutate(miniBatchOp);
3543        checkAndMergeCPMutations(miniBatchOp, acquiredRowLocks, timestamp);
3544      }
3545    }
3546
3547    @Override
3548    public List<Pair<NonceKey, WALEdit>> buildWALEdits(final MiniBatchOperationInProgress<Mutation>
3549        miniBatchOp) throws IOException {
3550      List<Pair<NonceKey, WALEdit>> walEdits = super.buildWALEdits(miniBatchOp);
3551      // for MutationBatchOperation, more than one nonce is not allowed
3552      if (walEdits.size() > 1) {
3553        throw new IOException("Found multiple nonce keys per batch!");
3554      }
3555      return walEdits;
3556    }
3557
3558    @Override
3559    public WriteEntry writeMiniBatchOperationsToMemStore(
3560        final MiniBatchOperationInProgress<Mutation> miniBatchOp, @Nullable WriteEntry writeEntry)
3561        throws IOException {
3562      if (writeEntry == null) {
3563        writeEntry = region.mvcc.begin();
3564      }
3565      super.writeMiniBatchOperationsToMemStore(miniBatchOp, writeEntry.getWriteNumber());
3566      return writeEntry;
3567    }
3568
3569    @Override
3570    public void completeMiniBatchOperations(
3571        final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry)
3572        throws IOException {
3573      // TODO: can it be done after completing mvcc?
3574      // calling the post CP hook for batch mutation
3575      if (region.coprocessorHost != null) {
3576        region.coprocessorHost.postBatchMutate(miniBatchOp);
3577      }
3578      super.completeMiniBatchOperations(miniBatchOp, writeEntry);
3579    }
3580
3581    @Override
3582    public void doPostOpCleanupForMiniBatch(MiniBatchOperationInProgress<Mutation> miniBatchOp,
3583        final WALEdit walEdit, boolean success) throws IOException {
3584      if (miniBatchOp != null) {
3585        // synced so that the coprocessor contract is adhered to.
3586        if (region.coprocessorHost != null) {
3587          visitBatchOperations(false, miniBatchOp.getLastIndexExclusive(), (int i) -> {
3588            // only for successful puts
3589            if (retCodeDetails[i].getOperationStatusCode() == OperationStatusCode.SUCCESS) {
3590              Mutation m = getMutation(i);
3591              if (m instanceof Put) {
3592                region.coprocessorHost.postPut((Put) m, walEdit, m.getDurability());
3593              } else {
3594                region.coprocessorHost.postDelete((Delete) m, walEdit, m.getDurability());
3595              }
3596            }
3597            return true;
3598          });
3599        }
3600
3601        // See if the column families were consistent through the whole thing.
3602        // if they were then keep them. If they were not then pass a null.
3603        // null will be treated as unknown.
3604        // Total time taken might be involving Puts and Deletes.
3605        // Split the time for puts and deletes based on the total number of Puts and Deletes.
3606        if (region.metricsRegion != null) {
3607          if (miniBatchOp.getNumOfPuts() > 0) {
3608            // There were some Puts in the batch.
3609            region.metricsRegion.updatePut();
3610          }
3611          if (miniBatchOp.getNumOfDeletes() > 0) {
3612            // There were some Deletes in the batch.
3613            region.metricsRegion.updateDelete();
3614          }
3615        }
3616      }
3617
3618      if (region.coprocessorHost != null) {
3619        // call the coprocessor hook to do any finalization steps after the put is done
3620        region.coprocessorHost.postBatchMutateIndispensably(
3621            miniBatchOp != null ? miniBatchOp : createMiniBatch(size(), 0), success);
3622      }
3623    }
3624
3625    /**
3626     * Runs prePut/ preDelete coprocessor hook for input mutation in a batch
3627     * @param metrics Array of 2 ints. index 0: count of puts and index 1: count of deletes
3628     */
3629    private void callPreMutateCPHook(int index, final WALEdit walEdit, final int[] metrics)
3630        throws IOException {
3631      Mutation m = getMutation(index);
3632      if (m instanceof Put) {
3633        if (region.coprocessorHost.prePut((Put) m, walEdit, m.getDurability())) {
3634          // pre hook says skip this Put
3635          // mark as success and skip in doMiniBatchMutation
3636          metrics[0]++;
3637          retCodeDetails[index] = OperationStatus.SUCCESS;
3638        }
3639      } else if (m instanceof Delete) {
3640        Delete curDel = (Delete) m;
3641        if (curDel.getFamilyCellMap().isEmpty()) {
3642          // handle deleting a row case
3643          // TODO: prepareDelete() has been called twice, before and after preDelete() CP hook.
3644          // Can this be avoided?
3645          region.prepareDelete(curDel);
3646        }
3647        if (region.coprocessorHost.preDelete(curDel, walEdit, m.getDurability())) {
3648          // pre hook says skip this Delete
3649          // mark as success and skip in doMiniBatchMutation
3650          metrics[1]++;
3651          retCodeDetails[index] = OperationStatus.SUCCESS;
3652        }
3653      } else {
3654        String msg = "Put/Delete mutations only supported in a batch";
3655        // In case of passing Append mutations along with the Puts and Deletes in batchMutate
3656        // mark the operation return code as failure so that it will not be considered in
3657        // the doMiniBatchMutation
3658        retCodeDetails[index] = new OperationStatus(OperationStatusCode.FAILURE, msg);
3659
3660        if (isAtomic()) { // fail, atomic means all or none
3661          throw new IOException(msg);
3662        }
3663      }
3664    }
3665
3666    private void checkAndMergeCPMutations(final MiniBatchOperationInProgress<Mutation> miniBatchOp,
3667        final List<RowLock> acquiredRowLocks, final long timestamp) throws IOException {
3668      visitBatchOperations(true, nextIndexToProcess + miniBatchOp.size(), (int i) -> {
3669        // we pass (i - nextIndexToProcess) below since the call expects a relative index
3670        Mutation[] cpMutations = miniBatchOp.getOperationsFromCoprocessors(i - nextIndexToProcess);
3671        if (cpMutations == null) {
3672          return true;
3673        }
3674        // Else Coprocessor added more Mutations corresponding to the Mutation at this index.
3675        Mutation mutation = getMutation(i);
3676        for (Mutation cpMutation : cpMutations) {
3677          this.checkAndPrepareMutation(cpMutation, timestamp);
3678
3679          // Acquire row locks. If not, the whole batch will fail.
3680          acquiredRowLocks.add(region.getRowLockInternal(cpMutation.getRow(), true, null));
3681
3682          // Returned mutations from coprocessor correspond to the Mutation at index i. We can
3683          // directly add the cells from those mutations to the familyMaps of this mutation.
3684          Map<byte[], List<Cell>> cpFamilyMap = cpMutation.getFamilyCellMap();
3685          region.rewriteCellTags(cpFamilyMap, mutation);
3686          // will get added to the memStore later
3687          mergeFamilyMaps(familyCellMaps[i], cpFamilyMap);
3688
3689          // The durability of the returned mutation is overridden by that of the corresponding
3690          // original mutation. If the original mutation uses SKIP_WAL, we shouldn't count the
3691          // cells of the returned mutation.
3692          if (region.getEffectiveDurability(mutation.getDurability()) != Durability.SKIP_WAL) {
3693            for (List<Cell> cells : cpFamilyMap.values()) {
3694              miniBatchOp.addCellCount(cells.size());
3695            }
3696          }
3697        }
3698        return true;
3699      });
3700    }
3701
3702    private void mergeFamilyMaps(Map<byte[], List<Cell>> familyMap,
3703        Map<byte[], List<Cell>> toBeMerged) {
3704      for (Map.Entry<byte[], List<Cell>> entry : toBeMerged.entrySet()) {
3705        List<Cell> cells = familyMap.get(entry.getKey());
3706        if (cells == null) {
3707          familyMap.put(entry.getKey(), entry.getValue());
3708        } else {
3709          cells.addAll(entry.getValue());
3710        }
3711      }
3712    }
3713  }
3714
3715  /**
3716   * Batch of mutations for replay. Base class is shared with {@link MutationBatchOperation} as most
3717   * of the logic is the same.
3718   */
3719  static class ReplayBatchOperation extends BatchOperation<MutationReplay> {
3720    private long origLogSeqNum = 0;
3721    public ReplayBatchOperation(final HRegion region, MutationReplay[] operations,
3722        long origLogSeqNum) {
3723      super(region, operations);
3724      this.origLogSeqNum = origLogSeqNum;
3725    }
3726
3727    @Override
3728    public Mutation getMutation(int index) {
3729      return this.operations[index].mutation;
3730    }
3731
3732    @Override
3733    public long getNonceGroup(int index) {
3734      return this.operations[index].nonceGroup;
3735    }
3736
3737    @Override
3738    public long getNonce(int index) {
3739      return this.operations[index].nonce;
3740    }
3741
3742    @Override
3743    public Mutation[] getMutationsForCoprocs() {
3744      return null;
3745    }
3746
3747    @Override
3748    public boolean isInReplay() {
3749      return true;
3750    }
3751
3752    @Override
3753    public long getOrigLogSeqNum() {
3754      return this.origLogSeqNum;
3755    }
3756
3757    @Override
3758    public void startRegionOperation() throws IOException {
3759      region.startRegionOperation(Operation.REPLAY_BATCH_MUTATE);
3760    }
3761
3762    @Override
3763    public void closeRegionOperation() throws IOException {
3764      region.closeRegionOperation(Operation.REPLAY_BATCH_MUTATE);
3765    }
3766
3767    /**
3768     * During replay, there could exist column families which were removed between the region
3769     * server failure and the replay.
3770     */
3771    @Override
3772    protected void checkAndPreparePut(Put p) throws IOException {
3773      Map<byte[], List<Cell>> familyCellMap = p.getFamilyCellMap();
3774      List<byte[]> nonExistentList = null;
3775      for (byte[] family : familyCellMap.keySet()) {
3776        if (!region.htableDescriptor.hasColumnFamily(family)) {
3777          if (nonExistentList == null) {
3778            nonExistentList = new ArrayList<>();
3779          }
3780          nonExistentList.add(family);
3781        }
3782      }
3783      if (nonExistentList != null) {
3784        for (byte[] family : nonExistentList) {
3785          // Perhaps schema was changed between crash and replay
3786          LOG.info("No family for " + Bytes.toString(family) + ", omitting it from replay.");
3787          familyCellMap.remove(family);
3788        }
3789      }
3790    }
3791
3792    @Override
3793    public void checkAndPrepare() throws IOException {
3794      long now = EnvironmentEdgeManager.currentTime();
3795      visitBatchOperations(true, this.size(), (int index) -> {
3796        checkAndPrepareMutation(index, now);
3797        return true;
3798      });
3799    }
3800
3801    @Override
3802    public void prepareMiniBatchOperations(MiniBatchOperationInProgress<Mutation> miniBatchOp,
3803        long timestamp, final List<RowLock> acquiredRowLocks) throws IOException {
3804      visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> {
3805        // update cell count
3806        for (List<Cell> cells : getMutation(index).getFamilyCellMap().values()) {
3807          miniBatchOp.addCellCount(cells.size());
3808        }
3809        return true;
3810      });
3811    }
3812
3813    @Override
3814    public WriteEntry writeMiniBatchOperationsToMemStore(
3815        final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry)
3816        throws IOException {
3817      super.writeMiniBatchOperationsToMemStore(miniBatchOp, getOrigLogSeqNum());
3818      return writeEntry;
3819    }
3820
3821    @Override
3822    public void completeMiniBatchOperations(
3823        final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry)
3824        throws IOException {
3825      super.completeMiniBatchOperations(miniBatchOp, writeEntry);
3826      region.mvcc.advanceTo(getOrigLogSeqNum());
3827    }
3828  }
3829
3830  public OperationStatus[] batchMutate(Mutation[] mutations, long nonceGroup, long nonce)
3831      throws IOException {
3832    return batchMutate(mutations, false, nonceGroup, nonce);
3833  }
3834
3835  public OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic, long nonceGroup,
3836      long nonce) throws IOException {
3837    // As it stands, this is used for the following:
3838    //  * batchMutate with single mutation - put/delete, separate or from checkAndMutate.
3839    //  * coprocessor calls (see ex. BulkDeleteEndpoint).
3840    // So nonces are not really ever used by HBase. They could be by coprocs, and checkAnd...
3841    return batchMutate(new MutationBatchOperation(this, mutations, atomic, nonceGroup, nonce));
3842  }
3843
3844  @Override
3845  public OperationStatus[] batchMutate(Mutation[] mutations) throws IOException {
3846    return batchMutate(mutations, HConstants.NO_NONCE, HConstants.NO_NONCE);
3847  }
3848
3849  public OperationStatus[] batchReplay(MutationReplay[] mutations, long replaySeqId)
3850      throws IOException {
3851    if (!RegionReplicaUtil.isDefaultReplica(getRegionInfo())
3852        && replaySeqId < lastReplayedOpenRegionSeqId) {
3853      // if it is a secondary replica we should ignore these entries silently
3854      // since they are coming out of order
3855      if (LOG.isTraceEnabled()) {
3856        LOG.trace(getRegionInfo().getEncodedName() + " : "
3857          + "Skipping " + mutations.length + " mutations with replaySeqId=" + replaySeqId
3858          + " which is < than lastReplayedOpenRegionSeqId=" + lastReplayedOpenRegionSeqId);
3859        for (MutationReplay mut : mutations) {
3860          LOG.trace(getRegionInfo().getEncodedName() + " : Skipping : " + mut.mutation);
3861        }
3862      }
3863
3864      OperationStatus[] statuses = new OperationStatus[mutations.length];
3865      for (int i = 0; i < statuses.length; i++) {
3866        statuses[i] = OperationStatus.SUCCESS;
3867      }
3868      return statuses;
3869    }
3870    return batchMutate(new ReplayBatchOperation(this, mutations, replaySeqId));
3871  }
3872
3873  /**
3874   * Perform a batch of mutations.
3875   *
3876   * It supports only Put and Delete mutations and will ignore other types passed. Operations in
3877   * a batch are stored with the highest durability specified among all operations in the batch,
3878   * except for {@link Durability#SKIP_WAL}.
3879   *
3880   * <p>This function is called from {@link #batchReplay(WALSplitUtil.MutationReplay[], long)} with
3881   * {@link ReplayBatchOperation} instance and {@link #batchMutate(Mutation[], long, long)} with
3882   * {@link MutationBatchOperation} instance as an argument. As the processing of replay batch
3883   * and mutation batch is very similar, lot of code is shared by providing generic methods in
3884   * base class {@link BatchOperation}. The logic for this method and
3885   * {@link #doMiniBatchMutate(BatchOperation)} is implemented using methods in base class which
3886   * are overridden by derived classes to implement special behavior.
3887   *
3888   * @param batchOp contains the list of mutations
3889   * @return an array of OperationStatus which internally contains the
3890   *         OperationStatusCode and the exceptionMessage if any.
3891   * @throws IOException if an IO problem is encountered
3892   */
3893  OperationStatus[] batchMutate(BatchOperation<?> batchOp) throws IOException {
3894    boolean initialized = false;
3895    batchOp.startRegionOperation();
3896    try {
3897      while (!batchOp.isDone()) {
3898        if (!batchOp.isInReplay()) {
3899          checkReadOnly();
3900        }
3901        checkResources();
3902
3903        if (!initialized) {
3904          this.writeRequestsCount.add(batchOp.size());
          // validate and prepare batch for write; for MutationBatchOperation it also calls CP
          // prePut()/preDelete() hooks
3907          batchOp.checkAndPrepare();
3908          initialized = true;
3909        }
3910        doMiniBatchMutate(batchOp);
3911        requestFlushIfNeeded();
3912      }
3913    } finally {
3914      batchOp.closeRegionOperation();
3915    }
3916    return batchOp.retCodeDetails;
3917  }
3918
3919  /**
   * Called to do a piece of the batch that came in to {@link #batchMutate(Mutation[], long, long)}.
   * In here we also handle replay of edits on region recovery. Also tracks the change in size
   * brought about by applying {@code batchOp}.
3923   */
3924  private void doMiniBatchMutate(BatchOperation<?> batchOp) throws IOException {
3925    boolean success = false;
3926    WALEdit walEdit = null;
3927    WriteEntry writeEntry = null;
3928    boolean locked = false;
3929    // We try to set up a batch in the range [batchOp.nextIndexToProcess,lastIndexExclusive)
3930    MiniBatchOperationInProgress<Mutation> miniBatchOp = null;
3931    /** Keep track of the locks we hold so we can release them in finally clause */
3932    List<RowLock> acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.size());
3933    try {
3934      // STEP 1. Try to acquire as many locks as we can and build mini-batch of operations with
3935      // locked rows
3936      miniBatchOp = batchOp.lockRowsAndBuildMiniBatch(acquiredRowLocks);
3937
3938      // We've now grabbed as many mutations off the list as we can
3939      // Ensure we acquire at least one.
3940      if (miniBatchOp.getReadyToWriteCount() <= 0) {
3941        // Nothing to put/delete -- an exception in the above such as NoSuchColumnFamily?
3942        return;
3943      }
3944
3945      lock(this.updatesLock.readLock(), miniBatchOp.getReadyToWriteCount());
3946      locked = true;
3947
      // STEP 2. Update mini batch of all operations in progress with LATEST_TIMESTAMP timestamp
3949      // We should record the timestamp only after we have acquired the rowLock,
3950      // otherwise, newer puts/deletes are not guaranteed to have a newer timestamp
3951      long now = EnvironmentEdgeManager.currentTime();
3952      batchOp.prepareMiniBatchOperations(miniBatchOp, now, acquiredRowLocks);
3953
3954      // STEP 3. Build WAL edit
3955      List<Pair<NonceKey, WALEdit>> walEdits = batchOp.buildWALEdits(miniBatchOp);
3956
3957      // STEP 4. Append the WALEdits to WAL and sync.
3958      for(Iterator<Pair<NonceKey, WALEdit>> it = walEdits.iterator(); it.hasNext();) {
3959        Pair<NonceKey, WALEdit> nonceKeyWALEditPair = it.next();
3960        walEdit = nonceKeyWALEditPair.getSecond();
3961        NonceKey nonceKey = nonceKeyWALEditPair.getFirst();
3962
3963        if (walEdit != null && !walEdit.isEmpty()) {
3964          writeEntry = doWALAppend(walEdit, batchOp.durability, batchOp.getClusterIds(), now,
3965              nonceKey.getNonceGroup(), nonceKey.getNonce(), batchOp.getOrigLogSeqNum());
3966        }
3967
3968        // Complete mvcc for all but last writeEntry (for replay case)
3969        if (it.hasNext() && writeEntry != null) {
3970          mvcc.complete(writeEntry);
3971          writeEntry = null;
3972        }
3973      }
3974
3975      // STEP 5. Write back to memStore
3976      // NOTE: writeEntry can be null here
3977      writeEntry = batchOp.writeMiniBatchOperationsToMemStore(miniBatchOp, writeEntry);
3978
3979      // STEP 6. Complete MiniBatchOperations: If required calls postBatchMutate() CP hook and
3980      // complete mvcc for last writeEntry
3981      batchOp.completeMiniBatchOperations(miniBatchOp, writeEntry);
3982      writeEntry = null;
3983      success = true;
3984    } finally {
      // Call complete rather than completeAndWait because we probably had an error if writeEntry != null
3986      if (writeEntry != null) mvcc.complete(writeEntry);
3987
3988      if (locked) {
3989        this.updatesLock.readLock().unlock();
3990      }
3991      releaseRowLocks(acquiredRowLocks);
3992
3993      final int finalLastIndexExclusive =
3994          miniBatchOp != null ? miniBatchOp.getLastIndexExclusive() : batchOp.size();
3995      final boolean finalSuccess = success;
3996      batchOp.visitBatchOperations(true, finalLastIndexExclusive, (int i) -> {
3997        batchOp.retCodeDetails[i] =
3998            finalSuccess ? OperationStatus.SUCCESS : OperationStatus.FAILURE;
3999        return true;
4000      });
4001
4002      batchOp.doPostOpCleanupForMiniBatch(miniBatchOp, walEdit, finalSuccess);
4003
4004      batchOp.nextIndexToProcess = finalLastIndexExclusive;
4005    }
4006  }
4007
4008  /**
4009   * Returns effective durability from the passed durability and
4010   * the table descriptor.
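   * <p>For example, {@code getEffectiveDurability(Durability.USE_DEFAULT)} resolves to the
   * region's default durability (derived from the table descriptor), while an explicit value
   * such as {@code Durability.SKIP_WAL} is returned unchanged.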
4011   */
4012  protected Durability getEffectiveDurability(Durability d) {
4013    return d == Durability.USE_DEFAULT ? this.regionDurability : d;
4014  }
4015
4016  @Override
4017  public boolean checkAndMutate(byte[] row, byte[] family, byte[] qualifier, CompareOperator op,
4018    ByteArrayComparable comparator, TimeRange timeRange, Mutation mutation) throws IOException {
4019    checkMutationType(mutation, row);
4020    return doCheckAndRowMutate(row, family, qualifier, op, comparator, timeRange, null, mutation);
4021  }
4022
4023  @Override
4024  public boolean checkAndRowMutate(byte[] row, byte[] family, byte[] qualifier, CompareOperator op,
4025    ByteArrayComparable comparator, TimeRange timeRange, RowMutations rm) throws IOException {
4026    return doCheckAndRowMutate(row, family, qualifier, op, comparator, timeRange, rm, null);
4027  }
4028
4029  /**
   * checkAndMutate and checkAndRowMutate are 90% the same. Rather than copy/paste, this method
   * handles both and branches in the few places where the two behaviors deviate.
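   *
   * <p>A sketch of how this is typically reached through {@link #checkAndMutate(byte[], byte[],
   * byte[], CompareOperator, ByteArrayComparable, TimeRange, Mutation)}; the {@code region}, row,
   * family, qualifier and value variables are illustrative only, and a {@code null} time range
   * means no time restriction is applied to the read:
   * <pre>{@code
   * Put put = new Put(row).addColumn(family, qualifier, newValue);
   * boolean applied = region.checkAndMutate(row, family, qualifier, CompareOperator.EQUAL,
   *     new BinaryComparator(expectedValue), null, put);
   * }</pre>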
4032   */
4033  private boolean doCheckAndRowMutate(byte[] row, byte[] family, byte[] qualifier,
4034    CompareOperator op, ByteArrayComparable comparator, TimeRange timeRange,
4035    RowMutations rowMutations, Mutation mutation)
4036  throws IOException {
4037    // Could do the below checks but seems wacky with two callers only. Just comment out for now.
4038    // One caller passes a Mutation, the other passes RowMutation. Presume all good so we don't
4039    // need these commented out checks.
4040    // if (rowMutations == null && mutation == null) throw new DoNotRetryIOException("Both null");
4041    // if (rowMutations != null && mutation != null) throw new DoNotRetryIOException("Both set");
4042    checkReadOnly();
    // TODO: add check for value length; also move this check to the client
4044    checkResources();
4045    startRegionOperation();
4046    try {
4047      Get get = new Get(row);
4048      checkFamily(family);
4049      get.addColumn(family, qualifier);
4050      if (timeRange != null) {
4051        get.setTimeRange(timeRange.getMin(), timeRange.getMax());
4052      }
4053      // Lock row - note that doBatchMutate will relock this row if called
4054      checkRow(row, "doCheckAndRowMutate");
4055      RowLock rowLock = getRowLockInternal(get.getRow(), false, null);
4056      try {
4057        if (mutation != null && this.getCoprocessorHost() != null) {
4058          // Call coprocessor.
4059          Boolean processed = null;
4060          if (mutation instanceof Put) {
4061            processed = this.getCoprocessorHost().preCheckAndPutAfterRowLock(row, family,
4062                qualifier, op, comparator, (Put)mutation);
4063          } else if (mutation instanceof Delete) {
4064            processed = this.getCoprocessorHost().preCheckAndDeleteAfterRowLock(row, family,
4065                qualifier, op, comparator, (Delete)mutation);
4066          }
4067          if (processed != null) {
4068            return processed;
4069          }
4070        }
4071        // NOTE: We used to wait here until mvcc caught up:  mvcc.await();
        // Supposition is that now all changes are done under row locks, so when we go to read,
        // we'll get the latest on this row.
4074        List<Cell> result = get(get, false);
4075        boolean valueIsNull = comparator.getValue() == null || comparator.getValue().length == 0;
4076        boolean matches = false;
4077        long cellTs = 0;
4078        if (result.isEmpty() && valueIsNull) {
4079          matches = true;
4080        } else if (result.size() > 0 && result.get(0).getValueLength() == 0 && valueIsNull) {
4081          matches = true;
4082          cellTs = result.get(0).getTimestamp();
4083        } else if (result.size() == 1 && !valueIsNull) {
4084          Cell kv = result.get(0);
4085          cellTs = kv.getTimestamp();
4086          int compareResult = PrivateCellUtil.compareValue(kv, comparator);
4087          matches = matches(op, compareResult);
4088        }
        // If the condition matches, apply the new Put or Delete
4090        if (matches) {
4091          // We have acquired the row lock already. If the system clock is NOT monotonically
4092          // non-decreasing (see HBASE-14070) we should make sure that the mutation has a
4093          // larger timestamp than what was observed via Get. doBatchMutate already does this, but
4094          // there is no way to pass the cellTs. See HBASE-14054.
4095          long now = EnvironmentEdgeManager.currentTime();
4096          long ts = Math.max(now, cellTs); // ensure write is not eclipsed
4097          byte[] byteTs = Bytes.toBytes(ts);
4098          if (mutation != null) {
4099            if (mutation instanceof Put) {
4100              updateCellTimestamps(mutation.getFamilyCellMap().values(), byteTs);
4101            }
            // A Delete does not need this since it already does a second get and sets the
            // timestamp from that get (see prepareDeleteTimestamps).
4104          } else {
4105            for (Mutation m: rowMutations.getMutations()) {
4106              if (m instanceof Put) {
4107                updateCellTimestamps(m.getFamilyCellMap().values(), byteTs);
4108              }
4109            }
            // A Delete does not need this since it already does a second get and sets the
            // timestamp from that get (see prepareDeleteTimestamps).
4112          }
4113          // All edits for the given row (across all column families) must happen atomically.
4114          if (mutation != null) {
4115            doBatchMutate(mutation);
4116          } else {
4117            mutateRow(rowMutations);
4118          }
4119          this.checkAndMutateChecksPassed.increment();
4120          return true;
4121        }
4122        this.checkAndMutateChecksFailed.increment();
4123        return false;
4124      } finally {
4125        rowLock.release();
4126      }
4127    } finally {
4128      closeRegionOperation();
4129    }
4130  }
4131
4132  private void checkMutationType(final Mutation mutation, final byte [] row)
4133  throws DoNotRetryIOException {
4134    boolean isPut = mutation instanceof Put;
4135    if (!isPut && !(mutation instanceof Delete)) {
4136      throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action must be Put or Delete");
4137    }
4138    if (!Bytes.equals(row, mutation.getRow())) {
4139      throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action's getRow must match");
4140    }
4141  }
4142
4143  private boolean matches(final CompareOperator op, final int compareResult) {
4144    boolean matches = false;
4145    switch (op) {
4146      case LESS:
4147        matches = compareResult < 0;
4148        break;
4149      case LESS_OR_EQUAL:
4150        matches = compareResult <= 0;
4151        break;
4152      case EQUAL:
4153        matches = compareResult == 0;
4154        break;
4155      case NOT_EQUAL:
4156        matches = compareResult != 0;
4157        break;
4158      case GREATER_OR_EQUAL:
4159        matches = compareResult >= 0;
4160        break;
4161      case GREATER:
4162        matches = compareResult > 0;
4163        break;
4164      default:
4165        throw new RuntimeException("Unknown Compare op " + op.name());
4166    }
4167    return matches;
4168  }
4169
4170
4171  private void doBatchMutate(Mutation mutation) throws IOException {
4172    // Currently this is only called for puts and deletes, so no nonces.
4173    OperationStatus[] batchMutate = this.batchMutate(new Mutation[]{mutation});
4174    if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.SANITY_CHECK_FAILURE)) {
4175      throw new FailedSanityCheckException(batchMutate[0].getExceptionMsg());
4176    } else if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.BAD_FAMILY)) {
4177      throw new NoSuchColumnFamilyException(batchMutate[0].getExceptionMsg());
4178    }
4179  }
4180
4181  /**
4182   * Complete taking the snapshot on the region. Writes the region info and adds references to the
4183   * working snapshot directory.
4184   *
   * TODO for API consistency, consider adding another version with no {@link ForeignExceptionSnare}
   * arg. (In the future other cancellable HRegion methods could eventually add a
   * {@link ForeignExceptionSnare}, or we could do something fancier.)
4188   *
4189   * @param desc snapshot description object
4190   * @param exnSnare ForeignExceptionSnare that captures external exceptions in case we need to
4191   *   bail out.  This is allowed to be null and will just be ignored in that case.
4192   * @throws IOException if there is an external or internal error causing the snapshot to fail
4193   */
4194  public void addRegionToSnapshot(SnapshotDescription desc,
4195      ForeignExceptionSnare exnSnare) throws IOException {
4196    Path rootDir = FSUtils.getRootDir(conf);
4197    Path snapshotDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(desc, rootDir);
4198
4199    SnapshotManifest manifest = SnapshotManifest.create(conf, getFilesystem(),
4200            snapshotDir, desc, exnSnare);
4201    manifest.addRegion(this);
4202  }
4203
4204  private void updateSequenceId(final Iterable<List<Cell>> cellItr, final long sequenceId)
4205      throws IOException {
4206    for (List<Cell> cells: cellItr) {
4207      if (cells == null) return;
4208      for (Cell cell : cells) {
4209        PrivateCellUtil.setSequenceId(cell, sequenceId);
4210      }
4211    }
4212  }
4213
4214  /**
   * Replace any cell timestamps set to {@link org.apache.hadoop.hbase.HConstants#LATEST_TIMESTAMP}
   * with the provided current timestamp.
   * @param cellItr iterable of cell lists to update
   * @param now the current timestamp, as bytes
4219   */
4220  private static void updateCellTimestamps(final Iterable<List<Cell>> cellItr, final byte[] now)
4221      throws IOException {
4222    for (List<Cell> cells: cellItr) {
4223      if (cells == null) continue;
4224      // Optimization: 'foreach' loop is not used. See:
4225      // HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects
4226      assert cells instanceof RandomAccess;
4227      int listSize = cells.size();
4228      for (int i = 0; i < listSize; i++) {
4229        PrivateCellUtil.updateLatestStamp(cells.get(i), now);
4230      }
4231    }
4232  }
4233
4234  /**
4235   * Possibly rewrite incoming cell tags.
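   * <p>At present this only carries a TTL forward: a mutation with an explicit TTL set via
   * {@code Mutation#setTTL(long)} gets a TTL tag appended to each of its cells (existing cell
   * tags are carried forward), while mutations whose TTL is {@code Long.MAX_VALUE} (the default)
   * are left untouched.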
4236   */
4237  void rewriteCellTags(Map<byte[], List<Cell>> familyMap, final Mutation m) {
4238    // Check if we have any work to do and early out otherwise
4239    // Update these checks as more logic is added here
4240    if (m.getTTL() == Long.MAX_VALUE) {
4241      return;
4242    }
4243
4244    // From this point we know we have some work to do
4245    for (Map.Entry<byte[], List<Cell>> e: familyMap.entrySet()) {
4246      List<Cell> cells = e.getValue();
4247      assert cells instanceof RandomAccess;
4248      int listSize = cells.size();
4249      for (int i = 0; i < listSize; i++) {
4250        Cell cell = cells.get(i);
4251        List<Tag> newTags = TagUtil.carryForwardTags(null, cell);
4252        newTags = TagUtil.carryForwardTTLTag(newTags, m.getTTL());
4253        // Rewrite the cell with the updated set of tags
4254        cells.set(i, PrivateCellUtil.createCell(cell, newTags));
4255      }
4256    }
4257  }
4258
4259  /*
   * Check that we have resources to support an update.
   *
   * We throw RegionTooBusyException if we are above the memstore limit
   * and expect the client to retry using some kind of backoff.
   */
4265  void checkResources() throws RegionTooBusyException {
4266    // If catalog region, do not impose resource constraints or block updates.
4267    if (this.getRegionInfo().isMetaRegion()) return;
4268
4269    MemStoreSize mss = this.memStoreSizing.getMemStoreSize();
4270    if (mss.getHeapSize() + mss.getOffHeapSize() > this.blockingMemStoreSize) {
4271      blockedRequestsCount.increment();
4272      requestFlush();
4273      // Don't print current limit because it will vary too much. The message is used as a key
4274      // over in RetriesExhaustedWithDetailsException processing.
4275      throw new RegionTooBusyException("Over memstore limit=" +
4276        org.apache.hadoop.hbase.procedure2.util.StringUtils.humanSize(this.blockingMemStoreSize) +
4277        ", regionName=" +
4278          (this.getRegionInfo() == null? "unknown": this.getRegionInfo().getEncodedName()) +
4279          ", server=" + (this.getRegionServerServices() == null? "unknown":
4280              this.getRegionServerServices().getServerName()));
4281    }
4282  }
4283
4284  /**
4285   * @throws IOException Throws exception if region is in read-only mode.
4286   */
4287  protected void checkReadOnly() throws IOException {
4288    if (isReadOnly()) {
4289      throw new DoNotRetryIOException("region is read only");
4290    }
4291  }
4292
4293  protected void checkReadsEnabled() throws IOException {
4294    if (!this.writestate.readsEnabled) {
4295      throw new IOException(getRegionInfo().getEncodedName()
4296        + ": The region's reads are disabled. Cannot serve the request");
4297    }
4298  }
4299
4300  public void setReadsEnabled(boolean readsEnabled) {
    if (readsEnabled && !this.writestate.readsEnabled) {
      LOG.info(getRegionInfo().getEncodedName() + " : Enabling reads for region.");
4303    }
4304    this.writestate.setReadsEnabled(readsEnabled);
4305  }
4306
4307  /**
   * Add updates first to the WAL and then add values to memstore.
   * Warning: Assumption is that the caller has a lock on the passed in row.
   * @param edits Cell updates by column
   * @throws IOException if applying the edits fails
4312   */
4313  void put(final byte [] row, byte [] family, List<Cell> edits)
4314  throws IOException {
4315    NavigableMap<byte[], List<Cell>> familyMap;
4316    familyMap = new TreeMap<>(Bytes.BYTES_COMPARATOR);
4317
4318    familyMap.put(family, edits);
4319    Put p = new Put(row);
4320    p.setFamilyCellMap(familyMap);
4321    doBatchMutate(p);
4322  }
4323
4324  /**
4325   * @param delta If we are doing delta changes -- e.g. increment/append -- then this flag will be
4326   *          set; when set we will run operations that make sense in the increment/append scenario
4327   *          but that do not make sense otherwise.
4328   * @see #applyToMemStore(HStore, Cell, MemStoreSizing)
4329   */
4330  private void applyToMemStore(HStore store, List<Cell> cells, boolean delta,
4331      MemStoreSizing memstoreAccounting) throws IOException {
4332    // Any change in how we update Store/MemStore needs to also be done in other applyToMemStore!!!!
4333    boolean upsert = delta && store.getColumnFamilyDescriptor().getMaxVersions() == 1;
4334    if (upsert) {
4335      store.upsert(cells, getSmallestReadPoint(), memstoreAccounting);
4336    } else {
4337      store.add(cells, memstoreAccounting);
4338    }
4339  }
4340
4341  /**
4342   * @see #applyToMemStore(HStore, List, boolean, MemStoreSizing)
4343   */
4344  private void applyToMemStore(HStore store, Cell cell, MemStoreSizing memstoreAccounting)
4345      throws IOException {
4346    // Any change in how we update Store/MemStore needs to also be done in other applyToMemStore!!!!
4347    if (store == null) {
4348      checkFamily(CellUtil.cloneFamily(cell));
4349      // Unreachable because checkFamily will throw exception
4350    }
4351    store.add(cell, memstoreAccounting);
4352  }
4353
4354  /**
4355   * Check the collection of families for validity.
   * @param families the column families to check
   * @throws NoSuchColumnFamilyException if a family is not known to this region
4358   */
4359  public void checkFamilies(Collection<byte[]> families) throws NoSuchColumnFamilyException {
4360    for (byte[] family : families) {
4361      checkFamily(family);
4362    }
4363  }
4364
4365  /**
4366   * Check the collection of families for valid timestamps
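   * <p>For illustration (the numbers are made up): with a {@code timestampSlop} of 1000 and
   * {@code now} = T, a cell whose timestamp is T + 5000 fails the check, while a cell stamped
   * with {@link HConstants#LATEST_TIMESTAMP} always passes.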
   * @param familyMap map of column family name to its list of cells
   * @param now current timestamp
   * @throws FailedSanityCheckException if a cell's timestamp is too far in the future
4370   */
4371  public void checkTimestamps(final Map<byte[], List<Cell>> familyMap, long now)
4372      throws FailedSanityCheckException {
4373    if (timestampSlop == HConstants.LATEST_TIMESTAMP) {
4374      return;
4375    }
4376    long maxTs = now + timestampSlop;
4377    for (List<Cell> kvs : familyMap.values()) {
4378      // Optimization: 'foreach' loop is not used. See:
4379      // HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects
4380      assert kvs instanceof RandomAccess;
      int listSize = kvs.size();
      for (int i = 0; i < listSize; i++) {
4383        Cell cell = kvs.get(i);
4384        // see if the user-side TS is out of range. latest = server-side
4385        long ts = cell.getTimestamp();
4386        if (ts != HConstants.LATEST_TIMESTAMP && ts > maxTs) {
4387          throw new FailedSanityCheckException("Timestamp for KV out of range "
4388              + cell + " (too.new=" + timestampSlop + ")");
4389        }
4390      }
4391    }
4392  }
4393
4394  /*
   * @param size the memstore size to check
4396   * @return True if size is over the flush threshold
4397   */
4398  private boolean isFlushSize(MemStoreSize size) {
4399    return size.getHeapSize() + size.getOffHeapSize() > getMemStoreFlushSize();
4400  }
4401
4402  /**
4403   * Read the edits put under this region by wal splitting process.  Put
4404   * the recovered edits back up into this region.
4405   *
4406   * <p>We can ignore any wal message that has a sequence ID that's equal to or
4407   * lower than minSeqId.  (Because we know such messages are already
4408   * reflected in the HFiles.)
4409   *
   * <p>While this is running we are putting pressure on memory yet we are
   * outside of our usual accounting because we are not yet an onlined region
   * (this stuff is being run as part of Region initialization). This means
   * that if we're up against global memory limits, we'll not be flagged to flush
   * because we are not online. We can't be flushed by the usual mechanisms anyway;
   * we're not yet online so our relative sequenceids are not yet aligned with
   * WAL sequenceids -- not till we come up online, post processing of split
   * edits.
   *
   * <p>But to help relieve memory pressure, at least manage our own heap size
   * flushing if we are in excess of per-region limits. When flushing, though, we
   * have to be careful and avoid using the regionserver/wal sequenceid. It is
   * running on a different line to what is going on here in this region context,
   * so if we crashed replaying these edits, but in the midst had a flush that
   * used the regionserver wal with a sequenceid in excess of what is going on
   * here in this region and its split editlogs, then we could miss edits the
   * next time we go to recover. So, we have to flush inline, using seqids that
   * make sense in this single region context only -- until we come online.
4428   *
   * @param maxSeqIdInStores Any edit found in split editlogs needs to be in excess of
   * the maxSeqId for the store to be applied, else it is skipped.
   * @return the sequence id of the last edit added to this region out of the
   * recovered edits log, or <code>minSeqIdForTheRegion</code> if nothing was added from editlogs.
   * @throws IOException if replaying the recovered edits fails
4434   */
4435  protected long replayRecoveredEditsIfAny(Map<byte[], Long> maxSeqIdInStores,
4436      final CancelableProgressable reporter, final MonitoredTask status)
4437      throws IOException {
4438    long minSeqIdForTheRegion = -1;
4439    for (Long maxSeqIdInStore : maxSeqIdInStores.values()) {
4440      if (maxSeqIdInStore < minSeqIdForTheRegion || minSeqIdForTheRegion == -1) {
4441        minSeqIdForTheRegion = maxSeqIdInStore;
4442      }
4443    }
4444    long seqId = minSeqIdForTheRegion;
4445
4446    FileSystem walFS = getWalFileSystem();
4447    FileSystem rootFS = getFilesystem();
4448    Path wrongRegionWALDir = FSUtils.getWrongWALRegionDir(conf, getRegionInfo().getTable(),
4449      getRegionInfo().getEncodedName());
4450    Path regionWALDir = getWALRegionDir();
4451    Path regionDir = FSUtils.getRegionDirFromRootDir(FSUtils.getRootDir(conf), getRegionInfo());
4452
4453    // We made a mistake in HBASE-20734 so we need to do this dirty hack...
4454    NavigableSet<Path> filesUnderWrongRegionWALDir =
4455      WALSplitter.getSplitEditFilesSorted(walFS, wrongRegionWALDir);
4456    seqId = Math.max(seqId, replayRecoveredEditsForPaths(minSeqIdForTheRegion, walFS,
4457      filesUnderWrongRegionWALDir, reporter, regionDir));
    // This is to ensure backwards compatibility with HBASE-20723 where recovered edits can appear
4459    // under the root dir even if walDir is set.
4460    NavigableSet<Path> filesUnderRootDir = Collections.emptyNavigableSet();
4461    if (!regionWALDir.equals(regionDir)) {
4462      filesUnderRootDir = WALSplitter.getSplitEditFilesSorted(rootFS, regionDir);
4463      seqId = Math.max(seqId, replayRecoveredEditsForPaths(minSeqIdForTheRegion, rootFS,
4464        filesUnderRootDir, reporter, regionDir));
4465    }
4466
4467    NavigableSet<Path> files = WALSplitter.getSplitEditFilesSorted(walFS, regionWALDir);
4468    seqId = Math.max(seqId, replayRecoveredEditsForPaths(minSeqIdForTheRegion, walFS,
4469        files, reporter, regionWALDir));
4470
4471    if (seqId > minSeqIdForTheRegion) {
4472      // Then we added some edits to memory. Flush and cleanup split edit files.
4473      internalFlushcache(null, seqId, stores.values(), status, false, FlushLifeCycleTracker.DUMMY);
4474    }
4475    // Now delete the content of recovered edits. We're done w/ them.
4476    if (files.size() > 0 && this.conf.getBoolean("hbase.region.archive.recovered.edits", false)) {
4477      // For debugging data loss issues!
4478      // If this flag is set, make use of the hfile archiving by making recovered.edits a fake
4479      // column family. Have to fake out file type too by casting our recovered.edits as storefiles
4480      String fakeFamilyName = WALSplitter.getRegionDirRecoveredEditsDir(regionWALDir).getName();
4481      Set<HStoreFile> fakeStoreFiles = new HashSet<>(files.size());
4482      for (Path file : files) {
4483        fakeStoreFiles.add(new HStoreFile(walFS, file, this.conf, null, null, true));
4484      }
4485      getRegionWALFileSystem().removeStoreFiles(fakeFamilyName, fakeStoreFiles);
4486    } else {
4487      for (Path file : Iterables.concat(files, filesUnderWrongRegionWALDir)) {
4488        if (!walFS.delete(file, false)) {
4489          LOG.error("Failed delete of {}", file);
4490        } else {
4491          LOG.debug("Deleted recovered.edits file={}", file);
4492        }
4493      }
4494      for (Path file : filesUnderRootDir) {
4495        if (!rootFS.delete(file, false)) {
4496          LOG.error("Failed delete of {}", file);
4497        } else {
4498          LOG.debug("Deleted recovered.edits file={}", file);
4499        }
4500      }
4501    }
4502    return seqId;
4503  }
4504
4505  private long replayRecoveredEditsForPaths(long minSeqIdForTheRegion, FileSystem fs,
4506      final NavigableSet<Path> files, final CancelableProgressable reporter, final Path regionDir)
4507      throws IOException {
4508    long seqid = minSeqIdForTheRegion;
4509    if (LOG.isDebugEnabled()) {
4510      LOG.debug("Found " + (files == null ? 0 : files.size())
4511          + " recovered edits file(s) under " + regionDir);
4512    }
4513
4514    if (files == null || files.isEmpty()) {
4515      return minSeqIdForTheRegion;
4516    }
4517
4518    for (Path edits: files) {
4519      if (edits == null || !fs.exists(edits)) {
4520        LOG.warn("Null or non-existent edits file: " + edits);
4521        continue;
4522      }
4523      if (isZeroLengthThenDelete(fs, edits)) continue;
4524
4525      long maxSeqId;
4526      String fileName = edits.getName();
4527      maxSeqId = Math.abs(Long.parseLong(fileName));
4528      if (maxSeqId <= minSeqIdForTheRegion) {
4529        if (LOG.isDebugEnabled()) {
4530          String msg = "Maximum sequenceid for this wal is " + maxSeqId
4531              + " and minimum sequenceid for the region is " + minSeqIdForTheRegion
4532              + ", skipped the whole file, path=" + edits;
4533          LOG.debug(msg);
4534        }
4535        continue;
4536      }
4537
4538      try {
4539        // replay the edits. Replay can return -1 if everything is skipped, only update
4540        // if seqId is greater
4541        seqid = Math.max(seqid, replayRecoveredEdits(edits, maxSeqIdInStores, reporter, fs));
4542      } catch (IOException e) {
4543        boolean skipErrors = conf.getBoolean(
4544            HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS,
4545            conf.getBoolean(
4546                "hbase.skip.errors",
4547                HConstants.DEFAULT_HREGION_EDITS_REPLAY_SKIP_ERRORS));
4548        if (conf.get("hbase.skip.errors") != null) {
4549          LOG.warn(
4550              "The property 'hbase.skip.errors' has been deprecated. Please use " +
4551                  HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + " instead.");
4552        }
4553        if (skipErrors) {
4554          Path p = WALSplitter.moveAsideBadEditsFile(walFS, edits);
4555          LOG.error(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS
4556              + "=true so continuing. Renamed " + edits +
4557              " as " + p, e);
4558        } else {
4559          throw e;
4560        }
4561      }
4562    }
4563    return seqid;
4564  }
4565
4566  /*
4567   * @param edits File of recovered edits.
   * @param maxSeqIdInStores Maximum sequenceid found in each store. Edits in the wal
   * must be larger than this to be replayed for each store.
   * @param reporter used to report progress of the replay; may be null.
4571   * @return the sequence id of the last edit added to this region out of the
4572   * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
4573   * @throws IOException
4574   */
4575  private long replayRecoveredEdits(final Path edits,
4576      Map<byte[], Long> maxSeqIdInStores, final CancelableProgressable reporter, FileSystem fs)
4577    throws IOException {
4578    String msg = "Replaying edits from " + edits;
4579    LOG.info(msg);
4580    MonitoredTask status = TaskMonitor.get().createStatus(msg);
4581
4582    status.setStatus("Opening recovered edits");
4583    WAL.Reader reader = null;
4584    try {
4585      reader = WALFactory.createReader(fs, edits, conf);
4586      long currentEditSeqId = -1;
4587      long currentReplaySeqId = -1;
4588      long firstSeqIdInLog = -1;
4589      long skippedEdits = 0;
4590      long editsCount = 0;
4591      long intervalEdits = 0;
4592      WAL.Entry entry;
4593      HStore store = null;
4594      boolean reported_once = false;
4595      ServerNonceManager ng = this.rsServices == null ? null : this.rsServices.getNonceManager();
4596
4597      try {
4598        // How many edits seen before we check elapsed time
4599        int interval = this.conf.getInt("hbase.hstore.report.interval.edits", 2000);
4600        // How often to send a progress report (default 1/2 master timeout)
4601        int period = this.conf.getInt("hbase.hstore.report.period", 300000);
4602        long lastReport = EnvironmentEdgeManager.currentTime();
4603
4604        if (coprocessorHost != null) {
4605          coprocessorHost.preReplayWALs(this.getRegionInfo(), edits);
4606        }
4607
4608        while ((entry = reader.next()) != null) {
4609          WALKey key = entry.getKey();
4610          WALEdit val = entry.getEdit();
4611
          if (ng != null) { // ng is null in some tests, or when nonces are disabled
4613            ng.reportOperationFromWal(key.getNonceGroup(), key.getNonce(), key.getWriteTime());
4614          }
4615
4616          if (reporter != null) {
4617            intervalEdits += val.size();
4618            if (intervalEdits >= interval) {
4619              // Number of edits interval reached
4620              intervalEdits = 0;
4621              long cur = EnvironmentEdgeManager.currentTime();
4622              if (lastReport + period <= cur) {
4623                status.setStatus("Replaying edits..." +
4624                    " skipped=" + skippedEdits +
4625                    " edits=" + editsCount);
4626                // Timeout reached
4627                if(!reporter.progress()) {
4628                  msg = "Progressable reporter failed, stopping replay";
4629                  LOG.warn(msg);
4630                  status.abort(msg);
4631                  throw new IOException(msg);
4632                }
4633                reported_once = true;
4634                lastReport = cur;
4635              }
4636            }
4637          }
4638
4639          if (firstSeqIdInLog == -1) {
4640            firstSeqIdInLog = key.getSequenceId();
4641          }
4642          if (currentEditSeqId > key.getSequenceId()) {
4643            // when this condition is true, it means we have a serious defect because we need to
4644            // maintain increasing SeqId for WAL edits per region
4645            LOG.error(getRegionInfo().getEncodedName() + " : "
4646                 + "Found decreasing SeqId. PreId=" + currentEditSeqId + " key=" + key
4647                + "; edit=" + val);
4648          } else {
4649            currentEditSeqId = key.getSequenceId();
4650          }
4651          currentReplaySeqId = (key.getOrigLogSeqNum() > 0) ?
4652            key.getOrigLogSeqNum() : currentEditSeqId;
4653
4654          // Start coprocessor replay here. The coprocessor is for each WALEdit
4655          // instead of a KeyValue.
4656          if (coprocessorHost != null) {
4657            status.setStatus("Running pre-WAL-restore hook in coprocessors");
4658            if (coprocessorHost.preWALRestore(this.getRegionInfo(), key, val)) {
4659              // if bypass this wal entry, ignore it ...
4660              continue;
4661            }
4662          }
4663          boolean checkRowWithinBoundary = false;
4664          // Check this edit is for this region.
4665          if (!Bytes.equals(key.getEncodedRegionName(),
4666              this.getRegionInfo().getEncodedNameAsBytes())) {
4667            checkRowWithinBoundary = true;
4668          }
4669
4670          boolean flush = false;
4671          MemStoreSizing memStoreSizing = new NonThreadSafeMemStoreSizing();
4672          for (Cell cell: val.getCells()) {
4673            // Check this edit is for me. Also, guard against writing the special
4674            // METACOLUMN info such as HBASE::CACHEFLUSH entries
4675            if (CellUtil.matchingFamily(cell, WALEdit.METAFAMILY)) {
              // if region names don't match, skip replaying the compaction marker
              if (!checkRowWithinBoundary) {
                // this is a special edit, we should handle it
4679                CompactionDescriptor compaction = WALEdit.getCompaction(cell);
4680                if (compaction != null) {
4681                  //replay the compaction
4682                  replayWALCompactionMarker(compaction, false, true, Long.MAX_VALUE);
4683                }
4684              }
4685              skippedEdits++;
4686              continue;
4687            }
4688            // Figure which store the edit is meant for.
4689            if (store == null || !CellUtil.matchingFamily(cell,
4690                store.getColumnFamilyDescriptor().getName())) {
4691              store = getStore(cell);
4692            }
4693            if (store == null) {
4694              // This should never happen.  Perhaps schema was changed between
4695              // crash and redeploy?
4696              LOG.warn("No family for " + cell);
4697              skippedEdits++;
4698              continue;
4699            }
4700            if (checkRowWithinBoundary && !rowIsInRange(this.getRegionInfo(),
4701              cell.getRowArray(), cell.getRowOffset(), cell.getRowLength())) {
4702              LOG.warn("Row of " + cell + " is not within region boundary");
4703              skippedEdits++;
4704              continue;
4705            }
4706            // Now, figure if we should skip this edit.
4707            if (key.getSequenceId() <= maxSeqIdInStores.get(store.getColumnFamilyDescriptor()
4708                .getName())) {
4709              skippedEdits++;
4710              continue;
4711            }
4712            PrivateCellUtil.setSequenceId(cell, currentReplaySeqId);
4713
4714            restoreEdit(store, cell, memStoreSizing);
4715            editsCount++;
4716          }
4717          MemStoreSize mss = memStoreSizing.getMemStoreSize();
4718          incMemStoreSize(mss);
4719          flush = isFlushSize(this.memStoreSizing.getMemStoreSize());
4720          if (flush) {
4721            internalFlushcache(null, currentEditSeqId, stores.values(), status, false,
4722              FlushLifeCycleTracker.DUMMY);
4723          }
4724
4725          if (coprocessorHost != null) {
4726            coprocessorHost.postWALRestore(this.getRegionInfo(), key, val);
4727          }
4728        }
4729
4730        if (coprocessorHost != null) {
4731          coprocessorHost.postReplayWALs(this.getRegionInfo(), edits);
4732        }
4733      } catch (EOFException eof) {
4734        Path p = WALSplitter.moveAsideBadEditsFile(walFS, edits);
4735        msg = "EnLongAddered EOF. Most likely due to Master failure during " +
4736            "wal splitting, so we have this data in another edit.  " +
4737            "Continuing, but renaming " + edits + " as " + p;
4738        LOG.warn(msg, eof);
4739        status.abort(msg);
4740      } catch (IOException ioe) {
4741        // If the IOE resulted from bad file format,
4742        // then this problem is idempotent and retrying won't help
4743        if (ioe.getCause() instanceof ParseException) {
4744          Path p = WALSplitter.moveAsideBadEditsFile(walFS, edits);
4745          msg = "File corruption enLongAddered!  " +
4746              "Continuing, but renaming " + edits + " as " + p;
4747          LOG.warn(msg, ioe);
4748          status.setStatus(msg);
4749        } else {
4750          status.abort(StringUtils.stringifyException(ioe));
4751          // other IO errors may be transient (bad network connection,
4752          // checksum exception on one datanode, etc).  throw & retry
4753          throw ioe;
4754        }
4755      }
4756      if (reporter != null && !reported_once) {
4757        reporter.progress();
4758      }
4759      msg = "Applied " + editsCount + ", skipped " + skippedEdits +
4760        ", firstSequenceIdInLog=" + firstSeqIdInLog +
4761        ", maxSequenceIdInLog=" + currentEditSeqId + ", path=" + edits;
4762      status.markComplete(msg);
4763      LOG.debug(msg);
4764      return currentEditSeqId;
4765    } finally {
4766      status.cleanup();
4767      if (reader != null) {
4768         reader.close();
4769      }
4770    }
4771  }
4772
4773  /**
   * Call to complete a compaction. It's for the case where we find in the WAL a compaction
   * that was not finished. We could find one recovering a WAL after a regionserver crash.
4776   * See HBASE-2331.
4777   */
4778  void replayWALCompactionMarker(CompactionDescriptor compaction, boolean pickCompactionFiles,
4779      boolean removeFiles, long replaySeqId)
4780      throws IOException {
4781    try {
4782      checkTargetRegion(compaction.getEncodedRegionName().toByteArray(),
4783        "Compaction marker from WAL ", compaction);
4784    } catch (WrongRegionException wre) {
4785      if (RegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4786        // skip the compaction marker since it is not for this region
4787        return;
4788      }
4789      throw wre;
4790    }
4791
4792    synchronized (writestate) {
4793      if (replaySeqId < lastReplayedOpenRegionSeqId) {
4794        LOG.warn(getRegionInfo().getEncodedName() + " : "
4795            + "Skipping replaying compaction event :" + TextFormat.shortDebugString(compaction)
4796            + " because its sequence id " + replaySeqId + " is smaller than this regions "
4797            + "lastReplayedOpenRegionSeqId of " + lastReplayedOpenRegionSeqId);
4798        return;
4799      }
4800      if (replaySeqId < lastReplayedCompactionSeqId) {
4801        LOG.warn(getRegionInfo().getEncodedName() + " : "
4802            + "Skipping replaying compaction event :" + TextFormat.shortDebugString(compaction)
4803            + " because its sequence id " + replaySeqId + " is smaller than this regions "
4804            + "lastReplayedCompactionSeqId of " + lastReplayedCompactionSeqId);
4805        return;
4806      } else {
4807        lastReplayedCompactionSeqId = replaySeqId;
4808      }
4809
4810      if (LOG.isDebugEnabled()) {
4811        LOG.debug(getRegionInfo().getEncodedName() + " : "
4812            + "Replaying compaction marker " + TextFormat.shortDebugString(compaction)
4813            + " with seqId=" + replaySeqId + " and lastReplayedOpenRegionSeqId="
4814            + lastReplayedOpenRegionSeqId);
4815      }
4816
4817      startRegionOperation(Operation.REPLAY_EVENT);
4818      try {
4819        HStore store = this.getStore(compaction.getFamilyName().toByteArray());
4820        if (store == null) {
4821          LOG.warn(getRegionInfo().getEncodedName() + " : "
4822              + "Found Compaction WAL edit for deleted family:"
4823              + Bytes.toString(compaction.getFamilyName().toByteArray()));
4824          return;
4825        }
4826        store.replayCompactionMarker(compaction, pickCompactionFiles, removeFiles);
4827        logRegionFiles();
4828      } catch (FileNotFoundException ex) {
4829        LOG.warn(getRegionInfo().getEncodedName() + " : "
4830            + "At least one of the store files in compaction: "
4831            + TextFormat.shortDebugString(compaction)
4832            + " doesn't exist any more. Skip loading the file(s)", ex);
4833      } finally {
4834        closeRegionOperation(Operation.REPLAY_EVENT);
4835      }
4836    }
4837  }
4838
4839  void replayWALFlushMarker(FlushDescriptor flush, long replaySeqId) throws IOException {
4840    checkTargetRegion(flush.getEncodedRegionName().toByteArray(),
4841      "Flush marker from WAL ", flush);
4842
4843    if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4844      return; // if primary nothing to do
4845    }
4846
4847    if (LOG.isDebugEnabled()) {
4848      LOG.debug(getRegionInfo().getEncodedName() + " : "
4849          + "Replaying flush marker " + TextFormat.shortDebugString(flush));
4850    }
4851
4852    startRegionOperation(Operation.REPLAY_EVENT); // use region close lock to guard against close
4853    try {
4854      FlushAction action = flush.getAction();
4855      switch (action) {
4856      case START_FLUSH:
4857        replayWALFlushStartMarker(flush);
4858        break;
4859      case COMMIT_FLUSH:
4860        replayWALFlushCommitMarker(flush);
4861        break;
4862      case ABORT_FLUSH:
4863        replayWALFlushAbortMarker(flush);
4864        break;
4865      case CANNOT_FLUSH:
4866        replayWALFlushCannotFlushMarker(flush, replaySeqId);
4867        break;
4868      default:
4869        LOG.warn(getRegionInfo().getEncodedName() + " : " +
4870          "Received a flush event with unknown action, ignoring. " +
4871          TextFormat.shortDebugString(flush));
4872        break;
4873      }
4874
4875      logRegionFiles();
4876    } finally {
4877      closeRegionOperation(Operation.REPLAY_EVENT);
4878    }
4879  }
4880
  /**
   * Replay the flush marker from the primary region by creating a corresponding snapshot of
   * the store memstores, only if the memstores do not have a higher seqId from an earlier wal
   * edit (because the events may be coming out of order).
4884   */
4885  @VisibleForTesting
4886  PrepareFlushResult replayWALFlushStartMarker(FlushDescriptor flush) throws IOException {
4887    long flushSeqId = flush.getFlushSequenceNumber();
4888
4889    HashSet<HStore> storesToFlush = new HashSet<>();
4890    for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) {
4891      byte[] family = storeFlush.getFamilyName().toByteArray();
4892      HStore store = getStore(family);
4893      if (store == null) {
4894        LOG.warn(getRegionInfo().getEncodedName() + " : "
4895          + "Received a flush start marker from primary, but the family is not found. Ignoring"
4896          + " StoreFlushDescriptor:" + TextFormat.shortDebugString(storeFlush));
4897        continue;
4898      }
4899      storesToFlush.add(store);
4900    }
4901
4902    MonitoredTask status = TaskMonitor.get().createStatus("Preparing flush " + this);
4903
    // we will use writestate as a coarse-grained lock for all the replay events
4905    // (flush, compaction, region open etc)
4906    synchronized (writestate) {
4907      try {
4908        if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) {
4909          LOG.warn(getRegionInfo().getEncodedName() + " : "
4910              + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
4911              + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId "
4912              + " of " + lastReplayedOpenRegionSeqId);
4913          return null;
4914        }
4915        if (numMutationsWithoutWAL.sum() > 0) {
4916          numMutationsWithoutWAL.reset();
4917          dataInMemoryWithoutWAL.reset();
4918        }
4919
4920        if (!writestate.flushing) {
          // we do not have an active snapshot and corresponding this.prepareFlushResult. This
          // means we can just snapshot our memstores and continue as normal.
4923
4924          // invoke prepareFlushCache. Send null as wal since we do not want the flush events in wal
4925          PrepareFlushResult prepareResult = internalPrepareFlushCache(null, flushSeqId,
4926            storesToFlush, status, false, FlushLifeCycleTracker.DUMMY);
4927          if (prepareResult.result == null) {
4928            // save the PrepareFlushResult so that we can use it later from commit flush
4929            this.writestate.flushing = true;
4930            this.prepareFlushResult = prepareResult;
4931            status.markComplete("Flush prepare successful");
4932            if (LOG.isDebugEnabled()) {
4933              LOG.debug(getRegionInfo().getEncodedName() + " : "
4934                  + " Prepared flush with seqId:" + flush.getFlushSequenceNumber());
4935            }
4936          } else {
            // special case empty memstore. We will still save the flush result in this case, since
            // our memstore is empty, but the primary is still flushing
4939            if (prepareResult.getResult().getResult() ==
4940                  FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) {
4941              this.writestate.flushing = true;
4942              this.prepareFlushResult = prepareResult;
4943              if (LOG.isDebugEnabled()) {
4944                LOG.debug(getRegionInfo().getEncodedName() + " : "
4945                  + " Prepared empty flush with seqId:" + flush.getFlushSequenceNumber());
4946              }
4947            }
4948            status.abort("Flush prepare failed with " + prepareResult.result);
            // Nothing much to do; the prepare flush failed for some reason.
4950          }
4951          return prepareResult;
4952        } else {
4953          // we already have an active snapshot.
4954          if (flush.getFlushSequenceNumber() == this.prepareFlushResult.flushOpSeqId) {
4955            // They define the same flush. Log and continue.
4956            LOG.warn(getRegionInfo().getEncodedName() + " : "
4957                + "Received a flush prepare marker with the same seqId: " +
4958                + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4959                + prepareFlushResult.flushOpSeqId + ". Ignoring");
4960            // ignore
4961          } else if (flush.getFlushSequenceNumber() < this.prepareFlushResult.flushOpSeqId) {
4962            // We received a flush with a smaller seqNum than what we have prepared. We can only
4963            // ignore this prepare flush request.
4964            LOG.warn(getRegionInfo().getEncodedName() + " : "
4965                + "Received a flush prepare marker with a smaller seqId: " +
4966                + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4967                + prepareFlushResult.flushOpSeqId + ". Ignoring");
4968            // ignore
4969          } else {
4970            // We received a flush with a larger seqNum than what we have prepared
4971            LOG.warn(getRegionInfo().getEncodedName() + " : "
4972                + "Received a flush prepare marker with a larger seqId: " +
4973                + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4974                + prepareFlushResult.flushOpSeqId + ". Ignoring");
4975            // We do not have multiple active snapshots in the memstore or a way to merge current
4976            // memstore snapshot with the contents and resnapshot for now. We cannot take
4977            // another snapshot and drop the previous one because that will cause temporary
4978            // data loss in the secondary. So we ignore this for now, deferring the resolution
4979            // to happen when we see the corresponding flush commit marker. If we have a memstore
4980            // snapshot with x, and later received another prepare snapshot with y (where x < y),
4981            // when we see flush commit for y, we will drop snapshot for x, and can also drop all
4982            // the memstore edits if everything in memstore is < y. This is the usual case for
            // RS crash + recovery where we might see consecutive prepare flush wal markers.
            // Otherwise, this will cause more memory to be used in the secondary replica until a
            // further prepare + commit flush is seen and replayed.
4986          }
4987        }
4988      } finally {
4989        status.cleanup();
4990        writestate.notifyAll();
4991      }
4992    }
4993    return null;
4994  }
4995
4996  @VisibleForTesting
4997  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY",
4998    justification="Intentional; post memstore flush")
4999  void replayWALFlushCommitMarker(FlushDescriptor flush) throws IOException {
5000    MonitoredTask status = TaskMonitor.get().createStatus("Committing flush " + this);
5001
5002    // check whether we have the memstore snapshot with the corresponding seqId. Replay to
    // secondary region replicas is in order, except for when the region moves or when the
5004    // region server crashes. In those cases, we may receive replay requests out of order from
5005    // the original seqIds.
5006    synchronized (writestate) {
5007      try {
5008        if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) {
5009          LOG.warn(getRegionInfo().getEncodedName() + " : "
5010            + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
5011            + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId "
5012            + " of " + lastReplayedOpenRegionSeqId);
5013          return;
5014        }
5015
5016        if (writestate.flushing) {
5017          PrepareFlushResult prepareFlushResult = this.prepareFlushResult;
5018          if (flush.getFlushSequenceNumber() == prepareFlushResult.flushOpSeqId) {
5019            if (LOG.isDebugEnabled()) {
5020              LOG.debug(getRegionInfo().getEncodedName() + " : "
5021                  + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber()
5022                  + " and a previous prepared snapshot was found");
5023            }
5024            // This is the regular case where we received commit flush after prepare flush
5025            // corresponding to the same seqId.
5026            replayFlushInStores(flush, prepareFlushResult, true);
5027
5028            // Set down the memstore size by amount of flush.
5029            this.decrMemStoreSize(prepareFlushResult.totalFlushableSize.getMemStoreSize());
5030            this.prepareFlushResult = null;
5031            writestate.flushing = false;
5032          } else if (flush.getFlushSequenceNumber() < prepareFlushResult.flushOpSeqId) {
            // This should not happen normally. However, let's be safe and guard against these cases
5034            // we received a flush commit with a smaller seqId than what we have prepared
5035            // we will pick the flush file up from this commit (if we have not seen it), but we
5036            // will not drop the memstore
5037            LOG.warn(getRegionInfo().getEncodedName() + " : "
5038                + "Received a flush commit marker with smaller seqId: "
5039                + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: "
5040                + prepareFlushResult.flushOpSeqId + ". Picking up new file, but not dropping"
5041                +"  prepared memstore snapshot");
5042            replayFlushInStores(flush, prepareFlushResult, false);
5043
5044            // snapshot is not dropped, so memstore sizes should not be decremented
5045            // we still have the prepared snapshot, flushing should still be true
5046          } else {
            // This should not happen normally. However, let's be safe and guard against these cases
5048            // we received a flush commit with a larger seqId than what we have prepared
5049            // we will pick the flush file for this. We will also obtain the updates lock and
5050            // look for contents of the memstore to see whether we have edits after this seqId.
5051            // If not, we will drop all the memstore edits and the snapshot as well.
5052            LOG.warn(getRegionInfo().getEncodedName() + " : "
5053                + "Received a flush commit marker with larger seqId: "
5054                + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: " +
5055                prepareFlushResult.flushOpSeqId + ". Picking up new file and dropping prepared"
5056                +" memstore snapshot");
5057
5058            replayFlushInStores(flush, prepareFlushResult, true);
5059
5060            // Set down the memstore size by amount of flush.
5061            this.decrMemStoreSize(prepareFlushResult.totalFlushableSize.getMemStoreSize());
5062
5063            // Inspect the memstore contents to see whether the memstore contains only edits
5064            // with seqId smaller than the flush seqId. If so, we can discard those edits.
5065            dropMemStoreContentsForSeqId(flush.getFlushSequenceNumber(), null);
5066
5067            this.prepareFlushResult = null;
5068            writestate.flushing = false;
5069          }
5070          // If we were waiting for observing a flush or region opening event for not showing
5071          // partial data after a secondary region crash, we can allow reads now. We can only make
5072          // sure that we are not showing partial data (for example skipping some previous edits)
5073          // until we observe a full flush start and flush commit. So if we were not able to find
5074          // a previous flush we will not enable reads now.
5075          this.setReadsEnabled(true);
5076        } else {
5077          LOG.warn(getRegionInfo().getEncodedName() + " : "
5078              + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber()
5079              + ", but no previous prepared snapshot was found");
5080          // There is no corresponding prepare snapshot from before.
5081          // We will pick up the new flushed file
5082          replayFlushInStores(flush, null, false);
5083
5084          // Inspect the memstore contents to see whether the memstore contains only edits
5085          // with seqId smaller than the flush seqId. If so, we can discard those edits.
5086          dropMemStoreContentsForSeqId(flush.getFlushSequenceNumber(), null);
5087        }
5088
5089        status.markComplete("Flush commit successful");
5090
5091        // Update the last flushed sequence id for region.
5092        this.maxFlushedSeqId = flush.getFlushSequenceNumber();
5093
5094        // advance the mvcc read point so that the new flushed file is visible.
5095        mvcc.advanceTo(flush.getFlushSequenceNumber());
5096
5097      } catch (FileNotFoundException ex) {
5098        LOG.warn(getRegionInfo().getEncodedName() + " : "
5099            + "At least one of the store files in flush: " + TextFormat.shortDebugString(flush)
5100            + " doesn't exist any more. Skip loading the file(s)", ex);
5101      }
5102      finally {
5103        status.cleanup();
5104        writestate.notifyAll();
5105      }
5106    }
5107
5108    // C. Finally notify anyone waiting on memstore to clear:
5109    // e.g. checkResources().
5110    synchronized (this) {
5111      notifyAll(); // FindBugs NN_NAKED_NOTIFY
5112    }
5113  }
5114
5115  /**
5116   * Replays the given flush descriptor by opening the flush files in stores and dropping the
5117   * memstore snapshots if requested.
   * @param flush the flush descriptor received from the primary
   * @param prepareFlushResult the result of the corresponding prepare flush, or null if there was
   *          no prepared snapshot
   * @param dropMemstoreSnapshot whether to drop the prepared memstore snapshot after replaying
   *          the flush in the stores
   * @throws IOException if replaying the flush in a store fails
5122   */
5123  private void replayFlushInStores(FlushDescriptor flush, PrepareFlushResult prepareFlushResult,
5124      boolean dropMemstoreSnapshot)
5125      throws IOException {
5126    for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) {
5127      byte[] family = storeFlush.getFamilyName().toByteArray();
5128      HStore store = getStore(family);
5129      if (store == null) {
5130        LOG.warn(getRegionInfo().getEncodedName() + " : "
5131            + "Received a flush commit marker from primary, but the family is not found."
5132            + "Ignoring StoreFlushDescriptor:" + storeFlush);
5133        continue;
5134      }
5135      List<String> flushFiles = storeFlush.getFlushOutputList();
5136      StoreFlushContext ctx = null;
5137      long startTime = EnvironmentEdgeManager.currentTime();
5138      if (prepareFlushResult == null || prepareFlushResult.storeFlushCtxs == null) {
5139        ctx = store.createFlushContext(flush.getFlushSequenceNumber(), FlushLifeCycleTracker.DUMMY);
5140      } else {
5141        ctx = prepareFlushResult.storeFlushCtxs.get(family);
5142        startTime = prepareFlushResult.startTime;
5143      }
5144
5145      if (ctx == null) {
5146        LOG.warn(getRegionInfo().getEncodedName() + " : "
5147            + "Unexpected: flush commit marker received from store "
5148            + Bytes.toString(family) + " but no associated flush context. Ignoring");
5149        continue;
5150      }
5151
5152      ctx.replayFlush(flushFiles, dropMemstoreSnapshot); // replay the flush
5153
5154      // Record latest flush time
5155      this.lastStoreFlushTimeMap.put(store, startTime);
5156    }
5157  }
5158
5159  /**
   * Be careful: this method will drop all data in the memstore of this region.
   * Currently, this method is used to drop the memstore to prevent a memory leak
   * when replaying recovered.edits while opening the region.
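   * <p>A minimal, illustrative call site (the {@code region} reference is assumed to exist):
   * <pre>{@code
   * MemStoreSize freed = region.dropMemStoreContents();
   * LOG.info("Dropped " + freed.getDataSize() + " bytes of in-memory data");
   * }</pre>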
5163   */
5164  public MemStoreSize dropMemStoreContents() throws IOException {
5165    MemStoreSizing totalFreedSize = new NonThreadSafeMemStoreSizing();
5166    this.updatesLock.writeLock().lock();
5167    try {
5168      for (HStore s : stores.values()) {
5169        MemStoreSize memStoreSize = doDropStoreMemStoreContentsForSeqId(s, HConstants.NO_SEQNUM);
5170        LOG.info("Drop memstore for Store " + s.getColumnFamilyName() + " in region "
5171                + this.getRegionInfo().getRegionNameAsString()
5172                + " , dropped memstoresize: [" + memStoreSize + " }");
5173        totalFreedSize.incMemStoreSize(memStoreSize);
5174      }
5175      return totalFreedSize.getMemStoreSize();
5176    } finally {
5177      this.updatesLock.writeLock().unlock();
5178    }
5179  }
5180
5181  /**
   * Drops the memstore contents after replaying a flush descriptor or a region open event
   * if the memstore edits have seqNums smaller than the given seq id.
   * @throws IOException if dropping the memstore contents in a store fails
5185   */
5186  private MemStoreSize dropMemStoreContentsForSeqId(long seqId, HStore store) throws IOException {
5187    MemStoreSizing totalFreedSize = new NonThreadSafeMemStoreSizing();
5188    this.updatesLock.writeLock().lock();
5189    try {
5190
5191      long currentSeqId = mvcc.getReadPoint();
5192      if (seqId >= currentSeqId) {
5193        // then we can drop the memstore contents since everything is below this seqId
5194        LOG.info(getRegionInfo().getEncodedName() + " : "
5195            + "Dropping memstore contents as well since replayed flush seqId: "
5196            + seqId + " is greater than current seqId:" + currentSeqId);
5197
5198        // Prepare flush (take a snapshot) and then abort (drop the snapshot)
5199        if (store == null) {
5200          for (HStore s : stores.values()) {
5201            totalFreedSize.incMemStoreSize(doDropStoreMemStoreContentsForSeqId(s, currentSeqId));
5202          }
5203        } else {
5204          totalFreedSize.incMemStoreSize(doDropStoreMemStoreContentsForSeqId(store, currentSeqId));
5205        }
5206      } else {
5207        LOG.info(getRegionInfo().getEncodedName() + " : "
5208            + "Not dropping memstore contents since replayed flush seqId: "
5209            + seqId + " is smaller than current seqId:" + currentSeqId);
5210      }
5211    } finally {
5212      this.updatesLock.writeLock().unlock();
5213    }
5214    return totalFreedSize.getMemStoreSize();
5215  }
5216
5217  private MemStoreSize doDropStoreMemStoreContentsForSeqId(HStore s, long currentSeqId)
5218      throws IOException {
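    // Subtract the store's flushable size from the region accounting, then take a memstore
    // snapshot and immediately abort it, which discards the in-memory data without writing a
    // file. Callers hold the updates write lock, so no new edits can race with the drop.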
5219    MemStoreSize flushableSize = s.getFlushableSize();
5220    this.decrMemStoreSize(flushableSize);
5221    StoreFlushContext ctx = s.createFlushContext(currentSeqId, FlushLifeCycleTracker.DUMMY);
5222    ctx.prepare();
5223    ctx.abort();
5224    return flushableSize;
5225  }
5226
5227  private void replayWALFlushAbortMarker(FlushDescriptor flush) {
5228    // nothing to do for now. A flush abort will cause a RS abort which means that the region
5229    // will be opened somewhere else later. We will see the region open event soon, and replaying
5230    // that will drop the snapshot
5231  }
5232
5233  private void replayWALFlushCannotFlushMarker(FlushDescriptor flush, long replaySeqId) {
5234    synchronized (writestate) {
5235      if (this.lastReplayedOpenRegionSeqId > replaySeqId) {
5236        LOG.warn(getRegionInfo().getEncodedName() + " : "
5237          + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
5238          + " because its sequence id " + replaySeqId + " is smaller than this regions "
5239          + "lastReplayedOpenRegionSeqId of " + lastReplayedOpenRegionSeqId);
5240        return;
5241      }
5242
5243      // If we were waiting for observing a flush or region opening event for not showing partial
5244      // data after a secondary region crash, we can allow reads now. This event means that the
5245      // primary was not able to flush because memstore is empty when we requested flush. By the
5246      // time we observe this, we are guaranteed to have up to date seqId with our previous
5247      // assignment.
5248      this.setReadsEnabled(true);
5249    }
5250  }
5251
5252  @VisibleForTesting
5253  PrepareFlushResult getPrepareFlushResult() {
5254    return prepareFlushResult;
5255  }
5256
5257  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY",
5258      justification="Intentional; cleared the memstore")
5259  void replayWALRegionEventMarker(RegionEventDescriptor regionEvent) throws IOException {
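    // Replays a region event marker written by the primary. Secondary replicas act only on
    // REGION_OPEN events: they pick up the primary's current store files, drop any prepared
    // flush snapshots and memstore edits that those files already cover, and re-enable reads.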
5260    checkTargetRegion(regionEvent.getEncodedRegionName().toByteArray(),
5261      "RegionEvent marker from WAL ", regionEvent);
5262
5263    startRegionOperation(Operation.REPLAY_EVENT);
5264    try {
5265      if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
5266        return; // if primary nothing to do
5267      }
5268
5269      if (regionEvent.getEventType() == EventType.REGION_CLOSE) {
5270        // nothing to do on REGION_CLOSE for now.
5271        return;
5272      }
5273      if (regionEvent.getEventType() != EventType.REGION_OPEN) {
5274        LOG.warn(getRegionInfo().getEncodedName() + " : "
5275            + "Unknown region event received, ignoring :"
5276            + TextFormat.shortDebugString(regionEvent));
5277        return;
5278      }
5279
5280      if (LOG.isDebugEnabled()) {
5281        LOG.debug(getRegionInfo().getEncodedName() + " : "
5282          + "Replaying region open event marker " + TextFormat.shortDebugString(regionEvent));
5283      }
5284
5285      // we will use writestate as a coarse-grain lock for all the replay events
5286      synchronized (writestate) {
5287        // Replication can deliver events out of order when primary region moves or the region
5288        // server crashes, since there is no coordination between replication of different wal files
        // belonging to different region servers. We have to safeguard against this case by using
        // the region open event's seqid. Since this is the first event that the region puts (after
        // possibly flushing recovered.edits), after seeing this event we can ignore every edit
        // smaller than this seqId.
5293        if (this.lastReplayedOpenRegionSeqId <= regionEvent.getLogSequenceNumber()) {
5294          this.lastReplayedOpenRegionSeqId = regionEvent.getLogSequenceNumber();
5295        } else {
5296          LOG.warn(getRegionInfo().getEncodedName() + " : "
5297            + "Skipping replaying region event :" + TextFormat.shortDebugString(regionEvent)
5298            + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId "
5299            + " of " + lastReplayedOpenRegionSeqId);
5300          return;
5301        }
5302
        // The region open event lists all the files that the region has at the time of opening.
        // Just pick up all the files, drop any prepared flushes, and empty the memstores.
5305        for (StoreDescriptor storeDescriptor : regionEvent.getStoresList()) {
5306          // stores of primary may be different now
5307          byte[] family = storeDescriptor.getFamilyName().toByteArray();
5308          HStore store = getStore(family);
5309          if (store == null) {
5310            LOG.warn(getRegionInfo().getEncodedName() + " : "
5311                + "Received a region open marker from primary, but the family is not found. "
5312                + "Ignoring. StoreDescriptor:" + storeDescriptor);
5313            continue;
5314          }
5315
5316          long storeSeqId = store.getMaxSequenceId().orElse(0L);
5317          List<String> storeFiles = storeDescriptor.getStoreFileList();
5318          try {
5319            store.refreshStoreFiles(storeFiles); // replace the files with the new ones
5320          } catch (FileNotFoundException ex) {
5321            LOG.warn(getRegionInfo().getEncodedName() + " : "
5322                    + "At least one of the store files: " + storeFiles
5323                    + " doesn't exist any more. Skip loading the file(s)", ex);
5324            continue;
5325          }
5326          if (store.getMaxSequenceId().orElse(0L) != storeSeqId) {
5327            // Record latest flush time if we picked up new files
5328            lastStoreFlushTimeMap.put(store, EnvironmentEdgeManager.currentTime());
5329          }
5330
5331          if (writestate.flushing) {
5332            // only drop memstore snapshots if they are smaller than last flush for the store
5333            if (this.prepareFlushResult.flushOpSeqId <= regionEvent.getLogSequenceNumber()) {
5334              StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null ?
5335                  null : this.prepareFlushResult.storeFlushCtxs.get(family);
5336              if (ctx != null) {
5337                MemStoreSize mss = store.getFlushableSize();
5338                ctx.abort();
5339                this.decrMemStoreSize(mss);
5340                this.prepareFlushResult.storeFlushCtxs.remove(family);
5341              }
5342            }
5343          }
5344
5345          // Drop the memstore contents if they are now smaller than the latest seen flushed file
5346          dropMemStoreContentsForSeqId(regionEvent.getLogSequenceNumber(), store);
5347          if (storeSeqId > this.maxFlushedSeqId) {
5348            this.maxFlushedSeqId = storeSeqId;
5349          }
5350        }
5351
5352        // if all stores ended up dropping their snapshots, we can safely drop the
5353        // prepareFlushResult
5354        dropPrepareFlushIfPossible();
5355
5356        // advance the mvcc read point so that the new flushed file is visible.
5357        mvcc.await();
5358
5359        // If we were waiting for observing a flush or region opening event for not showing partial
5360        // data after a secondary region crash, we can allow reads now.
5361        this.setReadsEnabled(true);
5362
5363        // C. Finally notify anyone waiting on memstore to clear:
5364        // e.g. checkResources().
5365        synchronized (this) {
5366          notifyAll(); // FindBugs NN_NAKED_NOTIFY
5367        }
5368      }
5369      logRegionFiles();
5370    } finally {
5371      closeRegionOperation(Operation.REPLAY_EVENT);
5372    }
5373  }
5374
5375  void replayWALBulkLoadEventMarker(WALProtos.BulkLoadDescriptor bulkLoadEvent) throws IOException {
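    // Replays a bulk load marker written by the primary. A secondary replica simply picks up
    // the referenced HFiles for each store (no memstore is involved), skipping events older
    // than the last replayed region open event.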
5376    checkTargetRegion(bulkLoadEvent.getEncodedRegionName().toByteArray(),
5377      "BulkLoad marker from WAL ", bulkLoadEvent);
5378
5379    if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
5380      return; // if primary nothing to do
5381    }
5382
5383    if (LOG.isDebugEnabled()) {
5384      LOG.debug(getRegionInfo().getEncodedName() + " : "
5385              +  "Replaying bulkload event marker " + TextFormat.shortDebugString(bulkLoadEvent));
5386    }
5387    // check if multiple families involved
5388    boolean multipleFamilies = false;
5389    byte[] family = null;
5390    for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) {
5391      byte[] fam = storeDescriptor.getFamilyName().toByteArray();
5392      if (family == null) {
5393        family = fam;
5394      } else if (!Bytes.equals(family, fam)) {
5395        multipleFamilies = true;
5396        break;
5397      }
5398    }
5399
5400    startBulkRegionOperation(multipleFamilies);
5401    try {
5402      // we will use writestate as a coarse-grain lock for all the replay events
5403      synchronized (writestate) {
5404        // Replication can deliver events out of order when primary region moves or the region
5405        // server crashes, since there is no coordination between replication of different wal files
        // belonging to different region servers. We have to safeguard against this case by using
        // the region open event's seqid. Since this is the first event that the region puts (after
        // possibly flushing recovered.edits), after seeing this event we can ignore every edit
        // smaller than this seqId.
5410        if (bulkLoadEvent.getBulkloadSeqNum() >= 0
5411            && this.lastReplayedOpenRegionSeqId >= bulkLoadEvent.getBulkloadSeqNum()) {
5412          LOG.warn(getRegionInfo().getEncodedName() + " : "
5413              + "Skipping replaying bulkload event :"
5414              + TextFormat.shortDebugString(bulkLoadEvent)
5415              + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
5416              + " =" + lastReplayedOpenRegionSeqId);
5417
5418          return;
5419        }
5420
5421        for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) {
5422          // stores of primary may be different now
5423          family = storeDescriptor.getFamilyName().toByteArray();
5424          HStore store = getStore(family);
5425          if (store == null) {
5426            LOG.warn(getRegionInfo().getEncodedName() + " : "
5427                    + "Received a bulk load marker from primary, but the family is not found. "
5428                    + "Ignoring. StoreDescriptor:" + storeDescriptor);
5429            continue;
5430          }
5431
5432          List<String> storeFiles = storeDescriptor.getStoreFileList();
5433          for (String storeFile : storeFiles) {
5434            StoreFileInfo storeFileInfo = null;
5435            try {
5436              storeFileInfo = fs.getStoreFileInfo(Bytes.toString(family), storeFile);
5437              store.bulkLoadHFile(storeFileInfo);
5438            } catch(FileNotFoundException ex) {
5439              LOG.warn(getRegionInfo().getEncodedName() + " : "
5440                      + ((storeFileInfo != null) ? storeFileInfo.toString() :
5441                            (new Path(Bytes.toString(family), storeFile)).toString())
5442                      + " doesn't exist any more. Skip loading the file");
5443            }
5444          }
5445        }
5446      }
5447      if (bulkLoadEvent.getBulkloadSeqNum() > 0) {
5448        mvcc.advanceTo(bulkLoadEvent.getBulkloadSeqNum());
5449      }
5450    } finally {
5451      closeBulkRegionOperation();
5452    }
5453  }
5454
5455  /**
5456   * If all stores ended up dropping their snapshots, we can safely drop the prepareFlushResult
5457   */
5458  private void dropPrepareFlushIfPossible() {
5459    if (writestate.flushing) {
5460      boolean canDrop = true;
5461      if (prepareFlushResult.storeFlushCtxs != null) {
5462        for (Entry<byte[], StoreFlushContext> entry : prepareFlushResult.storeFlushCtxs
5463            .entrySet()) {
5464          HStore store = getStore(entry.getKey());
5465          if (store == null) {
5466            continue;
5467          }
5468          if (store.getSnapshotSize().getDataSize() > 0) {
5469            canDrop = false;
5470            break;
5471          }
5472        }
5473      }
5474
      // this means that all the stores in the region have finished flushing, but the WAL marker
5476      // may not have been written or we did not receive it yet.
5477      if (canDrop) {
5478        writestate.flushing = false;
5479        this.prepareFlushResult = null;
5480      }
5481    }
5482  }
5483
5484  @Override
5485  public boolean refreshStoreFiles() throws IOException {
5486    return refreshStoreFiles(false);
5487  }
5488
5489  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY",
5490      justification = "Notify is about post replay. Intentional")
5491  protected boolean refreshStoreFiles(boolean force) throws IOException {
5492    if (!force && ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
5493      return false; // if primary nothing to do
5494    }
5495
5496    if (LOG.isDebugEnabled()) {
5497      LOG.debug(getRegionInfo().getEncodedName() + " : "
5498          + "Refreshing store files to see whether we can free up memstore");
5499    }
5500
5501    long totalFreedDataSize = 0;
5502
5503    long smallestSeqIdInStores = Long.MAX_VALUE;
5504
5505    startRegionOperation(); // obtain region close lock
5506    try {
5507      Map<HStore, Long> map = new HashMap<>();
5508      synchronized (writestate) {
5509        for (HStore store : stores.values()) {
          // TODO: some stores might see new data from flush, while others do not, which
5511          // MIGHT break atomic edits across column families.
5512          long maxSeqIdBefore = store.getMaxSequenceId().orElse(0L);
5513
5514          // refresh the store files. This is similar to observing a region open wal marker.
5515          store.refreshStoreFiles();
5516
5517          long storeSeqId = store.getMaxSequenceId().orElse(0L);
5518          if (storeSeqId < smallestSeqIdInStores) {
5519            smallestSeqIdInStores = storeSeqId;
5520          }
5521
5522          // see whether we can drop the memstore or the snapshot
5523          if (storeSeqId > maxSeqIdBefore) {
5524            if (writestate.flushing) {
5525              // only drop memstore snapshots if they are smaller than last flush for the store
5526              if (this.prepareFlushResult.flushOpSeqId <= storeSeqId) {
5527                StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null ?
5528                    null : this.prepareFlushResult.storeFlushCtxs.get(
5529                            store.getColumnFamilyDescriptor().getName());
5530                if (ctx != null) {
5531                  MemStoreSize mss = store.getFlushableSize();
5532                  ctx.abort();
5533                  this.decrMemStoreSize(mss);
5534                  this.prepareFlushResult.storeFlushCtxs.
5535                      remove(store.getColumnFamilyDescriptor().getName());
5536                  totalFreedDataSize += mss.getDataSize();
5537                }
5538              }
5539            }
5540
5541            map.put(store, storeSeqId);
5542          }
5543        }
5544
5545        // if all stores ended up dropping their snapshots, we can safely drop the
5546        // prepareFlushResult
5547        dropPrepareFlushIfPossible();
5548
5549        // advance the mvcc read point so that the new flushed files are visible.
5550        // either greater than flush seq number or they were already picked up via flush.
5551        for (HStore s : stores.values()) {
5552          mvcc.advanceTo(s.getMaxMemStoreTS().orElse(0L));
5553        }
5554
5555
5556        // smallestSeqIdInStores is the seqId that we have a corresponding hfile for. We can safely
        // skip any edit that will be replayed in the future if it has a smaller seqId
        // than this. We are updating lastReplayedOpenRegionSeqId so that we can skip all edits
        // that we have already picked the flush files for.
5560        if (this.lastReplayedOpenRegionSeqId < smallestSeqIdInStores) {
5561          this.lastReplayedOpenRegionSeqId = smallestSeqIdInStores;
5562        }
5563      }
5564      if (!map.isEmpty()) {
5565        for (Map.Entry<HStore, Long> entry : map.entrySet()) {
5566          // Drop the memstore contents if they are now smaller than the latest seen flushed file
5567          totalFreedDataSize += dropMemStoreContentsForSeqId(entry.getValue(), entry.getKey())
5568              .getDataSize();
5569        }
5570      }
5571      // C. Finally notify anyone waiting on memstore to clear:
5572      // e.g. checkResources().
5573      synchronized (this) {
5574        notifyAll(); // FindBugs NN_NAKED_NOTIFY
5575      }
5576      return totalFreedDataSize > 0;
5577    } finally {
5578      closeRegionOperation();
5579    }
5580  }
5581
5582  private void logRegionFiles() {
5583    if (LOG.isTraceEnabled()) {
5584      LOG.trace(getRegionInfo().getEncodedName() + " : Store files for region: ");
5585      stores.values().stream().filter(s -> s.getStorefiles() != null)
5586          .flatMap(s -> s.getStorefiles().stream())
5587          .forEachOrdered(sf -> LOG.trace(getRegionInfo().getEncodedName() + " : " + sf));
5588    }
5589  }
5590
  /** Checks whether the given regionName is either equal to our region's encoded name, or is the
   * encoded name of the primary region corresponding to this secondary replica's range.
5593   */
5594  private void checkTargetRegion(byte[] encodedRegionName, String exceptionMsg, Object payload)
5595      throws WrongRegionException {
5596    if (Bytes.equals(this.getRegionInfo().getEncodedNameAsBytes(), encodedRegionName)) {
5597      return;
5598    }
5599
5600    if (!RegionReplicaUtil.isDefaultReplica(this.getRegionInfo()) &&
5601        Bytes.equals(encodedRegionName,
5602          this.fs.getRegionInfoForFS().getEncodedNameAsBytes())) {
5603      return;
5604    }
5605
5606    throw new WrongRegionException(exceptionMsg + payload
5607      + " targetted for region " + Bytes.toStringBinary(encodedRegionName)
5608      + " does not match this region: " + this.getRegionInfo());
5609  }
5610
5611  /**
   * Used by tests.
   * @param s Store to add the edit to.
5614   * @param cell Cell to add.
5615   */
5616  @VisibleForTesting
5617  protected void restoreEdit(HStore s, Cell cell, MemStoreSizing memstoreAccounting) {
5618    s.add(cell, memstoreAccounting);
5619  }
5620
5621  /**
   * @param fs filesystem the file lives on.
   * @param p File to check.
   * @return True if file was zero-length (and if so, we'll delete it in here).
   * @throws IOException if checking the file status or deleting the file fails
5625   */
5626  private static boolean isZeroLengthThenDelete(final FileSystem fs, final Path p)
5627      throws IOException {
5628    FileStatus stat = fs.getFileStatus(p);
5629    if (stat.getLen() > 0) {
5630      return false;
5631    }
5632    LOG.warn("File " + p + " is zero-length, deleting.");
5633    fs.delete(p, false);
5634    return true;
5635  }
5636
5637  protected HStore instantiateHStore(final ColumnFamilyDescriptor family) throws IOException {
5638    if (family.isMobEnabled()) {
5639      if (HFile.getFormatVersion(this.conf) < HFile.MIN_FORMAT_VERSION_WITH_TAGS) {
5640        throw new IOException("A minimum HFile version of "
5641            + HFile.MIN_FORMAT_VERSION_WITH_TAGS
5642            + " is required for MOB feature. Consider setting " + HFile.FORMAT_VERSION_KEY
5643            + " accordingly.");
5644      }
5645      return new HMobStore(this, family, this.conf);
5646    }
5647    return new HStore(this, family, this.conf);
5648  }
5649
5650  @Override
5651  public HStore getStore(byte[] column) {
5652    return this.stores.get(column);
5653  }
5654
5655  /**
   * Return HStore instance. Does not do any copy: as the number of stores is limited, we iterate on
5657   * the list.
5658   */
5659  private HStore getStore(Cell cell) {
5660    return stores.entrySet().stream().filter(e -> CellUtil.matchingFamily(cell, e.getKey()))
5661        .map(e -> e.getValue()).findFirst().orElse(null);
5662  }
5663
5664  @Override
5665  public List<HStore> getStores() {
5666    return new ArrayList<>(stores.values());
5667  }
5668
5669  @Override
5670  public List<String> getStoreFileList(byte[][] columns) throws IllegalArgumentException {
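    // Collect the paths of every store file for the requested column families, failing if a
    // family does not exist. The close lock guards against the region closing while we iterate.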
5671    List<String> storeFileNames = new ArrayList<>();
5672    synchronized (closeLock) {
5673      for (byte[] column : columns) {
5674        HStore store = this.stores.get(column);
5675        if (store == null) {
5676          throw new IllegalArgumentException(
5677              "No column family : " + new String(column, StandardCharsets.UTF_8) + " available");
5678        }
5679        Collection<HStoreFile> storeFiles = store.getStorefiles();
5680        if (storeFiles == null) {
5681          continue;
5682        }
5683        for (HStoreFile storeFile : storeFiles) {
5684          storeFileNames.add(storeFile.getPath().toString());
5685        }
5686
5687        logRegionFiles();
5688      }
5689    }
5690    return storeFileNames;
5691  }
5692
5693  //////////////////////////////////////////////////////////////////////////////
5694  // Support code
5695  //////////////////////////////////////////////////////////////////////////////
5696
5697  /** Make sure this is a valid row for the HRegion */
5698  void checkRow(byte[] row, String op) throws IOException {
5699    if (!rowIsInRange(getRegionInfo(), row)) {
5700      throw new WrongRegionException("Requested row out of range for " +
5701          op + " on HRegion " + this + ", startKey='" +
5702          Bytes.toStringBinary(getRegionInfo().getStartKey()) + "', getEndKey()='" +
5703          Bytes.toStringBinary(getRegionInfo().getEndKey()) + "', row='" +
5704          Bytes.toStringBinary(row) + "'");
5705    }
5706  }
5707
5708
5709  /**
   * Get an exclusive (write) lock on a given row.
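   * <p>A minimal, illustrative usage sketch (the {@code region} and {@code row} references are
   * assumed to exist):
   * <pre>{@code
   * RowLock rowLock = region.getRowLock(row);
   * try {
   *   // read or mutate the row while holding the exclusive lock
   * } finally {
   *   rowLock.release();
   * }
   * }</pre>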
5711   * @param row Which row to lock.
   * @return A locked RowLock. The lock is exclusive and already acquired.
   * @throws IOException if the row is not in this region or the lock cannot be acquired in time
5714   */
5715  public RowLock getRowLock(byte[] row) throws IOException {
5716    return getRowLock(row, false);
5717  }
5718
5719  @Override
5720  public RowLock getRowLock(byte[] row, boolean readLock) throws IOException {
5721    checkRow(row, "row lock");
5722    return getRowLockInternal(row, readLock, null);
5723  }
5724
5725  protected RowLock getRowLockInternal(byte[] row, boolean readLock, final RowLock prevRowLock)
5726      throws IOException {
    // create an object to use as a key in the row lock map
5728    HashedBytes rowKey = new HashedBytes(row);
5729
5730    RowLockContext rowLockContext = null;
5731    RowLockImpl result = null;
5732
5733    boolean success = false;
5734    try (TraceScope scope = TraceUtil.createTrace("HRegion.getRowLock")) {
5735      TraceUtil.addTimelineAnnotation("Getting a " + (readLock?"readLock":"writeLock"));
5736      // Keep trying until we have a lock or error out.
5737      // TODO: do we need to add a time component here?
5738      while (result == null) {
5739        rowLockContext = computeIfAbsent(lockedRows, rowKey, () -> new RowLockContext(rowKey));
        // Now try to get the lock.
        // This can fail if the row lock context was cleaned up concurrently (in which case
        // newReadLock/newWriteLock return null) and we will loop to create a fresh context.
5742        if (readLock) {
5743          // For read lock, if the caller has locked the same row previously, it will not try
5744          // to acquire the same read lock. It simply returns the previous row lock.
5745          RowLockImpl prevRowLockImpl = (RowLockImpl)prevRowLock;
5746          if ((prevRowLockImpl != null) && (prevRowLockImpl.getLock() ==
5747              rowLockContext.readWriteLock.readLock())) {
5748            success = true;
5749            return prevRowLock;
5750          }
5751          result = rowLockContext.newReadLock();
5752        } else {
5753          result = rowLockContext.newWriteLock();
5754        }
5755      }
5756
5757      int timeout = rowLockWaitDuration;
5758      boolean reachDeadlineFirst = false;
5759      Optional<RpcCall> call = RpcServer.getCurrentCall();
5760      if (call.isPresent()) {
5761        long deadline = call.get().getDeadline();
5762        if (deadline < Long.MAX_VALUE) {
5763          int timeToDeadline = (int) (deadline - System.currentTimeMillis());
5764          if (timeToDeadline <= this.rowLockWaitDuration) {
5765            reachDeadlineFirst = true;
5766            timeout = timeToDeadline;
5767          }
5768        }
5769      }
5770
5771      if (timeout <= 0 || !result.getLock().tryLock(timeout, TimeUnit.MILLISECONDS)) {
5772        TraceUtil.addTimelineAnnotation("Failed to get row lock");
5773        String message = "Timed out waiting for lock for row: " + rowKey + " in region "
5774            + getRegionInfo().getEncodedName();
5775        if (reachDeadlineFirst) {
5776          throw new TimeoutIOException(message);
5777        } else {
5778          // If timeToDeadline is larger than rowLockWaitDuration, we can not drop the request.
5779          throw new IOException(message);
5780        }
5781      }
5782      rowLockContext.setThreadName(Thread.currentThread().getName());
5783      success = true;
5784      return result;
5785    } catch (InterruptedException ie) {
5786      LOG.warn("Thread interrupted waiting for lock on row: " + rowKey);
5787      InterruptedIOException iie = new InterruptedIOException();
5788      iie.initCause(ie);
5789      TraceUtil.addTimelineAnnotation("Interrupted exception getting row lock");
5790      Thread.currentThread().interrupt();
5791      throw iie;
5792    } catch (Error error) {
      // The maximum lock count for a read lock is 64K (hardcoded); when this maximum count is
      // reached, an Error is thrown. Catch the Error and convert it to an IOException so the
      // caller can go ahead and process the minibatch with the locks it has already acquired.
      LOG.warn("Failed to get row lock for " + Bytes.toStringBinary(row) + ", cause: " + error);
5797      IOException ioe = new IOException();
5798      ioe.initCause(error);
5799      TraceUtil.addTimelineAnnotation("Error getting row lock");
5800      throw ioe;
5801    } finally {
5802      // Clean up the counts just in case this was the thing keeping the context alive.
5803      if (!success && rowLockContext != null) {
5804        rowLockContext.cleanUp();
5805      }
5806    }
5807  }
5808
5809  private void releaseRowLocks(List<RowLock> rowLocks) {
5810    if (rowLocks != null) {
5811      for (RowLock rowLock : rowLocks) {
5812        rowLock.release();
5813      }
5814      rowLocks.clear();
5815    }
5816  }
5817
5818  @VisibleForTesting
5819  public int getReadLockCount() {
5820    return lock.getReadLockCount();
5821  }
5822
5823  public ConcurrentHashMap<HashedBytes, RowLockContext> getLockedRows() {
5824    return lockedRows;
5825  }
5826
5827  @VisibleForTesting
5828  class RowLockContext {
5829    private final HashedBytes row;
5830    final ReadWriteLock readWriteLock = new ReentrantReadWriteLock(true);
5831    final AtomicBoolean usable = new AtomicBoolean(true);
5832    final AtomicInteger count = new AtomicInteger(0);
5833    final Object lock = new Object();
5834    private String threadName;
5835
5836    RowLockContext(HashedBytes row) {
5837      this.row = row;
5838    }
5839
5840    RowLockImpl newWriteLock() {
5841      Lock l = readWriteLock.writeLock();
5842      return getRowLock(l);
5843    }
5844    RowLockImpl newReadLock() {
5845      Lock l = readWriteLock.readLock();
5846      return getRowLock(l);
5847    }
5848
5849    private RowLockImpl getRowLock(Lock l) {
5850      count.incrementAndGet();
5851      synchronized (lock) {
5852        if (usable.get()) {
5853          return new RowLockImpl(this, l);
5854        } else {
5855          return null;
5856        }
5857      }
5858    }
5859
5860    void cleanUp() {
5861      long c = count.decrementAndGet();
5862      if (c <= 0) {
5863        synchronized (lock) {
          if (count.get() <= 0 && usable.get()) { // Don't attempt to remove row if already removed
5865            usable.set(false);
5866            RowLockContext removed = lockedRows.remove(row);
5867            assert removed == this: "we should never remove a different context";
5868          }
5869        }
5870      }
5871    }
5872
5873    public void setThreadName(String threadName) {
5874      this.threadName = threadName;
5875    }
5876
5877    @Override
5878    public String toString() {
5879      return "RowLockContext{" +
5880          "row=" + row +
5881          ", readWriteLock=" + readWriteLock +
5882          ", count=" + count +
5883          ", threadName=" + threadName +
5884          '}';
5885    }
5886  }
5887
5888  /**
5889   * Class used to represent a lock on a row.
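   * Instances are obtained via {@code getRowLock(...)}; callers are expected to call
   * {@link #release()} in a {@code finally} block once they are done with the row.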
5890   */
5891  public static class RowLockImpl implements RowLock {
5892    private final RowLockContext context;
5893    private final Lock lock;
5894
5895    public RowLockImpl(RowLockContext context, Lock lock) {
5896      this.context = context;
5897      this.lock = lock;
5898    }
5899
5900    public Lock getLock() {
5901      return lock;
5902    }
5903
5904    @VisibleForTesting
5905    public RowLockContext getContext() {
5906      return context;
5907    }
5908
5909    @Override
5910    public void release() {
5911      lock.unlock();
5912      context.cleanUp();
5913    }
5914
5915    @Override
5916    public String toString() {
5917      return "RowLockImpl{" +
5918          "context=" + context +
5919          ", lock=" + lock +
5920          '}';
5921    }
5922  }
5923
5924  /**
   * Determines whether multiple column families are present.
   * Precondition: familyPaths is not null.
5927   *
5928   * @param familyPaths List of (column family, hfilePath)
5929   */
5930  private static boolean hasMultipleColumnFamilies(Collection<Pair<byte[], String>> familyPaths) {
5931    boolean multipleFamilies = false;
5932    byte[] family = null;
5933    for (Pair<byte[], String> pair : familyPaths) {
5934      byte[] fam = pair.getFirst();
5935      if (family == null) {
5936        family = fam;
5937      } else if (!Bytes.equals(family, fam)) {
5938        multipleFamilies = true;
5939        break;
5940      }
5941    }
5942    return multipleFamilies;
5943  }
5944
5945  /**
5946   * Attempts to atomically load a group of hfiles.  This is critical for loading
5947   * rows with multiple column families atomically.
5948   *
5949   * @param familyPaths List of Pair&lt;byte[] column family, String hfilePath&gt;
5950   * @param bulkLoadListener Internal hooks enabling massaging/preparation of a
5951   * file about to be bulk loaded
   * @param assignSeqId if true, force a flush and assign its sequence id to the loaded hfiles
5953   * @return Map from family to List of store file paths if successful, null if failed recoverably
5954   * @throws IOException if failed unrecoverably.
5955   */
  public Map<byte[], List<Path>> bulkLoadHFiles(Collection<Pair<byte[], String>> familyPaths,
      boolean assignSeqId, BulkLoadListener bulkLoadListener) throws IOException {
5958    return bulkLoadHFiles(familyPaths, assignSeqId, bulkLoadListener, false);
5959  }
5960
5961  /**
   * Listener class that enables callers of bulkLoadHFile() to perform any necessary
   * pre/post processing of a given bulk load call.
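   * <p>A minimal, illustrative implementation that loads files from their original location and
   * performs no extra processing (method bodies are placeholders):
   * <pre>{@code
   * BulkLoadListener noopListener = new BulkLoadListener() {
   *   public String prepareBulkLoad(byte[] family, String srcPath, boolean copyFile) {
   *     return srcPath; // use the file where it already is
   *   }
   *   public void doneBulkLoad(byte[] family, String srcPath) {}
   *   public void failedBulkLoad(byte[] family, String srcPath) {}
   * };
   * }</pre>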
5965   */
5966  public interface BulkLoadListener {
5967    /**
5968     * Called before an HFile is actually loaded
5969     * @param family family being loaded to
     * @param srcPath path of HFile
     * @param copyFile always copy the hfile if true
     * @return final path to be used for the actual loading
     * @throws IOException
5973     */
5974    String prepareBulkLoad(byte[] family, String srcPath, boolean copyFile)
5975        throws IOException;
5976
5977    /**
5978     * Called after a successful HFile load
5979     * @param family family being loaded to
5980     * @param srcPath path of HFile
5981     * @throws IOException
5982     */
5983    void doneBulkLoad(byte[] family, String srcPath) throws IOException;
5984
5985    /**
5986     * Called after a failed HFile load
5987     * @param family family being loaded to
5988     * @param srcPath path of HFile
5989     * @throws IOException
5990     */
5991    void failedBulkLoad(byte[] family, String srcPath) throws IOException;
5992  }
5993
5994  /**
5995   * Attempts to atomically load a group of hfiles.  This is critical for loading
5996   * rows with multiple column families atomically.
5997   *
5998   * @param familyPaths List of Pair&lt;byte[] column family, String hfilePath&gt;
   * @param assignSeqId if true, force a flush and assign its sequence id to the loaded hfiles
6000   * @param bulkLoadListener Internal hooks enabling massaging/preparation of a
6001   * file about to be bulk loaded
6002   * @param copyFile always copy hfiles if true
6003   * @return Map from family to List of store file paths if successful, null if failed recoverably
6004   * @throws IOException if failed unrecoverably.
6005   */
6006  public Map<byte[], List<Path>> bulkLoadHFiles(Collection<Pair<byte[], String>> familyPaths,
6007      boolean assignSeqId, BulkLoadListener bulkLoadListener, boolean copyFile) throws IOException {
6008    long seqId = -1;
6009    Map<byte[], List<Path>> storeFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR);
6010    Map<String, Long> storeFilesSizes = new HashMap<>();
6011    Preconditions.checkNotNull(familyPaths);
6012    // we need writeLock for multi-family bulk load
6013    startBulkRegionOperation(hasMultipleColumnFamilies(familyPaths));
6014    boolean isSuccessful = false;
6015    try {
6016      this.writeRequestsCount.increment();
6017
      // There possibly was a split that happened between when the split keys
      // were gathered and when the HRegion's write lock was taken. We need
      // to validate that each HFile still fits this region before attempting to bulk load them.
6021      List<IOException> ioes = new ArrayList<>();
6022      List<Pair<byte[], String>> failures = new ArrayList<>();
6023      for (Pair<byte[], String> p : familyPaths) {
6024        byte[] familyName = p.getFirst();
6025        String path = p.getSecond();
6026
6027        HStore store = getStore(familyName);
6028        if (store == null) {
6029          IOException ioe = new org.apache.hadoop.hbase.DoNotRetryIOException(
6030              "No such column family " + Bytes.toStringBinary(familyName));
6031          ioes.add(ioe);
6032        } else {
6033          try {
6034            store.assertBulkLoadHFileOk(new Path(path));
6035          } catch (WrongRegionException wre) {
6036            // recoverable (file doesn't fit in region)
6037            failures.add(p);
6038          } catch (IOException ioe) {
6039            // unrecoverable (hdfs problem)
6040            ioes.add(ioe);
6041          }
6042        }
6043      }
6044
6045      // validation failed because of some sort of IO problem.
6046      if (ioes.size() != 0) {
6047        IOException e = MultipleIOException.createIOException(ioes);
6048        LOG.error("There were one or more IO errors when checking if the bulk load is ok.", e);
6049        throw e;
6050      }
6051
6052      // validation failed, bail out before doing anything permanent.
6053      if (failures.size() != 0) {
6054        StringBuilder list = new StringBuilder();
6055        for (Pair<byte[], String> p : failures) {
6056          list.append("\n").append(Bytes.toString(p.getFirst())).append(" : ")
6057              .append(p.getSecond());
6058        }
6059        // problem when validating
6060        LOG.warn("There was a recoverable bulk load failure likely due to a" +
6061            " split.  These (family, HFile) pairs were not loaded: " + list);
6062        return null;
6063      }
6064
6065      // We need to assign a sequential ID that's in between two memstores in order to preserve
6066      // the guarantee that all the edits lower than the highest sequential ID from all the
6067      // HFiles are flushed on disk. See HBASE-10958.  The sequence id returned when we flush is
6068      // guaranteed to be one beyond the file made when we flushed (or if nothing to flush, it is
6069      // a sequence id that we can be sure is beyond the last hfile written).
6070      if (assignSeqId) {
6071        FlushResult fs = flushcache(true, false, FlushLifeCycleTracker.DUMMY);
6072        if (fs.isFlushSucceeded()) {
6073          seqId = ((FlushResultImpl)fs).flushSequenceId;
6074        } else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) {
6075          seqId = ((FlushResultImpl)fs).flushSequenceId;
6076        } else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH) {
6077          // CANNOT_FLUSH may mean that a flush is already on-going
6078          // we need to wait for that flush to complete
6079          waitForFlushes();
6080        } else {
6081          throw new IOException("Could not bulk load with an assigned sequential ID because the "+
6082            "flush didn't run. Reason for not flushing: " + ((FlushResultImpl)fs).failureReason);
6083        }
6084      }
6085
6086      Map<byte[], List<Pair<Path, Path>>> familyWithFinalPath =
6087          new TreeMap<>(Bytes.BYTES_COMPARATOR);
6088      for (Pair<byte[], String> p : familyPaths) {
6089        byte[] familyName = p.getFirst();
6090        String path = p.getSecond();
6091        HStore store = getStore(familyName);
6092        if (!familyWithFinalPath.containsKey(familyName)) {
6093          familyWithFinalPath.put(familyName, new ArrayList<>());
6094        }
6095        List<Pair<Path, Path>> lst = familyWithFinalPath.get(familyName);
6096        try {
6097          String finalPath = path;
6098          if (bulkLoadListener != null) {
6099            finalPath = bulkLoadListener.prepareBulkLoad(familyName, path, copyFile);
6100          }
6101          Pair<Path, Path> pair = store.preBulkLoadHFile(finalPath, seqId);
6102          lst.add(pair);
6103        } catch (IOException ioe) {
6104          // A failure here can cause an atomicity violation that we currently
6105          // cannot recover from since it is likely a failed HDFS operation.
6106
6107          LOG.error("There was a partial failure due to IO when attempting to" +
6108              " load " + Bytes.toString(p.getFirst()) + " : " + p.getSecond(), ioe);
6109          if (bulkLoadListener != null) {
6110            try {
6111              bulkLoadListener.failedBulkLoad(familyName, path);
6112            } catch (Exception ex) {
6113              LOG.error("Error while calling failedBulkLoad for family " +
6114                  Bytes.toString(familyName) + " with path " + path, ex);
6115            }
6116          }
6117          throw ioe;
6118        }
6119      }
6120
6121      if (this.getCoprocessorHost() != null) {
6122        for (Map.Entry<byte[], List<Pair<Path, Path>>> entry : familyWithFinalPath.entrySet()) {
6123          this.getCoprocessorHost().preCommitStoreFile(entry.getKey(), entry.getValue());
6124        }
6125      }
6126      for (Map.Entry<byte[], List<Pair<Path, Path>>> entry : familyWithFinalPath.entrySet()) {
6127        byte[] familyName = entry.getKey();
6128        for (Pair<Path, Path> p : entry.getValue()) {
6129          String path = p.getFirst().toString();
          Path committedStoreFile = p.getSecond();
          HStore store = getStore(familyName);
          try {
            store.bulkLoadHFile(familyName, path, committedStoreFile);
            // Note the size of the store file
            try {
              FileSystem fs = committedStoreFile.getFileSystem(baseConf);
              storeFilesSizes.put(committedStoreFile.getName(),
                  fs.getFileStatus(committedStoreFile).getLen());
            } catch (IOException e) {
              LOG.warn("Failed to find the size of hfile " + committedStoreFile, e);
              storeFilesSizes.put(committedStoreFile.getName(), 0L);
            }

            if (storeFiles.containsKey(familyName)) {
              storeFiles.get(familyName).add(committedStoreFile);
            } else {
              List<Path> storeFileNames = new ArrayList<>();
              storeFileNames.add(committedStoreFile);
6149              storeFiles.put(familyName, storeFileNames);
6150            }
6151            if (bulkLoadListener != null) {
6152              bulkLoadListener.doneBulkLoad(familyName, path);
6153            }
6154          } catch (IOException ioe) {
6155            // A failure here can cause an atomicity violation that we currently
6156            // cannot recover from since it is likely a failed HDFS operation.
6157
6158            // TODO Need a better story for reverting partial failures due to HDFS.
6159            LOG.error("There was a partial failure due to IO when attempting to" +
6160                " load " + Bytes.toString(familyName) + " : " + p.getSecond(), ioe);
6161            if (bulkLoadListener != null) {
6162              try {
6163                bulkLoadListener.failedBulkLoad(familyName, path);
6164              } catch (Exception ex) {
6165                LOG.error("Error while calling failedBulkLoad for family " +
6166                    Bytes.toString(familyName) + " with path " + path, ex);
6167              }
6168            }
6169            throw ioe;
6170          }
6171        }
6172      }
6173
6174      isSuccessful = true;
6175    } finally {
6176      if (wal != null && !storeFiles.isEmpty()) {
6177        // Write a bulk load event for hfiles that are loaded
6178        try {
6179          WALProtos.BulkLoadDescriptor loadDescriptor =
6180              ProtobufUtil.toBulkLoadDescriptor(this.getRegionInfo().getTable(),
6181                  UnsafeByteOperations.unsafeWrap(this.getRegionInfo().getEncodedNameAsBytes()),
6182                  storeFiles,
6183                storeFilesSizes, seqId);
6184          WALUtil.writeBulkLoadMarkerAndSync(this.wal, this.getReplicationScope(), getRegionInfo(),
6185              loadDescriptor, mvcc);
6186        } catch (IOException ioe) {
6187          if (this.rsServices != null) {
            // Have to abort the region server because some hfiles were loaded but we can't write
6189            // the event into WAL
6190            isSuccessful = false;
6191            this.rsServices.abort("Failed to write bulk load event into WAL.", ioe);
6192          }
6193        }
6194      }
6195
6196      closeBulkRegionOperation();
6197    }
6198    return isSuccessful ? storeFiles : null;
6199  }
6200
6201  @Override
6202  public boolean equals(Object o) {
6203    return o instanceof HRegion && Bytes.equals(getRegionInfo().getRegionName(),
6204                                                ((HRegion) o).getRegionInfo().getRegionName());
6205  }
6206
6207  @Override
6208  public int hashCode() {
6209    return Bytes.hashCode(getRegionInfo().getRegionName());
6210  }
6211
6212  @Override
6213  public String toString() {
6214    return getRegionInfo().getRegionNameAsString();
6215  }
6216
6217  /**
6218   * RegionScannerImpl is used to combine scanners from multiple Stores (aka column families).
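   * <p>A minimal, illustrative read loop (scan setup elided; the scanner comes from
   * {@code getScanner(Scan)}):
   * <pre>{@code
   * try (RegionScanner scanner = region.getScanner(new Scan())) {
   *   List<Cell> cells = new ArrayList<>();
   *   boolean moreRows;
   *   do {
   *     cells.clear();
   *     moreRows = scanner.next(cells);
   *     // process the cells returned for this row / batch
   *   } while (moreRows);
   * }
   * }</pre>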
6219   */
6220  class RegionScannerImpl
6221      implements RegionScanner, Shipper, org.apache.hadoop.hbase.ipc.RpcCallback {
6222    // Package local for testability
6223    KeyValueHeap storeHeap = null;
6224    /** Heap of key-values that are not essential for the provided filters and are thus read
6225     * on demand, if on-demand column family loading is enabled.*/
6226    KeyValueHeap joinedHeap = null;
6227    /**
6228     * If the joined heap data gathering is interrupted due to scan limits, this will
6229     * contain the row for which we are populating the values.*/
6230    protected Cell joinedContinuationRow = null;
6231    private boolean filterClosed = false;
6232
6233    protected final byte[] stopRow;
6234    protected final boolean includeStopRow;
6235    protected final HRegion region;
6236    protected final CellComparator comparator;
6237
6238    private final long readPt;
6239    private final long maxResultSize;
6240    private final ScannerContext defaultScannerContext;
6241    private final FilterWrapper filter;
6242
6243    @Override
6244    public RegionInfo getRegionInfo() {
6245      return region.getRegionInfo();
6246    }
6247
6248    RegionScannerImpl(Scan scan, List<KeyValueScanner> additionalScanners, HRegion region)
6249        throws IOException {
6250      this(scan, additionalScanners, region, HConstants.NO_NONCE, HConstants.NO_NONCE);
6251    }
6252
6253    RegionScannerImpl(Scan scan, List<KeyValueScanner> additionalScanners, HRegion region,
6254        long nonceGroup, long nonce) throws IOException {
6255      this.region = region;
6256      this.maxResultSize = scan.getMaxResultSize();
6257      if (scan.hasFilter()) {
6258        this.filter = new FilterWrapper(scan.getFilter());
6259      } else {
6260        this.filter = null;
6261      }
6262      this.comparator = region.getCellComparator();
6263      /**
6264       * By default, calls to next/nextRaw must enforce the batch limit. Thus, construct a default
6265       * scanner context that can be used to enforce the batch limit in the event that a
6266       * ScannerContext is not specified during an invocation of next/nextRaw
6267       */
6268      defaultScannerContext = ScannerContext.newBuilder()
6269          .setBatchLimit(scan.getBatch()).build();
6270      this.stopRow = scan.getStopRow();
6271      this.includeStopRow = scan.includeStopRow();
6272
6273      // synchronize on scannerReadPoints so that nobody calculates
6274      // getSmallestReadPoint, before scannerReadPoints is updated.
6275      IsolationLevel isolationLevel = scan.getIsolationLevel();
6276      long mvccReadPoint = PackagePrivateFieldAccessor.getMvccReadPoint(scan);
6277      synchronized (scannerReadPoints) {
6278        if (mvccReadPoint > 0) {
6279          this.readPt = mvccReadPoint;
6280        } else if (nonce == HConstants.NO_NONCE || rsServices == null
6281            || rsServices.getNonceManager() == null) {
6282          this.readPt = getReadPoint(isolationLevel);
6283        } else {
6284          this.readPt = rsServices.getNonceManager().getMvccFromOperationContext(nonceGroup, nonce);
6285        }
6286        scannerReadPoints.put(this, this.readPt);
6287      }
6288      initializeScanners(scan, additionalScanners);
6289    }
6290
6291    protected void initializeScanners(Scan scan, List<KeyValueScanner> additionalScanners)
6292        throws IOException {
      // Here we separate all scanners into two lists - scanners that provide data required
6294      // by the filter to operate (scanners list) and all others (joinedScanners list).
6295      List<KeyValueScanner> scanners = new ArrayList<>(scan.getFamilyMap().size());
6296      List<KeyValueScanner> joinedScanners = new ArrayList<>(scan.getFamilyMap().size());
6297      // Store all already instantiated scanners for exception handling
6298      List<KeyValueScanner> instantiatedScanners = new ArrayList<>();
6299      // handle additionalScanners
6300      if (additionalScanners != null && !additionalScanners.isEmpty()) {
6301        scanners.addAll(additionalScanners);
6302        instantiatedScanners.addAll(additionalScanners);
6303      }
6304
6305      try {
6306        for (Map.Entry<byte[], NavigableSet<byte[]>> entry : scan.getFamilyMap().entrySet()) {
6307          HStore store = stores.get(entry.getKey());
6308          KeyValueScanner scanner = store.getScanner(scan, entry.getValue(), this.readPt);
6309          instantiatedScanners.add(scanner);
6310          if (this.filter == null || !scan.doLoadColumnFamiliesOnDemand()
6311              || this.filter.isFamilyEssential(entry.getKey())) {
6312            scanners.add(scanner);
6313          } else {
6314            joinedScanners.add(scanner);
6315          }
6316        }
6317        initializeKVHeap(scanners, joinedScanners, region);
6318      } catch (Throwable t) {
6319        throw handleException(instantiatedScanners, t);
6320      }
6321    }
6322
6323    protected void initializeKVHeap(List<KeyValueScanner> scanners,
6324        List<KeyValueScanner> joinedScanners, HRegion region)
6325        throws IOException {
6326      this.storeHeap = new KeyValueHeap(scanners, comparator);
6327      if (!joinedScanners.isEmpty()) {
6328        this.joinedHeap = new KeyValueHeap(joinedScanners, comparator);
6329      }
6330    }
6331
6332    private IOException handleException(List<KeyValueScanner> instantiatedScanners,
6333        Throwable t) {
      // remove the scanner read point before throwing the exception
6335      scannerReadPoints.remove(this);
6336      if (storeHeap != null) {
6337        storeHeap.close();
6338        storeHeap = null;
6339        if (joinedHeap != null) {
6340          joinedHeap.close();
6341          joinedHeap = null;
6342        }
6343      } else {
6344        // close all already instantiated scanners before throwing the exception
6345        for (KeyValueScanner scanner : instantiatedScanners) {
6346          scanner.close();
6347        }
6348      }
6349      return t instanceof IOException ? (IOException) t : new IOException(t);
6350    }
6351
6352    @Override
6353    public long getMaxResultSize() {
6354      return maxResultSize;
6355    }
6356
6357    @Override
6358    public long getMvccReadPoint() {
6359      return this.readPt;
6360    }
6361
6362    @Override
6363    public int getBatch() {
6364      return this.defaultScannerContext.getBatchLimit();
6365    }
6366
6367    /**
6368     * Reset both the filter and the old filter.
6369     *
6370     * @throws IOException in case a filter raises an I/O exception.
6371     */
6372    protected void resetFilters() throws IOException {
6373      if (filter != null) {
6374        filter.reset();
6375      }
6376    }
6377
6378    @Override
6379    public boolean next(List<Cell> outResults)
6380        throws IOException {
6381      // apply the batching limit by default
6382      return next(outResults, defaultScannerContext);
6383    }
6384
6385    @Override
6386    public synchronized boolean next(List<Cell> outResults, ScannerContext scannerContext)
6387    throws IOException {
6388      if (this.filterClosed) {
6389        throw new UnknownScannerException("Scanner was closed (timed out?) " +
6390            "after we renewed it. Could be caused by a very slow scanner " +
6391            "or a lengthy garbage collection");
6392      }
6393      startRegionOperation(Operation.SCAN);
6394      try {
6395        return nextRaw(outResults, scannerContext);
6396      } finally {
6397        closeRegionOperation(Operation.SCAN);
6398      }
6399    }
6400
6401    @Override
6402    public boolean nextRaw(List<Cell> outResults) throws IOException {
6403      // Use the RegionScanner's context by default
6404      return nextRaw(outResults, defaultScannerContext);
6405    }
6406
6407    @Override
6408    public boolean nextRaw(List<Cell> outResults, ScannerContext scannerContext)
6409        throws IOException {
6410      if (storeHeap == null) {
6411        // scanner is closed
6412        throw new UnknownScannerException("Scanner was closed");
6413      }
6414      boolean moreValues = false;
6415      if (outResults.isEmpty()) {
6416        // Usually outResults is empty. This is true when next is called
6417        // to handle a scan or get operation.
6418        moreValues = nextInternal(outResults, scannerContext);
6419      } else {
6420        List<Cell> tmpList = new ArrayList<>();
6421        moreValues = nextInternal(tmpList, scannerContext);
6422        outResults.addAll(tmpList);
6423      }
6424
6425      if (!outResults.isEmpty()) {
6426        readRequestsCount.increment();
6427      }
6428
6429      // If the size limit was reached it means a partial Result is being returned. Returning a
6430      // partial Result means that we should not reset the filters; filters should only be reset in
6431      // between rows
6432      if (!scannerContext.mayHaveMoreCellsInRow()) {
6433        resetFilters();
6434      }
6435
6436      if (isFilterDoneInternal()) {
6437        moreValues = false;
6438      }
6439      return moreValues;
6440    }
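
        // An illustrative caller pattern for nextRaw above (a sketch only; scanner, scannerContext
        // and the shipping step are assumed, not defined here). Per the RegionScanner contract the
        // caller wraps calls in startRegionOperation()/closeRegionOperation() and synchronizes on
        // the scanner:
        //
        //   List<Cell> cells = new ArrayList<>();
        //   boolean moreRows;
        //   do {
        //     moreRows = scanner.nextRaw(cells, scannerContext);
        //     // ... ship the accumulated cells to the client ...
        //     cells.clear();
        //   } while (moreRows);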
6441
6442    /**
6443     * @return true if more cells exist after this batch, false if scanner is done
6444     */
6445    private boolean populateFromJoinedHeap(List<Cell> results, ScannerContext scannerContext)
6446            throws IOException {
6447      assert joinedContinuationRow != null;
6448      boolean moreValues = populateResult(results, this.joinedHeap, scannerContext,
6449          joinedContinuationRow);
6450
6451      if (!scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
6452        // We are done with this row, reset the continuation.
6453        joinedContinuationRow = null;
6454      }
6455      // As the data is obtained from two independent heaps, we need to
6456      // ensure that the result list is sorted, because Result relies on that.
6457      sort(results, comparator);
6458      return moreValues;
6459    }
6460
6461    /**
6462     * Fetches records with currentRow into results list, until next row, batchLimit (if not -1) is
6463     * reached, or remainingResultSize (if not -1) is reached.
6464     * @param heap KeyValueHeap to fetch data from. It must be positioned on correct row before call.
6465     * @param scannerContext context tracking the batch, size and time limits for this scan
6466     * @param currentRowCell a cell from the row currently being populated
6467     * @return state of last call to {@link KeyValueHeap#next()}
6468     */
6469    private boolean populateResult(List<Cell> results, KeyValueHeap heap,
6470        ScannerContext scannerContext, Cell currentRowCell) throws IOException {
6471      Cell nextKv;
6472      boolean moreCellsInRow = false;
6473      boolean tmpKeepProgress = scannerContext.getKeepProgress();
6474      // Scanning between column families and thus the scope is between cells
6475      LimitScope limitScope = LimitScope.BETWEEN_CELLS;
6476      do {
6477        // We want to maintain any progress that is made towards the limits while scanning across
6478        // different column families. To do this, we toggle the keep progress flag on during calls
6479        // to the StoreScanner to ensure that any progress made thus far is not wiped away.
6480        scannerContext.setKeepProgress(true);
6481        heap.next(results, scannerContext);
6482        scannerContext.setKeepProgress(tmpKeepProgress);
6483
6484        nextKv = heap.peek();
6485        moreCellsInRow = moreCellsInRow(nextKv, currentRowCell);
6486        if (!moreCellsInRow) incrementCountOfRowsScannedMetric(scannerContext);
6487        if (moreCellsInRow && scannerContext.checkBatchLimit(limitScope)) {
6488          return scannerContext.setScannerState(NextState.BATCH_LIMIT_REACHED).hasMoreValues();
6489        } else if (scannerContext.checkSizeLimit(limitScope)) {
6490          ScannerContext.NextState state =
6491              moreCellsInRow ? NextState.SIZE_LIMIT_REACHED_MID_ROW : NextState.SIZE_LIMIT_REACHED;
6492          return scannerContext.setScannerState(state).hasMoreValues();
6493        } else if (scannerContext.checkTimeLimit(limitScope)) {
6494          ScannerContext.NextState state =
6495              moreCellsInRow ? NextState.TIME_LIMIT_REACHED_MID_ROW : NextState.TIME_LIMIT_REACHED;
6496          return scannerContext.setScannerState(state).hasMoreValues();
6497        }
6498      } while (moreCellsInRow);
6499      return nextKv != null;
6500    }
6501
6502    /**
6503     * Based on the nextKv in the heap, and the current row, decide whether or not there are more
6504     * cells to be read in the heap. If the row of the nextKv in the heap matches the current row
6505     * then there are more cells to be read in the row.
6506     * @param nextKv the cell currently at the top of the heap
6507     * @param currentRowCell a cell from the row currently being read
6508     * @return true When there are more cells in the row to be read
6509     */
6510    private boolean moreCellsInRow(final Cell nextKv, Cell currentRowCell) {
6511      return nextKv != null && CellUtil.matchingRows(nextKv, currentRowCell);
6512    }
6513
6514    /*
6515     * @return True if a filter rules that the scan is over, i.e. the scanner is done.
6516     */
6517    @Override
6518    public synchronized boolean isFilterDone() throws IOException {
6519      return isFilterDoneInternal();
6520    }
6521
6522    private boolean isFilterDoneInternal() throws IOException {
6523      return this.filter != null && this.filter.filterAllRemaining();
6524    }
6525
6526    private boolean nextInternal(List<Cell> results, ScannerContext scannerContext)
6527        throws IOException {
6528      if (!results.isEmpty()) {
6529        throw new IllegalArgumentException("First parameter should be an empty list");
6530      }
6531      if (scannerContext == null) {
6532        throw new IllegalArgumentException("Scanner context cannot be null");
6533      }
6534      Optional<RpcCall> rpcCall = RpcServer.getCurrentCall();
6535
6536      // Save the initial progress from the Scanner context in these local variables. The progress
6537      // may need to be reset a few times if rows are being filtered out so we save the initial
6538      // progress.
6539      int initialBatchProgress = scannerContext.getBatchProgress();
6540      long initialSizeProgress = scannerContext.getDataSizeProgress();
6541      long initialHeapSizeProgress = scannerContext.getHeapSizeProgress();
6542
6543      // Used to check time limit
6544      LimitScope limitScope = LimitScope.BETWEEN_CELLS;
6545
6546      // The loop here is used only when at some point during the next we determine
6547      // that due to effects of filters or otherwise, we have an empty row in the result.
6548      // Then we loop and try again. Otherwise, we must get out on the first iteration via return,
6549      // "true" if there's more data to read, "false" if there isn't (storeHeap is at a stop row,
6550      // and joinedHeap has no more data to read for the last row (if set, joinedContinuationRow)).
6551      while (true) {
6552        // Starting to scan a new row. Reset the scanner progress according to whether or not
6553        // progress should be kept.
6554        if (scannerContext.getKeepProgress()) {
6555          // Progress should be kept. Reset to initial values seen at start of method invocation.
6556          scannerContext.setProgress(initialBatchProgress, initialSizeProgress,
6557              initialHeapSizeProgress);
6558        } else {
6559          scannerContext.clearProgress();
6560        }
6561        if (rpcCall.isPresent()) {
6562          // If a user specifies a too-restrictive or too-slow scanner, the
6563          // client might time out and disconnect while the server side
6564          // is still processing the request. We should abort aggressively
6565          // in that case.
6566          long afterTime = rpcCall.get().disconnectSince();
6567          if (afterTime >= 0) {
6568            throw new CallerDisconnectedException(
6569                "Aborting on region " + getRegionInfo().getRegionNameAsString() + ", call " +
6570                    this + " after " + afterTime + " ms, since " +
6571                    "caller disconnected");
6572          }
6573        }
6574
6575        // Let's see what we have in the storeHeap.
6576        Cell current = this.storeHeap.peek();
6577
6578        boolean shouldStop = shouldStop(current);
6579        // When hasFilterRow is true it means that all the cells for a particular row must be
6580        // read before a filtering decision can be made. This means that filters where hasFilterRow
6581        // returns true run the risk of encountering out of memory errors in the case that they are
6582        // applied to a table that has very large rows.
6583        boolean hasFilterRow = this.filter != null && this.filter.hasFilterRow();
6584
6585        // If filter#hasFilterRow is true, partial results are not allowed since allowing them
6586        // would prevent the filters from being evaluated. Thus, if it is true, change the
6587        // scope of any limits that could potentially create partial results to
6588        // LimitScope.BETWEEN_ROWS so that those limits are not reached mid-row
6589        if (hasFilterRow) {
6590          if (LOG.isTraceEnabled()) {
6591            LOG.trace("filter#hasFilterRow is true which prevents partial results from being "
6592                + " formed. Changing scope of limits that may create partials");
6593          }
6594          scannerContext.setSizeLimitScope(LimitScope.BETWEEN_ROWS);
6595          scannerContext.setTimeLimitScope(LimitScope.BETWEEN_ROWS);
6596          limitScope = LimitScope.BETWEEN_ROWS;
6597        }
6598
6599        if (scannerContext.checkTimeLimit(LimitScope.BETWEEN_CELLS)) {
6600          if (hasFilterRow) {
6601            throw new IncompatibleFilterException(
6602                "Filter whose hasFilterRow() returns true is incompatible with scans that must " +
6603                    " stop mid-row because of a limit. ScannerContext:" + scannerContext);
6604          }
6605          return true;
6606        }
6607
6608        // Check if we were getting data from the joinedHeap and hit the limit.
6609        // If not, then it's main path - getting results from storeHeap.
6610        if (joinedContinuationRow == null) {
6611          // First, check if we are at a stop row. If so, there are no more results.
6612          if (shouldStop) {
6613            if (hasFilterRow) {
6614              filter.filterRowCells(results);
6615            }
6616            return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
6617          }
6618
6619          // Check if rowkey filter wants to exclude this row. If so, loop to next.
6620          // Technically, if we hit limits before on this row, we don't need this call.
6621          if (filterRowKey(current)) {
6622            incrementCountOfRowsFilteredMetric(scannerContext);
6623            // early check, see HBASE-16296
6624            if (isFilterDoneInternal()) {
6625              return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
6626            }
6627            // Typically the count of rows scanned is incremented inside #populateResult. However,
6628            // here we are filtering a row based purely on its row key, preventing us from calling
6629            // #populateResult. Thus, perform the necessary increment here to rows scanned metric
6630            incrementCountOfRowsScannedMetric(scannerContext);
6631            boolean moreRows = nextRow(scannerContext, current);
6632            if (!moreRows) {
6633              return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
6634            }
6635            results.clear();
6636
6637            // Read nothing as the rowkey was filtered, but still need to check time limit
6638            if (scannerContext.checkTimeLimit(limitScope)) {
6639              return true;
6640            }
6641            continue;
6642          }
6643
6644          // Ok, we are good, let's try to get some results from the main heap.
6645          populateResult(results, this.storeHeap, scannerContext, current);
6646          if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
6647            if (hasFilterRow) {
6648              throw new IncompatibleFilterException(
6649                  "Filter whose hasFilterRow() returns true is incompatible with scans that must "
6650                      + " stop mid-row because of a limit. ScannerContext:" + scannerContext);
6651            }
6652            return true;
6653          }
6654
6655          Cell nextKv = this.storeHeap.peek();
6656          shouldStop = shouldStop(nextKv);
6657          // save that the row was empty before filters were applied to it.
6658          final boolean isEmptyRow = results.isEmpty();
6659
6660          // We have the part of the row necessary for filtering (all of it, usually).
6661          // First filter with the filterRow(List).
6662          FilterWrapper.FilterRowRetCode ret = FilterWrapper.FilterRowRetCode.NOT_CALLED;
6663          if (hasFilterRow) {
6664            ret = filter.filterRowCellsWithRet(results);
6665
6666            // We don't know how the results have changed after being filtered. Must set progress
6667            // according to contents of results now.
6668            if (scannerContext.getKeepProgress()) {
6669              scannerContext.setProgress(initialBatchProgress, initialSizeProgress,
6670                  initialHeapSizeProgress);
6671            } else {
6672              scannerContext.clearProgress();
6673            }
6674            scannerContext.incrementBatchProgress(results.size());
6675            for (Cell cell : results) {
6676              scannerContext.incrementSizeProgress(PrivateCellUtil.estimatedSerializedSizeOf(cell),
6677                PrivateCellUtil.estimatedSizeOfCell(cell));
6678            }
6679          }
6680
6681          if (isEmptyRow || ret == FilterWrapper.FilterRowRetCode.EXCLUDE || filterRow()) {
6682            incrementCountOfRowsFilteredMetric(scannerContext);
6683            results.clear();
6684            boolean moreRows = nextRow(scannerContext, current);
6685            if (!moreRows) {
6686              return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
6687            }
6688
6689            // This row was totally filtered out, if this is NOT the last row,
6690            // we should continue on. Otherwise, nothing else to do.
6691            if (!shouldStop) {
6692              // Read nothing as the cells were filtered, but still need to check the time limit
6693              if (scannerContext.checkTimeLimit(limitScope)) {
6694                return true;
6695              }
6696              continue;
6697            }
6698            return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
6699          }
6700
6701          // Ok, we are done with storeHeap for this row.
6702          // Now we may need to fetch additional, non-essential data into row.
6703          // These values are not needed for filter to work, so we postpone their
6704          // fetch to (possibly) reduce amount of data loads from disk.
6705          if (this.joinedHeap != null) {
6706            boolean mayHaveData = joinedHeapMayHaveData(current);
6707            if (mayHaveData) {
6708              joinedContinuationRow = current;
6709              populateFromJoinedHeap(results, scannerContext);
6710
6711              if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
6712                return true;
6713              }
6714            }
6715          }
6716        } else {
6717          // Populating from the joined heap was stopped by limits, populate some more.
6718          populateFromJoinedHeap(results, scannerContext);
6719          if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
6720            return true;
6721          }
6722        }
6723        // We may have just called populateFromJoinedHeap and hit the limits. If that is
6724        // the case, we need to call it again on the next next() invocation.
6725        if (joinedContinuationRow != null) {
6726          return scannerContext.setScannerState(NextState.MORE_VALUES).hasMoreValues();
6727        }
6728
6729        // Finally, we are done with both joinedHeap and storeHeap.
6730        // Double check to prevent empty rows from appearing in result. It could be
6731        // the case when SingleColumnValueExcludeFilter is used.
6732        if (results.isEmpty()) {
6733          incrementCountOfRowsFilteredMetric(scannerContext);
6734          boolean moreRows = nextRow(scannerContext, current);
6735          if (!moreRows) {
6736            return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
6737          }
6738          if (!shouldStop) continue;
6739        }
6740
6741        if (shouldStop) {
6742          return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
6743        } else {
6744          return scannerContext.setScannerState(NextState.MORE_VALUES).hasMoreValues();
6745        }
6746      }
6747    }
6748
6749    protected void incrementCountOfRowsFilteredMetric(ScannerContext scannerContext) {
6750      filteredReadRequestsCount.increment();
6751
6752      if (scannerContext == null || !scannerContext.isTrackingMetrics()) return;
6753
6754      scannerContext.getMetrics().countOfRowsFiltered.incrementAndGet();
6755    }
6756
6757    protected void incrementCountOfRowsScannedMetric(ScannerContext scannerContext) {
6758      if (scannerContext == null || !scannerContext.isTrackingMetrics()) return;
6759
6760      scannerContext.getMetrics().countOfRowsScanned.incrementAndGet();
6761    }
6762
6763    /**
6764     * @param currentRowCell a cell from the row currently being read
6765     * @return true when the joined heap may have data for the current row
6766     * @throws IOException if seeking in the joined heap fails
6767     */
6768    private boolean joinedHeapMayHaveData(Cell currentRowCell)
6769        throws IOException {
6770      Cell nextJoinedKv = joinedHeap.peek();
6771      boolean matchCurrentRow =
6772          nextJoinedKv != null && CellUtil.matchingRows(nextJoinedKv, currentRowCell);
6773      boolean matchAfterSeek = false;
6774
6775      // If the next value in the joined heap does not match the current row, try to seek to the
6776      // correct row
6777      if (!matchCurrentRow) {
6778        Cell firstOnCurrentRow = PrivateCellUtil.createFirstOnRow(currentRowCell);
6779        boolean seekSuccessful = this.joinedHeap.requestSeek(firstOnCurrentRow, true, true);
6780        matchAfterSeek =
6781            seekSuccessful && joinedHeap.peek() != null
6782                && CellUtil.matchingRows(joinedHeap.peek(), currentRowCell);
6783      }
6784
6785      return matchCurrentRow || matchAfterSeek;
6786    }
6787
6788    /**
6789     * This function is to maintain backward compatibility for 0.94 filters. HBASE-6429 combines
6790     * both the filterRow & filterRow({@code List<KeyValue> kvs}) functions. Code written for 0.94
6791     * or older may not implement hasFilterRow as HBASE-6429 expects, because in 0.94 hasFilterRow()
6792     * only returns true when filterRow({@code List<KeyValue> kvs}) is overridden, not when
6793     * filterRow() is. Therefore, the filterRow() would otherwise be skipped.
6794     */
6795    private boolean filterRow() throws IOException {
6796      // when hasFilterRow returns true, filter.filterRow() will be called automatically inside
6797      // filterRowCells(List<Cell> kvs) so we skip that scenario here.
6798      return filter != null && (!filter.hasFilterRow())
6799          && filter.filterRow();
6800    }
6801
6802    private boolean filterRowKey(Cell current) throws IOException {
6803      return filter != null && filter.filterRowKey(current);
6804    }
6805
6806    protected boolean nextRow(ScannerContext scannerContext, Cell curRowCell) throws IOException {
6807      assert this.joinedContinuationRow == null: "Trying to go to next row during joinedHeap read.";
6808      Cell next;
6809      while ((next = this.storeHeap.peek()) != null &&
6810             CellUtil.matchingRows(next, curRowCell)) {
6811        this.storeHeap.next(MOCKED_LIST);
6812      }
6813      resetFilters();
6814
6815      // Calling the hook in CP which allows it to do a fast forward
6816      return this.region.getCoprocessorHost() == null
6817          || this.region.getCoprocessorHost()
6818              .postScannerFilterRow(this, curRowCell);
6819    }
6820
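    // shouldStop() below compares the current row against the scan's stop row: with
    // includeStopRow == false the scan stops once the current row sorts at or after stopRow
    // (the stop row itself is excluded); with includeStopRow == true it stops only once the
    // current row sorts strictly after stopRow, so the stop row is still returned. A null or
    // empty stop row means the scan runs to the end of the region.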
6821    protected boolean shouldStop(Cell currentRowCell) {
6822      if (currentRowCell == null) {
6823        return true;
6824      }
6825      if (stopRow == null || Bytes.equals(stopRow, HConstants.EMPTY_END_ROW)) {
6826        return false;
6827      }
6828      int c = comparator.compareRows(currentRowCell, stopRow, 0, stopRow.length);
6829      return c > 0 || (c == 0 && !includeStopRow);
6830    }
6831
6832    @Override
6833    public synchronized void close() {
6834      if (storeHeap != null) {
6835        storeHeap.close();
6836        storeHeap = null;
6837      }
6838      if (joinedHeap != null) {
6839        joinedHeap.close();
6840        joinedHeap = null;
6841      }
6842      // no need to synchronize here.
6843      scannerReadPoints.remove(this);
6844      this.filterClosed = true;
6845    }
6846
6847    KeyValueHeap getStoreHeapForTesting() {
6848      return storeHeap;
6849    }
6850
6851    @Override
6852    public synchronized boolean reseek(byte[] row) throws IOException {
6853      if (row == null) {
6854        throw new IllegalArgumentException("Row cannot be null.");
6855      }
6856      boolean result = false;
6857      startRegionOperation();
6858      Cell kv = PrivateCellUtil.createFirstOnRow(row, 0, (short) row.length);
6859      try {
6860        // use request seek to make use of the lazy seek option. See HBASE-5520
6861        result = this.storeHeap.requestSeek(kv, true, true);
6862        if (this.joinedHeap != null) {
6863          result = this.joinedHeap.requestSeek(kv, true, true) || result;
6864        }
6865      } finally {
6866        closeRegionOperation();
6867      }
6868      return result;
6869    }
6870
6871    @Override
6872    public void shipped() throws IOException {
6873      if (storeHeap != null) {
6874        storeHeap.shipped();
6875      }
6876      if (joinedHeap != null) {
6877        joinedHeap.shipped();
6878      }
6879    }
6880
6881    @Override
6882    public void run() throws IOException {
6883      // This is the RPC callback method executed. We do the close of the scanner in this
6884      // callback method.
6885      this.close();
6886    }
6887  }
6888
6889  // Utility methods
6890  /**
6891   * A utility method to create new instances of HRegion based on the
6892   * {@link HConstants#REGION_IMPL} configuration property.
6893   * @param tableDir qualified path of directory where region should be located,
6894   * usually the table directory.
6895   * @param wal The WAL is the outbound log for any updates to the HRegion
6896   * The wal file is a logfile from the previous execution that's
6897   * custom-computed for this HRegion. The HRegionServer computes and sorts the
6898   * appropriate wal info for this HRegion. If there is a previous file
6899   * (implying that the HRegion has been written-to before), then read it from
6900   * the supplied path.
6901   * @param fs is the filesystem.
6902   * @param conf is global configuration settings.
6903   * @param regionInfo - RegionInfo that describes the region
6904   * @param htd the table descriptor
6905   * @param rsServices reference to {@link RegionServerServices} or null
6906   * @return the new instance
6907   */
6908  static HRegion newHRegion(Path tableDir, WAL wal, FileSystem fs,
6909      Configuration conf, RegionInfo regionInfo, final TableDescriptor htd,
6910      RegionServerServices rsServices) {
6911    try {
6912      @SuppressWarnings("unchecked")
6913      Class<? extends HRegion> regionClass =
6914          (Class<? extends HRegion>) conf.getClass(HConstants.REGION_IMPL, HRegion.class);
6915
6916      Constructor<? extends HRegion> c =
6917          regionClass.getConstructor(Path.class, WAL.class, FileSystem.class,
6918              Configuration.class, RegionInfo.class, TableDescriptor.class,
6919              RegionServerServices.class);
6920
6921      return c.newInstance(tableDir, wal, fs, conf, regionInfo, htd, rsServices);
6922    } catch (Throwable e) {
6923      // todo: what should I throw here?
6924      throw new IllegalStateException("Could not instantiate a region instance.", e);
6925    }
6926  }
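
      // A minimal sketch of plugging in a custom region implementation via HConstants.REGION_IMPL
      // (MyInstrumentedRegion is hypothetical); the subclass must expose the
      // (Path, WAL, FileSystem, Configuration, RegionInfo, TableDescriptor, RegionServerServices)
      // constructor looked up reflectively above:
      //
      //   Configuration conf = HBaseConfiguration.create();
      //   conf.setClass(HConstants.REGION_IMPL, MyInstrumentedRegion.class, HRegion.class);
      //   // regions created/opened with this conf are now instantiated as MyInstrumentedRegion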
6927
6928  /**
6929   * Convenience method creating new HRegions. Used by createTable.
6930   *
6931   * @param info Info for region to create.
6932   * @param rootDir Root directory for HBase instance
6933   * @param wal shared WAL
6934   * @param initialize - true to initialize the region
6935   * @return new HRegion
6936   * @throws IOException
6937   */
6938  public static HRegion createHRegion(final RegionInfo info, final Path rootDir,
6939        final Configuration conf, final TableDescriptor hTableDescriptor,
6940        final WAL wal, final boolean initialize)
6941  throws IOException {
6942    LOG.info("creating " + info
6943        + ", tableDescriptor=" + (hTableDescriptor == null? "null": hTableDescriptor) +
6944        ", regionDir=" + rootDir);
6945    createRegionDir(conf, info, rootDir);
6946    FileSystem fs = rootDir.getFileSystem(conf);
6947    Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
6948    HRegion region = HRegion.newHRegion(tableDir, wal, fs, conf, info, hTableDescriptor, null);
6949    if (initialize) region.initialize(null);
6950    return region;
6951  }
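
      // A usage sketch (e.g. from a test utility); the table name, family and the wal handle are
      // illustrative, not values produced by this class:
      //
      //   TableName tn = TableName.valueOf("test_table");
      //   RegionInfo ri = RegionInfoBuilder.newBuilder(tn).build();
      //   TableDescriptor td = TableDescriptorBuilder.newBuilder(tn)
      //       .setColumnFamily(ColumnFamilyDescriptorBuilder.of("f")).build();
      //   HRegion region = HRegion.createHRegion(ri, FSUtils.getRootDir(conf), conf, td, wal, true);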
6952
6953  /**
6954   * Create the region directory in the filesystem.
6955   */
6956  public static HRegionFileSystem createRegionDir(Configuration configuration, RegionInfo ri,
6957        Path rootDir)
6958      throws IOException {
6959    FileSystem fs = rootDir.getFileSystem(configuration);
6960    Path tableDir = FSUtils.getTableDir(rootDir, ri.getTable());
6961    // If directory already exists, will log warning and keep going. Will try to create
6962    // .regioninfo. If one exists, will overwrite.
6963    return HRegionFileSystem.createRegionOnFileSystem(configuration, fs, tableDir, ri);
6964  }
6965
6966  public static HRegion createHRegion(final RegionInfo info, final Path rootDir,
6967                                      final Configuration conf,
6968                                      final TableDescriptor hTableDescriptor,
6969                                      final WAL wal)
6970    throws IOException {
6971    return createHRegion(info, rootDir, conf, hTableDescriptor, wal, true);
6972  }
6973
6974
6975  /**
6976   * Open a Region.
6977   * @param info Info for region to be opened.
6978   * @param wal WAL for region to use. This method will call
6979   * WAL#setSequenceNumber(long) passing the result of the call to
6980   * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6981   * up.  HRegionServer does this every time it opens a new region.
6982   * @return new HRegion
6983   *
6984   * @throws IOException
6985   */
6986  public static HRegion openHRegion(final RegionInfo info,
6987      final TableDescriptor htd, final WAL wal,
6988      final Configuration conf)
6989  throws IOException {
6990    return openHRegion(info, htd, wal, conf, null, null);
6991  }
6992
6993  /**
6994   * Open a Region.
6995   * @param info Info for region to be opened
6996   * @param htd the table descriptor
6997   * @param wal WAL for region to use. This method will call
6998   * WAL#setSequenceNumber(long) passing the result of the call to
6999   * HRegion#getMinSequenceId() to ensure the wal id is properly kept
7000   * up.  HRegionServer does this every time it opens a new region.
7001   * @param conf The Configuration object to use.
7002   * @param rsServices An interface we can request flushes against.
7003   * @param reporter An interface we can report progress against.
7004   * @return new HRegion
7005   *
7006   * @throws IOException
7007   */
7008  public static HRegion openHRegion(final RegionInfo info,
7009    final TableDescriptor htd, final WAL wal, final Configuration conf,
7010    final RegionServerServices rsServices,
7011    final CancelableProgressable reporter)
7012  throws IOException {
7013    return openHRegion(FSUtils.getRootDir(conf), info, htd, wal, conf, rsServices, reporter);
7014  }
7015
7016  /**
7017   * Open a Region.
7018   * @param rootDir Root directory for HBase instance
7019   * @param info Info for region to be opened.
7020   * @param htd the table descriptor
7021   * @param wal WAL for region to use. This method will call
7022   * WAL#setSequenceNumber(long) passing the result of the call to
7023   * HRegion#getMinSequenceId() to ensure the wal id is properly kept
7024   * up.  HRegionServer does this every time it opens a new region.
7025   * @param conf The Configuration object to use.
7026   * @return new HRegion
7027   * @throws IOException
7028   */
7029  public static HRegion openHRegion(Path rootDir, final RegionInfo info,
7030      final TableDescriptor htd, final WAL wal, final Configuration conf)
7031  throws IOException {
7032    return openHRegion(rootDir, info, htd, wal, conf, null, null);
7033  }
7034
7035  /**
7036   * Open a Region.
7037   * @param rootDir Root directory for HBase instance
7038   * @param info Info for region to be opened.
7039   * @param htd the table descriptor
7040   * @param wal WAL for region to use. This method will call
7041   * WAL#setSequenceNumber(long) passing the result of the call to
7042   * HRegion#getMinSequenceId() to ensure the wal id is properly kept
7043   * up.  HRegionServer does this every time it opens a new region.
7044   * @param conf The Configuration object to use.
7045   * @param rsServices An interface we can request flushes against.
7046   * @param reporter An interface we can report progress against.
7047   * @return new HRegion
7048   * @throws IOException
7049   */
7050  public static HRegion openHRegion(final Path rootDir, final RegionInfo info,
7051      final TableDescriptor htd, final WAL wal, final Configuration conf,
7052      final RegionServerServices rsServices,
7053      final CancelableProgressable reporter)
7054  throws IOException {
7055    FileSystem fs = null;
7056    if (rsServices != null) {
7057      fs = rsServices.getFileSystem();
7058    }
7059    if (fs == null) {
7060      fs = rootDir.getFileSystem(conf);
7061    }
7062    return openHRegion(conf, fs, rootDir, info, htd, wal, rsServices, reporter);
7063  }
7064
7065  /**
7066   * Open a Region.
7067   * @param conf The Configuration object to use.
7068   * @param fs Filesystem to use
7069   * @param rootDir Root directory for HBase instance
7070   * @param info Info for region to be opened.
7071   * @param htd the table descriptor
7072   * @param wal WAL for region to use. This method will call
7073   * WAL#setSequenceNumber(long) passing the result of the call to
7074   * HRegion#getMinSequenceId() to ensure the wal id is properly kept
7075   * up.  HRegionServer does this every time it opens a new region.
7076   * @return new HRegion
7077   */
7078  public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
7079      final Path rootDir, final RegionInfo info, final TableDescriptor htd, final WAL wal)
7080      throws IOException {
7081    return openHRegion(conf, fs, rootDir, info, htd, wal, null, null);
7082  }
7083
7084  /**
7085   * Open a Region.
7086   * @param conf The Configuration object to use.
7087   * @param fs Filesystem to use
7088   * @param rootDir Root directory for HBase instance
7089   * @param info Info for region to be opened.
7090   * @param htd the table descriptor
7091   * @param wal WAL for region to use. This method will call
7092   * WAL#setSequenceNumber(long) passing the result of the call to
7093   * HRegion#getMinSequenceId() to ensure the wal id is properly kept
7094   * up.  HRegionServer does this every time it opens a new region.
7095   * @param rsServices An interface we can request flushes against.
7096   * @param reporter An interface we can report progress against.
7097   * @return new HRegion
7098   */
7099  public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
7100      final Path rootDir, final RegionInfo info, final TableDescriptor htd, final WAL wal,
7101      final RegionServerServices rsServices, final CancelableProgressable reporter)
7102      throws IOException {
7103    Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
7104    return openHRegion(conf, fs, rootDir, tableDir, info, htd, wal, rsServices, reporter);
7105  }
7106
7107  /**
7108   * Open a Region.
7109   * @param conf The Configuration object to use.
7110   * @param fs Filesystem to use
7111   * @param rootDir Root directory for HBase instance
7112   * @param info Info for region to be opened.
7113   * @param htd the table descriptor
7114   * @param wal WAL for region to use. This method will call
7115   * WAL#setSequenceNumber(long) passing the result of the call to
7116   * HRegion#getMinSequenceId() to ensure the wal id is properly kept
7117   * up.  HRegionServer does this every time it opens a new region.
7118   * @param rsServices An interface we can request flushes against.
7119   * @param reporter An interface we can report progress against.
7120   * @return new HRegion
7121   */
7122  public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
7123      final Path rootDir, final Path tableDir, final RegionInfo info, final TableDescriptor htd,
7124      final WAL wal, final RegionServerServices rsServices,
7125      final CancelableProgressable reporter)
7126      throws IOException {
7127    if (info == null) throw new NullPointerException("Passed region info is null");
7128    if (LOG.isDebugEnabled()) {
7129      LOG.debug("Opening region: " + info);
7130    }
7131    HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, rsServices);
7132    return r.openHRegion(reporter);
7133  }
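
      // Typical open/close pattern against an existing region directory (a sketch; regionInfo,
      // tableDescriptor, wal and conf are assumed to already be in hand):
      //
      //   HRegion region = HRegion.openHRegion(regionInfo, tableDescriptor, wal, conf);
      //   try {
      //     // serve reads/writes against the region
      //   } finally {
      //     region.close();
      //   }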
7134
7135  @VisibleForTesting
7136  public NavigableMap<byte[], Integer> getReplicationScope() {
7137    return this.replicationScope;
7138  }
7139
7140  /**
7141   * Useful when reopening a closed region (normally for unit tests)
7142   * @param other original object
7143   * @param reporter An interface we can report progress against.
7144   * @return new HRegion
7145   */
7146  public static HRegion openHRegion(final HRegion other, final CancelableProgressable reporter)
7147      throws IOException {
7148    HRegionFileSystem regionFs = other.getRegionFileSystem();
7149    HRegion r = newHRegion(regionFs.getTableDir(), other.getWAL(), regionFs.getFileSystem(),
7150        other.baseConf, other.getRegionInfo(), other.getTableDescriptor(), null);
7151    return r.openHRegion(reporter);
7152  }
7153
7154  public static Region openHRegion(final Region other, final CancelableProgressable reporter)
7155        throws IOException {
7156    return openHRegion((HRegion)other, reporter);
7157  }
7158
7159  /**
7160   * Open HRegion.
7161   * Calls initialize and sets sequenceId.
7162   * @return Returns <code>this</code>
7163   */
7164  protected HRegion openHRegion(final CancelableProgressable reporter)
7165  throws IOException {
7166    // Refuse to open the region if we are missing local compression support
7167    checkCompressionCodecs();
7168    // Refuse to open the region if encryption configuration is incorrect or
7169    // codec support is missing
7170    checkEncryption();
7171    // Refuse to open the region if a required class cannot be loaded
7172    checkClassLoading();
7173    this.openSeqNum = initialize(reporter);
7174    this.mvcc.advanceTo(openSeqNum);
7175    // The openSeqNum must be increased every time when a region is assigned, as we rely on it to
7176    // determine whether a region has been successfully reopened. So here we always write open
7177    // marker, even if the table is read only.
7178    if (wal != null && getRegionServerServices() != null &&
7179      RegionReplicaUtil.isDefaultReplica(getRegionInfo())) {
7180      writeRegionOpenMarker(wal, openSeqNum);
7181    }
7182    return this;
7183  }
7184
7185  /**
7186   * Open a Region on a read-only file-system (like hdfs snapshots)
7187   * @param conf The Configuration object to use.
7188   * @param fs Filesystem to use
7189   * @param info Info for region to be opened.
7190   * @param htd the table descriptor
7191   * @return new HRegion
7192   */
7193  public static HRegion openReadOnlyFileSystemHRegion(final Configuration conf, final FileSystem fs,
7194      final Path tableDir, RegionInfo info, final TableDescriptor htd) throws IOException {
7195    if (info == null) {
7196      throw new NullPointerException("Passed region info is null");
7197    }
7198    if (LOG.isDebugEnabled()) {
7199      LOG.debug("Opening region (readOnly filesystem): " + info);
7200    }
7201    if (info.getReplicaId() <= 0) {
7202      info = RegionInfoBuilder.newBuilder(info).setReplicaId(1).build();
7203    }
7204    HRegion r = HRegion.newHRegion(tableDir, null, fs, conf, info, htd, null);
7205    r.writestate.setReadOnly(true);
7206    return r.openHRegion(null);
7207  }
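
      // A sketch of opening a region over a read-only layout, e.g. a restored snapshot directory
      // (restoreDir, regionInfo and tableDescriptor are assumptions, not values computed here):
      //
      //   FileSystem fs = FileSystem.get(conf);
      //   Path tableDir = FSUtils.getTableDir(restoreDir, tableDescriptor.getTableName());
      //   HRegion region =
      //       HRegion.openReadOnlyFileSystemHRegion(conf, fs, tableDir, regionInfo, tableDescriptor);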
7208
7209  public static void warmupHRegion(final RegionInfo info,
7210      final TableDescriptor htd, final WAL wal, final Configuration conf,
7211      final RegionServerServices rsServices,
7212      final CancelableProgressable reporter)
7213      throws IOException {
7214
7215    if (info == null) throw new NullPointerException("Passed region info is null");
7216
7217    if (LOG.isDebugEnabled()) {
7218      LOG.debug("HRegion.Warming up region: " + info);
7219    }
7220
7221    Path rootDir = FSUtils.getRootDir(conf);
7222    Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
7223
7224    FileSystem fs = null;
7225    if (rsServices != null) {
7226      fs = rsServices.getFileSystem();
7227    }
7228    if (fs == null) {
7229      fs = rootDir.getFileSystem(conf);
7230    }
7231
7232    HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, null);
7233    r.initializeWarmup(reporter);
7234  }
7235
7236
7237  private void checkCompressionCodecs() throws IOException {
7238    for (ColumnFamilyDescriptor fam: this.htableDescriptor.getColumnFamilies()) {
7239      CompressionTest.testCompression(fam.getCompressionType());
7240      CompressionTest.testCompression(fam.getCompactionCompressionType());
7241    }
7242  }
7243
7244  private void checkEncryption() throws IOException {
7245    for (ColumnFamilyDescriptor fam: this.htableDescriptor.getColumnFamilies()) {
7246      EncryptionTest.testEncryption(conf, fam.getEncryptionType(), fam.getEncryptionKey());
7247    }
7248  }
7249
7250  private void checkClassLoading() throws IOException {
7251    RegionSplitPolicy.getSplitPolicyClass(this.htableDescriptor, conf);
7252    RegionCoprocessorHost.testTableCoprocessorAttrs(conf, this.htableDescriptor);
7253  }
7254
7255  /**
7256   * Computes the Path of the HRegion
7257   *
7258   * @param tabledir qualified path for table
7259   * @param name ENCODED region name
7260   * @return Path of HRegion directory
7261   * @deprecated For tests only; to be removed.
7262   */
7263  @Deprecated
7264  public static Path getRegionDir(final Path tabledir, final String name) {
7265    return new Path(tabledir, name);
7266  }
7267
7268  /**
7269   * Determines if the specified row is within the row range specified by the
7270   * specified RegionInfo
7271   *
7272   * @param info RegionInfo that specifies the row range
7273   * @param row row to be checked
7274   * @return true if the row is within the range specified by the RegionInfo
7275   */
7276  public static boolean rowIsInRange(RegionInfo info, final byte [] row) {
7277    return ((info.getStartKey().length == 0) ||
7278        (Bytes.compareTo(info.getStartKey(), row) <= 0)) &&
7279        ((info.getEndKey().length == 0) ||
7280            (Bytes.compareTo(info.getEndKey(), row) > 0));
7281  }
7282
7283  public static boolean rowIsInRange(RegionInfo info, final byte [] row, final int offset,
7284      final short length) {
7285    return ((info.getStartKey().length == 0) ||
7286        (Bytes.compareTo(info.getStartKey(), 0, info.getStartKey().length,
7287          row, offset, length) <= 0)) &&
7288        ((info.getEndKey().length == 0) ||
7289          (Bytes.compareTo(info.getEndKey(), 0, info.getEndKey().length, row, offset, length) > 0));
7290  }
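
      // Range semantics: the start key is inclusive, the end key exclusive, and an empty key means
      // the range is unbounded on that side. For example, for a RegionInfo "info" covering ["b", "d"):
      //
      //   HRegion.rowIsInRange(info, Bytes.toBytes("b"));  // true  (start key is in range)
      //   HRegion.rowIsInRange(info, Bytes.toBytes("c"));  // true
      //   HRegion.rowIsInRange(info, Bytes.toBytes("d"));  // false (end key is exclusive)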
7291
7292  @Override
7293  public Result get(final Get get) throws IOException {
7294    prepareGet(get);
7295    List<Cell> results = get(get, true);
7296    boolean stale = this.getRegionInfo().getReplicaId() != 0;
7297    return Result.create(results, get.isCheckExistenceOnly() ? !results.isEmpty() : null, stale);
7298  }
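
      // A server-side Get sketch against an opened region (row, family and qualifier bytes are
      // illustrative):
      //
      //   Get get = new Get(Bytes.toBytes("row1"));
      //   get.addColumn(Bytes.toBytes("f"), Bytes.toBytes("q"));
      //   Result result = region.get(get);
      //   byte[] value = result.getValue(Bytes.toBytes("f"), Bytes.toBytes("q"));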
7299
7300  void prepareGet(final Get get) throws IOException {
7301    checkRow(get.getRow(), "Get");
7302    // Verify families are all valid
7303    if (get.hasFamilies()) {
7304      for (byte[] family : get.familySet()) {
7305        checkFamily(family);
7306      }
7307    } else { // Adding all families to scanner
7308      for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) {
7309        get.addFamily(family);
7310      }
7311    }
7312  }
7313
7314  @Override
7315  public List<Cell> get(Get get, boolean withCoprocessor) throws IOException {
7316    return get(get, withCoprocessor, HConstants.NO_NONCE, HConstants.NO_NONCE);
7317  }
7318
7319  public List<Cell> get(Get get, boolean withCoprocessor, long nonceGroup, long nonce)
7320      throws IOException {
7321    List<Cell> results = new ArrayList<>();
7322    long before =  EnvironmentEdgeManager.currentTime();
7323
7324    // pre-get CP hook
7325    if (withCoprocessor && (coprocessorHost != null)) {
7326      if (coprocessorHost.preGet(get, results)) {
7327        metricsUpdateForGet(results, before);
7328        return results;
7329      }
7330    }
7331    Scan scan = new Scan(get);
7332    if (scan.getLoadColumnFamiliesOnDemandValue() == null) {
7333      scan.setLoadColumnFamiliesOnDemand(isLoadingCfsOnDemandDefault());
7334    }
7335    RegionScanner scanner = null;
7336    try {
7337      scanner = getScanner(scan, null, nonceGroup, nonce);
7338      scanner.next(results);
7339    } finally {
7340      if (scanner != null)
7341        scanner.close();
7342    }
7343
7344    // post-get CP hook
7345    if (withCoprocessor && (coprocessorHost != null)) {
7346      coprocessorHost.postGet(get, results);
7347    }
7348
7349    metricsUpdateForGet(results, before);
7350
7351    return results;
7352  }
7353
7354  void metricsUpdateForGet(List<Cell> results, long before) {
7355    if (this.metricsRegion != null) {
7356      this.metricsRegion.updateGet(EnvironmentEdgeManager.currentTime() - before);
7357    }
7358  }
7359
7360  @Override
7361  public void mutateRow(RowMutations rm) throws IOException {
7362    // Don't need nonces here - RowMutations only supports puts and deletes
7363    final List<Mutation> m = rm.getMutations();
7364    batchMutate(m.toArray(new Mutation[m.size()]), true, HConstants.NO_NONCE,
7365        HConstants.NO_NONCE);
7366  }
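
      // A RowMutations sketch: all mutations target the same row and are applied atomically
      // (row, family and qualifier names are illustrative):
      //
      //   byte[] row = Bytes.toBytes("row1");
      //   RowMutations rm = new RowMutations(row);
      //   rm.add(new Put(row).addColumn(Bytes.toBytes("f"), Bytes.toBytes("a"), Bytes.toBytes("v")));
      //   rm.add(new Delete(row).addColumns(Bytes.toBytes("f"), Bytes.toBytes("b")));
      //   region.mutateRow(rm);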
7367
7368  /**
7369   * Perform atomic (all or none) mutations within the region.
7370   * @param mutations The list of mutations to perform.
7371   * <code>mutations</code> can contain operations for multiple rows.
7372   * Caller has to ensure that all rows are contained in this region.
7373   * @param rowsToLock Rows to lock. If multiple rows are locked, care should be taken
7374   * that <code>rowsToLock</code> is sorted in order to avoid deadlocks.
7375   * @param nonceGroup Optional nonce group of the operation (client Id)
7376   * @param nonce Optional nonce of the operation (unique random id to ensure
7377   * "more idempotence")
7378   * @throws IOException
7379   */
7380  @Override
7381  public void mutateRowsWithLocks(Collection<Mutation> mutations,
7382      Collection<byte[]> rowsToLock, long nonceGroup, long nonce) throws IOException {
7383    batchMutate(new MutationBatchOperation(this, mutations.toArray(new Mutation[mutations.size()]),
7384        true, nonceGroup, nonce) {
7385      @Override
7386      public MiniBatchOperationInProgress<Mutation> lockRowsAndBuildMiniBatch(
7387          List<RowLock> acquiredRowLocks) throws IOException {
7388        RowLock prevRowLock = null;
7389        for (byte[] row : rowsToLock) {
7390          try {
7391            RowLock rowLock = region.getRowLockInternal(row, false, prevRowLock); // write lock
7392            if (rowLock != prevRowLock) {
7393              acquiredRowLocks.add(rowLock);
7394              prevRowLock = rowLock;
7395            }
7396          } catch (IOException ioe) {
7397            LOG.warn("Failed getting lock, row=" + Bytes.toStringBinary(row), ioe);
7398            throw ioe;
7399          }
7400        }
7401        return createMiniBatch(size(), size());
7402      }
7403    });
7404  }
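
      // A sketch of an atomic multi-row mutation within one region (put1/put2 are hypothetical
      // mutations whose rows are both hosted by this region; keep rowsToLock sorted to avoid
      // deadlocks, as noted in the javadoc above):
      //
      //   List<Mutation> mutations = Arrays.asList(put1, put2);
      //   List<byte[]> rowsToLock = Arrays.asList(put1.getRow(), put2.getRow());
      //   region.mutateRowsWithLocks(mutations, rowsToLock, HConstants.NO_NONCE, HConstants.NO_NONCE);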
7405
7406  /**
7407   * @return statistics about the current load of the region
7408   */
7409  public ClientProtos.RegionLoadStats getLoadStatistics() {
7410    if (!regionStatsEnabled) {
7411      return null;
7412    }
7413    ClientProtos.RegionLoadStats.Builder stats = ClientProtos.RegionLoadStats.newBuilder();
7414    stats.setMemStoreLoad((int) (Math.min(100,
7415        (this.memStoreSizing.getMemStoreSize().getHeapSize() * 100) / this.memstoreFlushSize)));
7416    if (rsServices.getHeapMemoryManager() != null) {
7417      // the HeapMemoryManager uses -0.0 to signal a problem asking the JVM,
7418      // so we could just do the calculation below and we'd get a 0.
7419      // Instead, treat it as a special case analogous to no HMM so that it can be
7420      // programmatically treated differently from using <1% of heap.
7421      final float occupancy = rsServices.getHeapMemoryManager().getHeapOccupancyPercent();
7422      if (occupancy != HeapMemoryManager.HEAP_OCCUPANCY_ERROR_VALUE) {
7423        stats.setHeapOccupancy((int)(occupancy * 100));
7424      }
7425    }
7426    stats.setCompactionPressure((int) (rsServices.getCompactionPressure() * 100 > 100 ? 100
7427        : rsServices.getCompactionPressure() * 100));
7428    return stats.build();
7429  }
7430
7431  @Override
7432  public void processRowsWithLocks(RowProcessor<?,?> processor) throws IOException {
7433    processRowsWithLocks(processor, rowProcessorTimeout, HConstants.NO_NONCE, HConstants.NO_NONCE);
7434  }
7435
7436  @Override
7437  public void processRowsWithLocks(RowProcessor<?,?> processor, long nonceGroup, long nonce)
7438      throws IOException {
7439    processRowsWithLocks(processor, rowProcessorTimeout, nonceGroup, nonce);
7440  }
7441
7442  @Override
7443  public void processRowsWithLocks(RowProcessor<?,?> processor, long timeout,
7444      long nonceGroup, long nonce) throws IOException {
7445    for (byte[] row : processor.getRowsToLock()) {
7446      checkRow(row, "processRowsWithLocks");
7447    }
7448    if (!processor.readOnly()) {
7449      checkReadOnly();
7450    }
7451    checkResources();
7452    startRegionOperation();
7453    WALEdit walEdit = new WALEdit();
7454
7455    // STEP 1. Run pre-process hook
7456    preProcess(processor, walEdit);
7457    // Short circuit the read only case
7458    if (processor.readOnly()) {
7459      try {
7460        long now = EnvironmentEdgeManager.currentTime();
7461        doProcessRowWithTimeout(processor, now, this, null, null, timeout);
7462        processor.postProcess(this, walEdit, true);
7463      } finally {
7464        closeRegionOperation();
7465      }
7466      return;
7467    }
7468
7469    boolean locked = false;
7470    List<RowLock> acquiredRowLocks = null;
7471    List<Mutation> mutations = new ArrayList<>();
7472    Collection<byte[]> rowsToLock = processor.getRowsToLock();
7473    // This is assigned by mvcc either explicitly in the below or in the guts of the WAL append
7474    // when it assigns the edit a sequence id (a.k.a. the mvcc write number).
7475    WriteEntry writeEntry = null;
7476    MemStoreSizing memstoreAccounting = new NonThreadSafeMemStoreSizing();
7477    try {
7478      boolean success = false;
7479      try {
7480        // STEP 2. Acquire the row lock(s)
7481        acquiredRowLocks = new ArrayList<>(rowsToLock.size());
7482        RowLock prevRowLock = null;
7483        for (byte[] row : rowsToLock) {
7484          // Attempt to lock all involved rows, throw if any lock times out
7485          // use a writer lock for mixed reads and writes
7486          RowLock rowLock = getRowLockInternal(row, false, prevRowLock);
7487          if (rowLock != prevRowLock) {
7488            acquiredRowLocks.add(rowLock);
7489            prevRowLock = rowLock;
7490          }
7491        }
7492        // STEP 3. Region lock
7493        lock(this.updatesLock.readLock(), acquiredRowLocks.isEmpty() ? 1 : acquiredRowLocks.size());
7494        locked = true;
7495        long now = EnvironmentEdgeManager.currentTime();
7496        // STEP 4. Let the processor scan the rows, generate mutations and add waledits
7497        doProcessRowWithTimeout(processor, now, this, mutations, walEdit, timeout);
7498        if (!mutations.isEmpty()) {
7499          writeRequestsCount.add(mutations.size());
7500          // STEP 5. Call the preBatchMutate hook
7501          processor.preBatchMutate(this, walEdit);
7502
7503          // STEP 6. Append and sync if walEdit has data to write out.
7504          if (!walEdit.isEmpty()) {
7505            writeEntry = doWALAppend(walEdit, getEffectiveDurability(processor.useDurability()),
7506                processor.getClusterIds(), now, nonceGroup, nonce);
7507          } else {
7508            // We are here if WAL is being skipped.
7509            writeEntry = this.mvcc.begin();
7510          }
7511
7512          // STEP 7. Apply to memstore
7513          long sequenceId = writeEntry.getWriteNumber();
7514          for (Mutation m : mutations) {
7515            // Handle any tag based cell features.
7516            // TODO: Do we need to call rewriteCellTags down in applyToMemStore()? Why not before
7517            // so tags go into WAL?
7518            rewriteCellTags(m.getFamilyCellMap(), m);
7519            for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) {
7520              Cell cell = cellScanner.current();
7521              if (walEdit.isEmpty()) {
7522                // If walEdit is empty, we put nothing in WAL. WAL stamps Cells with sequence id.
7523                // If no WAL, need to stamp it here.
7524                PrivateCellUtil.setSequenceId(cell, sequenceId);
7525              }
7526              applyToMemStore(getStore(cell), cell, memstoreAccounting);
7527            }
7528          }
7529
7530          // STEP 8. call postBatchMutate hook
7531          processor.postBatchMutate(this);
7532
7533          // STEP 9. Complete mvcc.
7534          mvcc.completeAndWait(writeEntry);
7535          writeEntry = null;
7536
7537          // STEP 10. Release region lock
7538          if (locked) {
7539            this.updatesLock.readLock().unlock();
7540            locked = false;
7541          }
7542
7543          // STEP 11. Release row lock(s)
7544          releaseRowLocks(acquiredRowLocks);
7545        }
7546        success = true;
7547      } finally {
7548        // Call complete rather than completeAndWait because we probably had an error if writeEntry != null
7549        if (writeEntry != null) mvcc.complete(writeEntry);
7550        if (locked) {
7551          this.updatesLock.readLock().unlock();
7552        }
7553        // release locks if some were acquired but another timed out
7554        releaseRowLocks(acquiredRowLocks);
7555      }
7556
7557      // 12. Run post-process hook
7558      processor.postProcess(this, walEdit, success);
7559    } finally {
7560      closeRegionOperation();
7561      if (!mutations.isEmpty()) {
7562        this.incMemStoreSize(memstoreAccounting.getMemStoreSize());
7563        requestFlushIfNeeded();
7564      }
7565    }
7566  }
7567
7568  private void preProcess(final RowProcessor<?,?> processor, final WALEdit walEdit)
7569  throws IOException {
7570    try {
7571      processor.preProcess(this, walEdit);
7572    } catch (IOException e) {
7573      closeRegionOperation();
7574      throw e;
7575    }
7576  }
7577
7578  private void doProcessRowWithTimeout(final RowProcessor<?,?> processor,
7579                                       final long now,
7580                                       final HRegion region,
7581                                       final List<Mutation> mutations,
7582                                       final WALEdit walEdit,
7583                                       final long timeout) throws IOException {
7584    // Short circuit the no time bound case.
7585    if (timeout < 0) {
7586      try {
7587        processor.process(now, region, mutations, walEdit);
7588      } catch (IOException e) {
7589        String row = processor.getRowsToLock().isEmpty() ? "" :
7590          " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) + "...";
7591        LOG.warn("RowProcessor:" + processor.getClass().getName() +
7592            " throws Exception" + row, e);
7593        throw e;
7594      }
7595      return;
7596    }
7597
7598    // Case with time bound
7599    FutureTask<Void> task = new FutureTask<>(new Callable<Void>() {
7600        @Override
7601        public Void call() throws IOException {
7602          try {
7603            processor.process(now, region, mutations, walEdit);
7604            return null;
7605          } catch (IOException e) {
7606            String row = processor.getRowsToLock().isEmpty() ? "" :
7607              " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) + "...";
7608            LOG.warn("RowProcessor:" + processor.getClass().getName() +
7609                " throws Exception" + row, e);
7610            throw e;
7611          }
7612        }
7613      });
7614    rowProcessorExecutor.execute(task);
7615    try {
7616      task.get(timeout, TimeUnit.MILLISECONDS);
7617    } catch (TimeoutException te) {
7618      String row = processor.getRowsToLock().isEmpty() ? "" :
7619        " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) + "...";
7620      LOG.error("RowProcessor timeout:" + timeout + " ms" + row);
7621      throw new IOException(te);
7622    } catch (Exception e) {
7623      throw new IOException(e);
7624    }
7625  }
7626
7627  @Override
7628  public Result append(Append append) throws IOException {
7629    return append(append, HConstants.NO_NONCE, HConstants.NO_NONCE);
7630  }
7631
7632  public Result append(Append mutation, long nonceGroup, long nonce) throws IOException {
7633    return doDelta(Operation.APPEND, mutation, nonceGroup, nonce, mutation.isReturnResults());
7634  }
7635
7636  @Override
7637  public Result increment(Increment increment) throws IOException {
7638    return increment(increment, HConstants.NO_NONCE, HConstants.NO_NONCE);
7639  }
7640
7641  public Result increment(Increment mutation, long nonceGroup, long nonce) throws IOException {
7642    return doDelta(Operation.INCREMENT, mutation, nonceGroup, nonce, mutation.isReturnResults());
7643  }
7644
7645  /**
7646   * Add "deltas" to Cells. Deltas are increments or appends. Switch on <code>op</code>.
7647   *
   * <p>If an increment, add the deltas to the current values; if an append, append the deltas
   * to the current Cell values.
   *
   * <p>Append and Increment code paths are mostly the same. They differ in just a few places.
   * This method implements the shared code path and then, in key spots, switches on the passed-in
   * <code>op</code> to take the increment- or append-specific path.
7654   */
7655  private Result doDelta(Operation op, Mutation mutation, long nonceGroup, long nonce,
7656      boolean returnResults) throws IOException {
7657    checkReadOnly();
7658    checkResources();
7659    checkRow(mutation.getRow(), op.toString());
7660    checkFamilies(mutation.getFamilyCellMap().keySet());
7661    this.writeRequestsCount.increment();
7662    WriteEntry writeEntry = null;
7663    startRegionOperation(op);
7664    List<Cell> results = returnResults? new ArrayList<>(mutation.size()): null;
7665    RowLock rowLock = null;
7666    MemStoreSizing memstoreAccounting = new NonThreadSafeMemStoreSizing();
7667    try {
7668      rowLock = getRowLockInternal(mutation.getRow(), false, null);
7669      lock(this.updatesLock.readLock());
7670      try {
7671        Result cpResult = doCoprocessorPreCall(op, mutation);
7672        if (cpResult != null) {
7673          // Metrics updated below in the finally block.
7674          return returnResults? cpResult: null;
7675        }
7676        Durability effectiveDurability = getEffectiveDurability(mutation.getDurability());
7677        Map<HStore, List<Cell>> forMemStore = new HashMap<>(mutation.getFamilyCellMap().size());
        // Reckon the Cells to apply to the WAL (in the returned walEdit), what to add to the
        // memstore, and what to return to the client (in 'forMemStore' and 'results' respectively).
7680        WALEdit walEdit = reckonDeltas(op, mutation, effectiveDurability, forMemStore, results);
        // Actually write to the WAL now if there is a walEdit to apply.
7682        if (walEdit != null && !walEdit.isEmpty()) {
7683          writeEntry = doWALAppend(walEdit, effectiveDurability, nonceGroup, nonce);
7684        } else {
          // If the walEdit is empty, it means we skipped the WAL; update the LongAdders and
          // start an mvcc transaction.
7687          recordMutationWithoutWal(mutation.getFamilyCellMap());
7688          writeEntry = mvcc.begin();
7689          updateSequenceId(forMemStore.values(), writeEntry.getWriteNumber());
7690        }
7691        // Now write to MemStore. Do it a column family at a time.
7692        for (Map.Entry<HStore, List<Cell>> e : forMemStore.entrySet()) {
7693          applyToMemStore(e.getKey(), e.getValue(), true, memstoreAccounting);
7694        }
7695        mvcc.completeAndWait(writeEntry);
7696        if (rsServices != null && rsServices.getNonceManager() != null) {
7697          rsServices.getNonceManager().addMvccToOperationContext(nonceGroup, nonce,
7698            writeEntry.getWriteNumber());
7699        }
7700        writeEntry = null;
7701      } finally {
7702        this.updatesLock.readLock().unlock();
7703      }
7704      // If results is null, then client asked that we not return the calculated results.
7705      return results != null && returnResults? Result.create(results): Result.EMPTY_RESULT;
7706    } finally {
      // Call complete always, even on success. doDelta is doing a READ_UNCOMMITTED Get when it goes
      // to read the current value under an exclusive lock, so there is no need to wait before
      // returning to the client. This means the only way to read-your-own-increment or append is to
      // come in with a 0 increment.
7711      if (writeEntry != null) mvcc.complete(writeEntry);
7712      if (rowLock != null) {
7713        rowLock.release();
7714      }
7715      // Request a cache flush if over the limit.  Do it outside update lock.
7716      incMemStoreSize(memstoreAccounting.getMemStoreSize());
7717      requestFlushIfNeeded();
7718      closeRegionOperation(op);
7719      if (this.metricsRegion != null) {
7720        switch (op) {
7721          case INCREMENT:
7722            this.metricsRegion.updateIncrement();
7723            break;
7724          case APPEND:
7725            this.metricsRegion.updateAppend();
7726            break;
7727          default:
7728            break;
7729        }
7730      }
7731    }
7732  }
7733
7734  private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, long nonceGroup,
7735      long nonce)
7736  throws IOException {
7737    return doWALAppend(walEdit, durability, WALKey.EMPTY_UUIDS, System.currentTimeMillis(),
7738      nonceGroup, nonce);
7739  }
7740
7741  private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, List<UUID> clusterIds,
7742      long now, long nonceGroup, long nonce) throws IOException {
7743    return doWALAppend(walEdit, durability, clusterIds, now, nonceGroup, nonce,
7744        SequenceId.NO_SEQUENCE_ID);
7745  }
7746
7747  /**
7748   * @return writeEntry associated with this append
7749   */
7750  private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, List<UUID> clusterIds,
7751      long now, long nonceGroup, long nonce, long origLogSeqNum) throws IOException {
7752    Preconditions.checkArgument(walEdit != null && !walEdit.isEmpty(),
7753        "WALEdit is null or empty!");
7754    Preconditions.checkArgument(!walEdit.isReplay() || origLogSeqNum != SequenceId.NO_SEQUENCE_ID,
7755        "Invalid replay sequence Id for replay WALEdit!");
    // Using the default cluster id, as this can only happen in the originating cluster.
    // A slave cluster receives the final value (not the delta) as a Put.
7759    WALKeyImpl walKey = walEdit.isReplay()?
7760        new WALKeyImpl(this.getRegionInfo().getEncodedNameAsBytes(),
7761          this.htableDescriptor.getTableName(), SequenceId.NO_SEQUENCE_ID, now, clusterIds,
7762            nonceGroup, nonce, mvcc) :
7763        new WALKeyImpl(this.getRegionInfo().getEncodedNameAsBytes(),
7764            this.htableDescriptor.getTableName(), SequenceId.NO_SEQUENCE_ID, now, clusterIds,
7765            nonceGroup, nonce, mvcc, this.getReplicationScope());
7766    if (walEdit.isReplay()) {
7767      walKey.setOrigLogSeqNum(origLogSeqNum);
7768    }
7769    WriteEntry writeEntry = null;
7770    try {
7771      long txid = this.wal.append(this.getRegionInfo(), walKey, walEdit, true);
7772      // Call sync on our edit.
7773      if (txid != 0) {
7774        sync(txid, durability);
7775      }
7776      writeEntry = walKey.getWriteEntry();
7777    } catch (IOException ioe) {
7778      if (walKey != null && walKey.getWriteEntry() != null) {
7779        mvcc.complete(walKey.getWriteEntry());
7780      }
7781      throw ioe;
7782    }
7783    return writeEntry;
7784  }
7785
7786  /**
7787   * Do coprocessor pre-increment or pre-append call.
   * @return Result returned by the coprocessor, in which case we bypass all further processing
   *  and return the proffered Result instead, or null, which means proceed.
7790   */
7791  private Result doCoprocessorPreCall(final Operation op, final Mutation mutation)
7792  throws IOException {
7793    Result result = null;
7794    if (this.coprocessorHost != null) {
7795      switch(op) {
7796        case INCREMENT:
7797          result = this.coprocessorHost.preIncrementAfterRowLock((Increment)mutation);
7798          break;
7799        case APPEND:
7800          result = this.coprocessorHost.preAppendAfterRowLock((Append)mutation);
7801          break;
7802        default: throw new UnsupportedOperationException(op.toString());
7803      }
7804    }
7805    return result;
7806  }
7807
7808  /**
   * Reckon the Cells to apply to the WAL, the memstore, and to return to the Client; these sets
   * are not always the same, depending on whether we are writing to the WAL.
7811   *
7812   * @param results Fill in here what goes back to the Client if it is non-null (if null, client
7813   *  doesn't want results).
7814   * @param forMemStore Fill in here what to apply to the MemStore (by Store).
7815   * @return A WALEdit to apply to WAL or null if we are to skip the WAL.
7816   */
7817  private WALEdit reckonDeltas(Operation op, Mutation mutation, Durability effectiveDurability,
7818      Map<HStore, List<Cell>> forMemStore, List<Cell> results) throws IOException {
7819    WALEdit walEdit = null;
7820    long now = EnvironmentEdgeManager.currentTime();
7821    final boolean writeToWAL = effectiveDurability != Durability.SKIP_WAL;
7822    // Process a Store/family at a time.
7823    for (Map.Entry<byte [], List<Cell>> entry: mutation.getFamilyCellMap().entrySet()) {
7824      final byte[] columnFamilyName = entry.getKey();
7825      List<Cell> deltas = entry.getValue();
7826      HStore store = this.stores.get(columnFamilyName);
7827      // Reckon for the Store what to apply to WAL and MemStore.
7828      List<Cell> toApply =
7829        reckonDeltasByStore(store, op, mutation, effectiveDurability, now, deltas, results);
7830      if (!toApply.isEmpty()) {
7831        forMemStore.put(store, toApply);
7832        if (writeToWAL) {
7833          if (walEdit == null) {
7834            walEdit = new WALEdit();
7835          }
7836          walEdit.getCells().addAll(toApply);
7837        }
7838      }
7839    }
7840    return walEdit;
7841  }
7842
7843  /**
   * Reckon the Cells to apply to the WAL, the memstore, and to return to the Client for the
   * passed column family/Store.
   *
   * Does a Get of the current values and then applies the passed-in deltas for this Store,
   * returning the result.
7848   *
7849   * @param op Whether Increment or Append
7850   * @param mutation The encompassing Mutation object
7851   * @param deltas Changes to apply to this Store; either increment amount or data to append
7852   * @param results In here we accumulate all the Cells we are to return to the client. If null,
7853   *                client doesn't want results returned.
7854   * @return Resulting Cells after <code>deltas</code> have been applied to current
7855   *  values. Side effect is our filling out of the <code>results</code> List.
7856   */
7857  private List<Cell> reckonDeltasByStore(HStore store, Operation op, Mutation mutation,
7858      Durability effectiveDurability, long now, List<Cell> deltas, List<Cell> results)
7859      throws IOException {
7860    byte[] columnFamily = store.getColumnFamilyDescriptor().getName();
7861    List<Cell> toApply = new ArrayList<>(deltas.size());
7862    // Get previous values for all columns in this family.
7863    TimeRange tr = null;
7864    switch (op) {
7865      case INCREMENT:
7866        tr = ((Increment)mutation).getTimeRange();
7867        break;
7868      case APPEND:
7869        tr = ((Append)mutation).getTimeRange();
7870        break;
7871      default:
7872        break;
7873    }
    List<Cell> currentValues = get(mutation, store, deltas, null, tr);
    // Iterate the input columns and update existing values if they were found; otherwise
    // add a new column initialized to the delta amount.
7877    int currentValuesIndex = 0;
7878    for (int i = 0; i < deltas.size(); i++) {
7879      Cell delta = deltas.get(i);
7880      Cell currentValue = null;
7881      if (currentValuesIndex < currentValues.size() &&
7882          CellUtil.matchingQualifier(currentValues.get(currentValuesIndex), delta)) {
7883        currentValue = currentValues.get(currentValuesIndex);
7884        if (i < (deltas.size() - 1) && !CellUtil.matchingQualifier(delta, deltas.get(i + 1))) {
7885          currentValuesIndex++;
7886        }
7887      }
      // Switch on whether this is an increment or an append, building the new Cell to apply.
7889      Cell newCell = null;
7890      MutationType mutationType = null;
7891      switch (op) {
7892        case INCREMENT:
7893          mutationType = MutationType.INCREMENT;
7894          long deltaAmount = getLongValue(delta);
          final long newValue = currentValue == null ?
            deltaAmount : getLongValue(currentValue) + deltaAmount;
          newCell = reckonDelta(delta, currentValue, columnFamily, now, mutation,
            (oldCell) -> Bytes.toBytes(newValue));
7897          break;
7898        case APPEND:
7899          mutationType = MutationType.APPEND;
7900          newCell = reckonDelta(delta, currentValue, columnFamily, now, mutation, (oldCell) ->
7901            ByteBuffer.wrap(new byte[delta.getValueLength() + oldCell.getValueLength()])
7902                    .put(oldCell.getValueArray(), oldCell.getValueOffset(), oldCell.getValueLength())
7903                    .put(delta.getValueArray(), delta.getValueOffset(), delta.getValueLength())
7904                    .array()
7905          );
7906          break;
7907        default: throw new UnsupportedOperationException(op.toString());
7908      }
7909
7910      // Give coprocessors a chance to update the new cell
7911      if (coprocessorHost != null) {
7912        newCell =
7913            coprocessorHost.postMutationBeforeWAL(mutationType, mutation, currentValue, newCell);
7914      }
7915      toApply.add(newCell);
      // Add to results to be returned to the Client. If null, the client does not want results.
7917      if (results != null) {
7918        results.add(newCell);
7919      }
7920    }
7921    return toApply;
7922  }
7923
7924  private static Cell reckonDelta(final Cell delta, final Cell currentCell,
7925                                  final byte[] columnFamily, final long now,
7926                                  Mutation mutation, Function<Cell, byte[]> supplier) throws IOException {
7927    // Forward any tags found on the delta.
7928    List<Tag> tags = TagUtil.carryForwardTags(delta);
7929    tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL());
7930    if (currentCell != null) {
7931      tags = TagUtil.carryForwardTags(tags, currentCell);
7932      byte[] newValue = supplier.apply(currentCell);
7933      return ExtendedCellBuilderFactory.create(CellBuilderType.SHALLOW_COPY)
7934              .setRow(mutation.getRow(), 0, mutation.getRow().length)
7935              .setFamily(columnFamily, 0, columnFamily.length)
7936              // copy the qualifier if the cell is located in shared memory.
7937              .setQualifier(CellUtil.cloneQualifier(delta))
7938              .setTimestamp(Math.max(currentCell.getTimestamp() + 1, now))
7939              .setType(KeyValue.Type.Put.getCode())
7940              .setValue(newValue, 0, newValue.length)
7941              .setTags(TagUtil.fromList(tags))
7942              .build();
7943    } else {
7944      PrivateCellUtil.updateLatestStamp(delta, now);
7945      return CollectionUtils.isEmpty(tags) ? delta : PrivateCellUtil.createCell(delta, tags);
7946    }
7947  }
7948
7949  /**
   * @return The long value held in the passed <code>cell</code>
7951   */
7952  private static long getLongValue(final Cell cell) throws DoNotRetryIOException {
7953    int len = cell.getValueLength();
7954    if (len != Bytes.SIZEOF_LONG) {
7955      // throw DoNotRetryIOException instead of IllegalArgumentException
7956      throw new DoNotRetryIOException("Field is not a long, it's " + len + " bytes wide");
7957    }
7958    return PrivateCellUtil.getValueAsLong(cell);
7959  }
7960
7961  /**
   * Do a specific Get on the passed column family and column qualifiers.
7963   * @param mutation Mutation we are doing this Get for.
7964   * @param store Which column family on row (TODO: Go all Gets in one go)
7965   * @param coordinates Cells from <code>mutation</code> used as coordinates applied to Get.
7966   * @return Return list of Cells found.
7967   */
7968  private List<Cell> get(Mutation mutation, HStore store, List<Cell> coordinates,
7969      IsolationLevel isolation, TimeRange tr) throws IOException {
7970    // Sort the cells so that they match the order that they appear in the Get results. Otherwise,
7971    // we won't be able to find the existing values if the cells are not specified in order by the
7972    // client since cells are in an array list.
7973    // TODO: I don't get why we are sorting. St.Ack 20150107
7974    sort(coordinates, store.getComparator());
7975    Get get = new Get(mutation.getRow());
7976    if (isolation != null) {
7977      get.setIsolationLevel(isolation);
7978    }
7979    for (Cell cell: coordinates) {
7980      get.addColumn(store.getColumnFamilyDescriptor().getName(), CellUtil.cloneQualifier(cell));
7981    }
7982    // Increments carry time range. If an Increment instance, put it on the Get.
7983    if (tr != null) {
7984      get.setTimeRange(tr.getMin(), tr.getMax());
7985    }
7986    return get(get, false);
7987  }
7988
7989  /**
7990   * @return Sorted list of <code>cells</code> using <code>comparator</code>
7991   */
7992  private static List<Cell> sort(List<Cell> cells, final CellComparator comparator) {
7993    cells.sort(comparator);
7994    return cells;
7995  }
7996
7997  //
7998  // New HBASE-880 Helpers
7999  //
8000
8001  void checkFamily(final byte [] family)
8002  throws NoSuchColumnFamilyException {
8003    if (!this.htableDescriptor.hasColumnFamily(family)) {
8004      throw new NoSuchColumnFamilyException("Column family " +
8005          Bytes.toString(family) + " does not exist in region " + this
8006          + " in table " + this.htableDescriptor);
8007    }
8008  }
8009
8010  public static final long FIXED_OVERHEAD = ClassSize.align(
8011      ClassSize.OBJECT +
8012      ClassSize.ARRAY +
8013      53 * ClassSize.REFERENCE + 3 * Bytes.SIZEOF_INT +
8014      (14 * Bytes.SIZEOF_LONG) +
8015      3 * Bytes.SIZEOF_BOOLEAN);
8016
8017  // woefully out of date - currently missing:
8018  // 1 x HashMap - coprocessorServiceHandlers
8019  // 6 x LongAdder - numMutationsWithoutWAL, dataInMemoryWithoutWAL,
8020  //   checkAndMutateChecksPassed, checkAndMutateChecksFailed, readRequestsCount,
8021  //   writeRequestsCount
8022  // 1 x HRegion$WriteState - writestate
8023  // 1 x RegionCoprocessorHost - coprocessorHost
8024  // 1 x RegionSplitPolicy - splitPolicy
8025  // 1 x MetricsRegion - metricsRegion
8026  // 1 x MetricsRegionWrapperImpl - metricsRegionWrapper
8027  public static final long DEEP_OVERHEAD = FIXED_OVERHEAD +
8028      ClassSize.OBJECT + // closeLock
8029      (2 * ClassSize.ATOMIC_BOOLEAN) + // closed, closing
8030      (3 * ClassSize.ATOMIC_LONG) + // numPutsWithoutWAL, dataInMemoryWithoutWAL,
8031                                    // compactionsFailed
8032      (2 * ClassSize.CONCURRENT_HASHMAP) +  // lockedRows, scannerReadPoints
8033      WriteState.HEAP_SIZE + // writestate
8034      ClassSize.CONCURRENT_SKIPLISTMAP + ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + // stores
8035      (2 * ClassSize.REENTRANT_LOCK) + // lock, updatesLock
8036      MultiVersionConcurrencyControl.FIXED_SIZE // mvcc
8037      + 2 * ClassSize.TREEMAP // maxSeqIdInStores, replicationScopes
8038      + 2 * ClassSize.ATOMIC_INTEGER // majorInProgress, minorInProgress
8039      + ClassSize.STORE_SERVICES // store services
8040      ;
8041
8042  @Override
8043  public long heapSize() {
8044    // this does not take into account row locks, recent flushes, mvcc entries, and more
8045    return DEEP_OVERHEAD + stores.values().stream().mapToLong(HStore::heapSize).sum();
8046  }
8047
8048  /**
8049   * Registers a new protocol buffer {@link Service} subclass as a coprocessor endpoint to
8050   * be available for handling Region#execService(com.google.protobuf.RpcController,
8051   *    org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall) calls.
8052   *
8053   * <p>
8054   * Only a single instance may be registered per region for a given {@link Service} subclass (the
   * instances are keyed on {@link com.google.protobuf.Descriptors.ServiceDescriptor#getFullName()}).
8056   * After the first registration, subsequent calls with the same service name will fail with
8057   * a return value of {@code false}.
8058   * </p>
8059   * @param instance the {@code Service} subclass instance to expose as a coprocessor endpoint
8060   * @return {@code true} if the registration was successful, {@code false}
8061   * otherwise
8062   */
8063  public boolean registerService(com.google.protobuf.Service instance) {
8064    /*
8065     * No stacking of instances is allowed for a single service name
8066     */
8067    com.google.protobuf.Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType();
8068    String serviceName = CoprocessorRpcUtils.getServiceName(serviceDesc);
8069    if (coprocessorServiceHandlers.containsKey(serviceName)) {
8070      LOG.error("Coprocessor service " + serviceName +
8071          " already registered, rejecting request from " + instance);
8072      return false;
8073    }
8074
8075    coprocessorServiceHandlers.put(serviceName, instance);
8076    if (LOG.isDebugEnabled()) {
8077      LOG.debug("Registered coprocessor service: region=" +
8078          Bytes.toStringBinary(getRegionInfo().getRegionName()) +
8079          " service=" + serviceName);
8080    }
8081    return true;
8082  }
8083
8084  /**
8085   * Executes a single protocol buffer coprocessor endpoint {@link Service} method using
8086   * the registered protocol handlers.  {@link Service} implementations must be registered via the
8087   * {@link #registerService(com.google.protobuf.Service)}
8088   * method before they are available.
8089   *
   * @param controller an {@code RpcController} implementation to pass to the invoked service
8091   * @param call a {@code CoprocessorServiceCall} instance identifying the service, method,
8092   *     and parameters for the method invocation
8093   * @return a protocol buffer {@code Message} instance containing the method's result
8094   * @throws IOException if no registered service handler is found or an error
8095   *     occurs during the invocation
8096   * @see #registerService(com.google.protobuf.Service)
8097   */
8098  public com.google.protobuf.Message execService(com.google.protobuf.RpcController controller,
8099      CoprocessorServiceCall call) throws IOException {
8100    String serviceName = call.getServiceName();
8101    com.google.protobuf.Service service = coprocessorServiceHandlers.get(serviceName);
8102    if (service == null) {
8103      throw new UnknownProtocolException(null, "No registered coprocessor service found for " +
8104          serviceName + " in region " + Bytes.toStringBinary(getRegionInfo().getRegionName()));
8105    }
8106    com.google.protobuf.Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType();
8107
8108    String methodName = call.getMethodName();
8109    com.google.protobuf.Descriptors.MethodDescriptor methodDesc =
8110        CoprocessorRpcUtils.getMethodDescriptor(methodName, serviceDesc);
8111
8112    com.google.protobuf.Message.Builder builder =
8113        service.getRequestPrototype(methodDesc).newBuilderForType();
8114
8115    org.apache.hadoop.hbase.protobuf.ProtobufUtil.mergeFrom(builder,
8116        call.getRequest().toByteArray());
8117    com.google.protobuf.Message request =
8118        CoprocessorRpcUtils.getRequest(service, methodDesc, call.getRequest());
8119
8120    if (coprocessorHost != null) {
8121      request = coprocessorHost.preEndpointInvocation(service, methodName, request);
8122    }
8123
8124    final com.google.protobuf.Message.Builder responseBuilder =
8125        service.getResponsePrototype(methodDesc).newBuilderForType();
8126    service.callMethod(methodDesc, controller, request,
8127        new com.google.protobuf.RpcCallback<com.google.protobuf.Message>() {
8128      @Override
8129      public void run(com.google.protobuf.Message message) {
8130        if (message != null) {
8131          responseBuilder.mergeFrom(message);
8132        }
8133      }
8134    });
8135
8136    if (coprocessorHost != null) {
8137      coprocessorHost.postEndpointInvocation(service, methodName, request, responseBuilder);
8138    }
8139    IOException exception =
8140        org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils.getControllerException(controller);
8141    if (exception != null) {
8142      throw exception;
8143    }
8144
8145    return responseBuilder.build();
8146  }
8147
8148  boolean shouldForceSplit() {
8149    return this.splitRequest;
8150  }
8151
8152  byte[] getExplicitSplitPoint() {
8153    return this.explicitSplitPoint;
8154  }
8155
8156  void forceSplit(byte[] sp) {
    // This HRegion will go away after the forced split is successful,
    // but if a forced split fails, we need to clear the forced-split flag.
8159    this.splitRequest = true;
8160    if (sp != null) {
8161      this.explicitSplitPoint = sp;
8162    }
8163  }
8164
8165  void clearSplit() {
8166    this.splitRequest = false;
8167    this.explicitSplitPoint = null;
8168  }
8169
8170  /**
   * Return the split point. A null return value indicates the region isn't splittable.
   * If the split point isn't explicitly specified, this goes over the stores
   * to find the best split point. Currently the criterion for the best split point
   * is the size of the store.
8175   */
8176  public byte[] checkSplit() {
8177    // Can't split META
8178    if (this.getRegionInfo().isMetaRegion() ||
8179        TableName.NAMESPACE_TABLE_NAME.equals(this.getRegionInfo().getTable())) {
8180      if (shouldForceSplit()) {
8181        LOG.warn("Cannot split meta region in HBase 0.20 and above");
8182      }
8183      return null;
8184    }
8185
8186    // Can't split a region that is closing.
8187    if (this.isClosing()) {
8188      return null;
8189    }
8190
8191    if (!splitPolicy.shouldSplit()) {
8192      return null;
8193    }
8194
8195    byte[] ret = splitPolicy.getSplitPoint();
8196
8197    if (ret != null) {
8198      try {
8199        checkRow(ret, "calculated split");
8200      } catch (IOException e) {
8201        LOG.error("Ignoring invalid split", e);
8202        return null;
8203      }
8204    }
8205    return ret;
8206  }
8207
8208  /**
8209   * @return The priority that this region should have in the compaction queue
8210   */
8211  public int getCompactPriority() {
8212    return stores.values().stream().mapToInt(HStore::getCompactPriority).min()
8213        .orElse(Store.NO_PRIORITY);
8214  }
8215
8216  /** @return the coprocessor host */
8217  public RegionCoprocessorHost getCoprocessorHost() {
8218    return coprocessorHost;
8219  }
8220
8221  /** @param coprocessorHost the new coprocessor host */
8222  @VisibleForTesting
8223  public void setCoprocessorHost(final RegionCoprocessorHost coprocessorHost) {
8224    this.coprocessorHost = coprocessorHost;
8225  }
8226
8227  @Override
8228  public void startRegionOperation() throws IOException {
8229    startRegionOperation(Operation.ANY);
8230  }
8231
8232  @Override
8233  public void startRegionOperation(Operation op) throws IOException {
8234    switch (op) {
8235      case GET:  // read operations
8236      case SCAN:
8237        checkReadsEnabled();
8238        break;
8239      default:
8240        break;
8241    }
8242    if (op == Operation.MERGE_REGION || op == Operation.SPLIT_REGION
8243        || op == Operation.COMPACT_REGION) {
8244      // split, merge or compact region doesn't need to check the closing/closed state or lock the
8245      // region
8246      return;
8247    }
8248    if (this.closing.get()) {
8249      throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closing");
8250    }
8251    lock(lock.readLock());
8252    if (this.closed.get()) {
8253      lock.readLock().unlock();
8254      throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closed");
8255    }
8256    // The unit for snapshot is a region. So, all stores for this region must be
8257    // prepared for snapshot operation before proceeding.
8258    if (op == Operation.SNAPSHOT) {
8259      stores.values().forEach(HStore::preSnapshotOperation);
8260    }
8261    try {
8262      if (coprocessorHost != null) {
8263        coprocessorHost.postStartRegionOperation(op);
8264      }
8265    } catch (Exception e) {
8266      lock.readLock().unlock();
8267      throw new IOException(e);
8268    }
8269  }
8270
8271  @Override
8272  public void closeRegionOperation() throws IOException {
8273    closeRegionOperation(Operation.ANY);
8274  }
8275
8276  @Override
8277  public void closeRegionOperation(Operation operation) throws IOException {
8278    if (operation == Operation.SNAPSHOT) {
8279      stores.values().forEach(HStore::postSnapshotOperation);
8280    }
8281    lock.readLock().unlock();
8282    if (coprocessorHost != null) {
8283      coprocessorHost.postCloseRegionOperation(operation);
8284    }
8285  }
8286
8287  /**
   * This method needs to be called before any public call that reads or
   * modifies stores in bulk. It has to be called just before a try;
   * #closeBulkRegionOperation needs to be called in the try's finally block.
   * Acquires a read or write lock (as requested) and checks if the region is closing or closed.
8292   * @throws NotServingRegionException when the region is closing or closed
8293   * @throws RegionTooBusyException if failed to get the lock in time
8294   * @throws InterruptedIOException if interrupted while waiting for a lock
8295   */
8296  private void startBulkRegionOperation(boolean writeLockNeeded)
8297      throws NotServingRegionException, RegionTooBusyException, InterruptedIOException {
8298    if (this.closing.get()) {
8299      throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closing");
8300    }
8301    if (writeLockNeeded) lock(lock.writeLock());
8302    else lock(lock.readLock());
8303    if (this.closed.get()) {
8304      if (writeLockNeeded) lock.writeLock().unlock();
8305      else lock.readLock().unlock();
8306      throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closed");
8307    }
8308  }
8309
8310  /**
   * Releases the lock. This needs to be called in the finally block corresponding
   * to the try block of #startBulkRegionOperation.
8313   */
8314  private void closeBulkRegionOperation(){
8315    if (lock.writeLock().isHeldByCurrentThread()) lock.writeLock().unlock();
8316    else lock.readLock().unlock();
8317  }
8318
8319  /**
   * Update the LongAdders for the number of mutations without WAL and the size of possible data
   * loss. This information is exposed by the region server metrics.
8322   */
8323  private void recordMutationWithoutWal(final Map<byte [], List<Cell>> familyMap) {
8324    numMutationsWithoutWAL.increment();
8325    if (numMutationsWithoutWAL.sum() <= 1) {
8326      LOG.info("writing data to region " + this +
8327               " with WAL disabled. Data may be lost in the event of a crash.");
8328    }
8329
8330    long mutationSize = 0;
8331    for (List<Cell> cells: familyMap.values()) {
8332      // Optimization: 'foreach' loop is not used. See:
8333      // HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects
8334      assert cells instanceof RandomAccess;
8335      int listSize = cells.size();
8336      for (int i=0; i < listSize; i++) {
8337        Cell cell = cells.get(i);
8338        mutationSize += KeyValueUtil.length(cell);
8339      }
8340    }
8341
8342    dataInMemoryWithoutWAL.add(mutationSize);
8343  }
8344
8345  private void lock(final Lock lock) throws RegionTooBusyException, InterruptedIOException {
8346    lock(lock, 1);
8347  }
8348
8349  /**
   * Try to acquire a lock. Throws RegionTooBusyException
   * if we fail to get the lock in time, and InterruptedIOException
   * if interrupted while waiting for the lock.
8353   */
8354  private void lock(final Lock lock, final int multiplier)
8355      throws RegionTooBusyException, InterruptedIOException {
8356    try {
8357      final long waitTime = Math.min(maxBusyWaitDuration,
8358          busyWaitDuration * Math.min(multiplier, maxBusyWaitMultiplier));
8359      if (!lock.tryLock(waitTime, TimeUnit.MILLISECONDS)) {
8360        // Don't print millis. Message is used as a key over in
8361        // RetriesExhaustedWithDetailsException processing.
8362        throw new RegionTooBusyException("Failed to obtain lock; regionName=" +
8363            (this.getRegionInfo() == null? "unknown":
8364                this.getRegionInfo().getRegionNameAsString()) +
8365            ", server=" + (this.getRegionServerServices() == null? "unknown":
8366                this.getRegionServerServices().getServerName()));
8367      }
8368    } catch (InterruptedException ie) {
8369      LOG.info("Interrupted while waiting for a lock");
8370      InterruptedIOException iie = new InterruptedIOException();
8371      iie.initCause(ie);
8372      throw iie;
8373    }
8374  }
8375
8376  /**
   * Calls sync with the given transaction ID.
   * @param txid the transaction ID up to which the WAL should be synced
   * @throws IOException if anything goes wrong with DFS
8380   */
8381  private void sync(long txid, Durability durability) throws IOException {
8382    if (this.getRegionInfo().isMetaRegion()) {
8383      this.wal.sync(txid);
8384    } else {
8385      switch(durability) {
8386      case USE_DEFAULT:
8387        // do what table defaults to
8388        if (shouldSyncWAL()) {
8389          this.wal.sync(txid);
8390        }
8391        break;
8392      case SKIP_WAL:
        // nothing to do
8394        break;
8395      case ASYNC_WAL:
        // nothing to do
8397        break;
8398      case SYNC_WAL:
8399      case FSYNC_WAL:
8400        // sync the WAL edit (SYNC and FSYNC treated the same for now)
8401        this.wal.sync(txid);
8402        break;
8403      default:
8404        throw new RuntimeException("Unknown durability " + durability);
8405      }
8406    }
8407  }
8408
8409  /**
   * Check whether we should sync the WAL based on the table's durability settings.
8411   */
8412  private boolean shouldSyncWAL() {
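    // Assumes the Durability enum ordering SKIP_WAL < ASYNC_WAL < SYNC_WAL < FSYNC_WAL, so
    // anything stricter than ASYNC_WAL requires an explicit sync.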
    return regionDurability.ordinal() > Durability.ASYNC_WAL.ordinal();
8414  }
8415
8416  /**
8417   * A mocked list implementation - discards all updates.
8418   */
8419  private static final List<Cell> MOCKED_LIST = new AbstractList<Cell>() {
8420
8421    @Override
8422    public void add(int index, Cell element) {
8423      // do nothing
8424    }
8425
8426    @Override
8427    public boolean addAll(int index, Collection<? extends Cell> c) {
8428      return false; // this list is never changed as a result of an update
8429    }
8430
8431    @Override
8432    public KeyValue get(int index) {
8433      throw new UnsupportedOperationException();
8434    }
8435
8436    @Override
8437    public int size() {
8438      return 0;
8439    }
8440  };
8441
8442  /** @return the latest sequence number that was read from storage when this region was opened */
8443  public long getOpenSeqNum() {
8444    return this.openSeqNum;
8445  }
8446
8447  @Override
8448  public Map<byte[], Long> getMaxStoreSeqId() {
8449    return this.maxSeqIdInStores;
8450  }
8451
8452  public long getOldestSeqIdOfStore(byte[] familyName) {
8453    return wal.getEarliestMemStoreSeqNum(getRegionInfo().getEncodedNameAsBytes(), familyName);
8454  }
8455
8456  @Override
8457  public CompactionState getCompactionState() {
8458    boolean hasMajor = majorInProgress.get() > 0, hasMinor = minorInProgress.get() > 0;
8459    return (hasMajor ? (hasMinor ? CompactionState.MAJOR_AND_MINOR : CompactionState.MAJOR)
8460        : (hasMinor ? CompactionState.MINOR : CompactionState.NONE));
8461  }
8462
8463  public void reportCompactionRequestStart(boolean isMajor){
8464    (isMajor ? majorInProgress : minorInProgress).incrementAndGet();
8465  }
8466
8467  public void reportCompactionRequestEnd(boolean isMajor, int numFiles, long filesSizeCompacted) {
8468    int newValue = (isMajor ? majorInProgress : minorInProgress).decrementAndGet();
8469
8470    // metrics
8471    compactionsFinished.increment();
8472    compactionNumFilesCompacted.add(numFiles);
8473    compactionNumBytesCompacted.add(filesSizeCompacted);
8474
8475    assert newValue >= 0;
8476  }
8477
8478  public void reportCompactionRequestFailure() {
8479    compactionsFailed.increment();
8480  }
8481
8482  public void incrementCompactionsQueuedCount() {
8483    compactionsQueued.increment();
8484  }
8485
8486  public void decrementCompactionsQueuedCount() {
8487    compactionsQueued.decrement();
8488  }
8489
8490  public void incrementFlushesQueuedCount() {
8491    flushesQueued.increment();
8492  }
8493
8494  @VisibleForTesting
8495  public long getReadPoint() {
8496    return getReadPoint(IsolationLevel.READ_COMMITTED);
8497  }
8498
8499  /**
8500   * {@inheritDoc}
8501   */
8502  @Override
8503  public void onConfigurationChange(Configuration conf) {
8504    // Do nothing for now.
8505  }
8506
8507  /**
8508   * {@inheritDoc}
8509   */
8510  @Override
8511  public void registerChildren(ConfigurationManager manager) {
8512    configurationManager = Optional.of(manager);
8513    stores.values().forEach(manager::registerObserver);
8514  }
8515
8516  /**
8517   * {@inheritDoc}
8518   */
8519  @Override
8520  public void deregisterChildren(ConfigurationManager manager) {
8521    stores.values().forEach(configurationManager.get()::deregisterObserver);
8522  }
8523
8524  @Override
8525  public CellComparator getCellComparator() {
8526    return this.getRegionInfo().isMetaRegion() ? CellComparatorImpl.META_COMPARATOR
8527        : CellComparatorImpl.COMPARATOR;
8528  }
8529
8530  public long getMemStoreFlushSize() {
8531    return this.memstoreFlushSize;
8532  }
8533
8534
8535  //// method for debugging tests
8536  void throwException(String title, String regionName) {
8537    StringBuilder buf = new StringBuilder();
8538    buf.append(title + ", ");
8539    buf.append(getRegionInfo().toString());
8540    buf.append(getRegionInfo().isMetaRegion() ? " meta region " : " ");
8541    buf.append("stores: ");
8542    for (HStore s : stores.values()) {
8543      buf.append(s.getColumnFamilyDescriptor().getNameAsString());
8544      buf.append(" size: ");
8545      buf.append(s.getMemStoreSize().getDataSize());
8546      buf.append(" ");
8547    }
8548    buf.append("end-of-stores");
8549    buf.append(", memstore size ");
8550    buf.append(getMemStoreDataSize());
8551    if (getRegionInfo().getRegionNameAsString().startsWith(regionName)) {
8552      throw new RuntimeException(buf.toString());
8553    }
8554  }
8555
8556  @Override
8557  public void requestCompaction(String why, int priority, boolean major,
8558      CompactionLifeCycleTracker tracker) throws IOException {
8559    if (major) {
8560      stores.values().forEach(HStore::triggerMajorCompaction);
8561    }
8562    rsServices.getCompactionRequestor().requestCompaction(this, why, priority, tracker,
8563      RpcServer.getRequestUser().orElse(null));
8564  }
8565
8566  @Override
8567  public void requestCompaction(byte[] family, String why, int priority, boolean major,
8568      CompactionLifeCycleTracker tracker) throws IOException {
8569    HStore store = stores.get(family);
8570    if (store == null) {
8571      throw new NoSuchColumnFamilyException("column family " + Bytes.toString(family) +
8572          " does not exist in region " + getRegionInfo().getRegionNameAsString());
8573    }
8574    if (major) {
8575      store.triggerMajorCompaction();
8576    }
8577    rsServices.getCompactionRequestor().requestCompaction(this, store, why, priority, tracker,
8578      RpcServer.getRequestUser().orElse(null));
8579  }
8580
8581  private void requestFlushIfNeeded() throws RegionTooBusyException {
    if (isFlushSize(this.memStoreSizing.getMemStoreSize())) {
8583      requestFlush();
8584    }
8585  }
8586
8587  private void requestFlush() {
8588    if (this.rsServices == null) {
8589      return;
8590    }
8591    requestFlush0(FlushLifeCycleTracker.DUMMY);
8592  }
8593
8594  private void requestFlush0(FlushLifeCycleTracker tracker) {
8595    boolean shouldFlush = false;
8596    synchronized (writestate) {
8597      if (!this.writestate.isFlushRequested()) {
8598        shouldFlush = true;
8599        writestate.flushRequested = true;
8600      }
8601    }
8602    if (shouldFlush) {
8603      // Make request outside of synchronize block; HBASE-818.
8604      this.rsServices.getFlushRequester().requestFlush(this, false, tracker);
8605      if (LOG.isDebugEnabled()) {
8606        LOG.debug("Flush requested on " + this.getRegionInfo().getEncodedName());
8607      }
8608    } else {
8609      tracker.notExecuted("Flush already requested on " + this);
8610    }
8611  }
8612
8613  @Override
8614  public void requestFlush(FlushLifeCycleTracker tracker) throws IOException {
8615    requestFlush0(tracker);
8616  }
8617
8618  /**
8619   * This method modifies the region's configuration in order to inject replication-related
   * features.
   * @param conf the region configuration
8622   */
8623  static void decorateRegionConfiguration(Configuration conf) {
8624    if (ReplicationUtils.isReplicationForBulkLoadDataEnabled(conf)) {
      String plugins = conf.get(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY, "");
8626      String replicationCoprocessorClass = ReplicationObserver.class.getCanonicalName();
8627      if (!plugins.contains(replicationCoprocessorClass)) {
8628        conf.set(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY,
8629            (plugins.equals("") ? "" : (plugins + ",")) + replicationCoprocessorClass);
8630      }
8631    }
8632  }
8633}