1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.regionserver;
19  
20  import static org.apache.hadoop.hbase.HConstants.REPLICATION_SCOPE_LOCAL;
21  
22  import com.google.common.annotations.VisibleForTesting;
23  import com.google.common.base.Optional;
24  import com.google.common.base.Preconditions;
25  import com.google.common.collect.Lists;
26  import com.google.common.collect.Maps;
27  import com.google.common.io.Closeables;
28  import com.google.protobuf.ByteString;
29  import com.google.protobuf.Descriptors;
30  import com.google.protobuf.Message;
31  import com.google.protobuf.RpcCallback;
32  import com.google.protobuf.RpcController;
33  import com.google.protobuf.Service;
34  import com.google.protobuf.TextFormat;
35  import java.io.EOFException;
36  import java.io.FileNotFoundException;
37  import java.io.IOException;
38  import java.io.InterruptedIOException;
39  import java.lang.reflect.Constructor;
40  import java.text.ParseException;
41  import java.util.AbstractList;
42  import java.util.ArrayList;
43  import java.util.Arrays;
44  import java.util.Collection;
45  import java.util.Collections;
46  import java.util.Comparator;
47  import java.util.HashMap;
48  import java.util.HashSet;
49  import java.util.Iterator;
50  import java.util.List;
51  import java.util.Map;
52  import java.util.Map.Entry;
53  import java.util.NavigableMap;
54  import java.util.NavigableSet;
55  import java.util.RandomAccess;
56  import java.util.Set;
57  import java.util.TreeMap;
58  import java.util.UUID;
59  import java.util.concurrent.Callable;
60  import java.util.concurrent.CompletionService;
61  import java.util.concurrent.ConcurrentHashMap;
62  import java.util.concurrent.ConcurrentMap;
63  import java.util.concurrent.ConcurrentSkipListMap;
64  import java.util.concurrent.ExecutionException;
65  import java.util.concurrent.ExecutorCompletionService;
66  import java.util.concurrent.ExecutorService;
67  import java.util.concurrent.Executors;
68  import java.util.concurrent.Future;
69  import java.util.concurrent.FutureTask;
70  import java.util.concurrent.ThreadFactory;
71  import java.util.concurrent.ThreadPoolExecutor;
72  import java.util.concurrent.TimeUnit;
73  import java.util.concurrent.TimeoutException;
74  import java.util.concurrent.atomic.AtomicBoolean;
75  import java.util.concurrent.atomic.AtomicInteger;
76  import java.util.concurrent.atomic.AtomicLong;
77  import java.util.concurrent.locks.Lock;
78  import java.util.concurrent.locks.ReadWriteLock;
79  import java.util.concurrent.locks.ReentrantReadWriteLock;
80
81  import org.apache.commons.logging.Log;
82  import org.apache.commons.logging.LogFactory;
83  import org.apache.hadoop.conf.Configuration;
84  import org.apache.hadoop.fs.FileStatus;
85  import org.apache.hadoop.fs.FileSystem;
86  import org.apache.hadoop.fs.Path;
87  import org.apache.hadoop.hbase.Cell;
88  import org.apache.hadoop.hbase.CellComparator;
89  import org.apache.hadoop.hbase.CellScanner;
90  import org.apache.hadoop.hbase.CellUtil;
91  import org.apache.hadoop.hbase.CompoundConfiguration;
92  import org.apache.hadoop.hbase.DoNotRetryIOException;
93  import org.apache.hadoop.hbase.DroppedSnapshotException;
94  import org.apache.hadoop.hbase.HColumnDescriptor;
95  import org.apache.hadoop.hbase.HConstants;
96  import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
97  import org.apache.hadoop.hbase.HDFSBlocksDistribution;
98  import org.apache.hadoop.hbase.HRegionInfo;
99  import org.apache.hadoop.hbase.HTableDescriptor;
100 import org.apache.hadoop.hbase.KeyValue;
101 import org.apache.hadoop.hbase.KeyValueUtil;
102 import org.apache.hadoop.hbase.NamespaceDescriptor;
103 import org.apache.hadoop.hbase.NotServingRegionException;
104 import org.apache.hadoop.hbase.RegionTooBusyException;
105 import org.apache.hadoop.hbase.TableName;
106 import org.apache.hadoop.hbase.Tag;
107 import org.apache.hadoop.hbase.TagRewriteCell;
108 import org.apache.hadoop.hbase.TagUtil;
109 import org.apache.hadoop.hbase.UnknownScannerException;
110 import org.apache.hadoop.hbase.backup.HFileArchiver;
111 import org.apache.hadoop.hbase.classification.InterfaceAudience;
112 import org.apache.hadoop.hbase.client.Append;
113 import org.apache.hadoop.hbase.client.Delete;
114 import org.apache.hadoop.hbase.client.Durability;
115 import org.apache.hadoop.hbase.client.Get;
116 import org.apache.hadoop.hbase.client.Increment;
117 import org.apache.hadoop.hbase.client.IsolationLevel;
118 import org.apache.hadoop.hbase.client.Mutation;
119 import org.apache.hadoop.hbase.client.Put;
120 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
121 import org.apache.hadoop.hbase.client.Result;
122 import org.apache.hadoop.hbase.client.RowMutations;
123 import org.apache.hadoop.hbase.client.Scan;
124 import org.apache.hadoop.hbase.conf.ConfigurationManager;
125 import org.apache.hadoop.hbase.conf.PropagatingConfigurationObserver;
126 import org.apache.hadoop.hbase.coprocessor.RegionObserver.MutationType;
127 import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
128 import org.apache.hadoop.hbase.exceptions.FailedSanityCheckException;
129 import org.apache.hadoop.hbase.exceptions.RegionInRecoveryException;
130 import org.apache.hadoop.hbase.exceptions.UnknownProtocolException;
131 import org.apache.hadoop.hbase.filter.ByteArrayComparable;
132 import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
133 import org.apache.hadoop.hbase.filter.FilterWrapper;
134 import org.apache.hadoop.hbase.filter.IncompatibleFilterException;
135 import org.apache.hadoop.hbase.io.HeapSize;
136 import org.apache.hadoop.hbase.io.TimeRange;
137 import org.apache.hadoop.hbase.io.hfile.HFile;
138 import org.apache.hadoop.hbase.ipc.CallerDisconnectedException;
139 import org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils;
140 import org.apache.hadoop.hbase.ipc.RpcCallContext;
141 import org.apache.hadoop.hbase.ipc.RpcServer;
142 import org.apache.hadoop.hbase.mob.MobUtils;
143 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
144 import org.apache.hadoop.hbase.monitoring.TaskMonitor;
145 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
146 import org.apache.hadoop.hbase.protobuf.ResponseConverter;
147 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.GetRegionInfoResponse.CompactionState;
148 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
149 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall;
150 import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.RegionLoad;
151 import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.StoreSequenceId;
152 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
153 import org.apache.hadoop.hbase.protobuf.generated.WALProtos;
154 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.CompactionDescriptor;
155 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FlushDescriptor;
156 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FlushDescriptor.FlushAction;
157 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FlushDescriptor.StoreFlushDescriptor;
158 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.RegionEventDescriptor;
159 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.RegionEventDescriptor.EventType;
160 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.StoreDescriptor;
161 import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl.WriteEntry;
162 import org.apache.hadoop.hbase.regionserver.ScannerContext.LimitScope;
163 import org.apache.hadoop.hbase.regionserver.ScannerContext.NextState;
164 import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext;
165 import org.apache.hadoop.hbase.regionserver.throttle.CompactionThroughputControllerFactory;
166 import org.apache.hadoop.hbase.regionserver.throttle.NoLimitThroughputController;
167 import org.apache.hadoop.hbase.regionserver.throttle.ThroughputController;
168 import org.apache.hadoop.hbase.regionserver.wal.HLogKey;
169 import org.apache.hadoop.hbase.regionserver.wal.ReplayHLogKey;
170 import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
171 import org.apache.hadoop.hbase.regionserver.wal.WALUtil;
172 import org.apache.hadoop.hbase.security.User;
173 import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
174 import org.apache.hadoop.hbase.snapshot.SnapshotManifest;
175 import org.apache.hadoop.hbase.util.ByteStringer;
176 import org.apache.hadoop.hbase.util.Bytes;
177 import org.apache.hadoop.hbase.util.CancelableProgressable;
178 import org.apache.hadoop.hbase.util.ClassSize;
179 import org.apache.hadoop.hbase.util.CompressionTest;
180 import org.apache.hadoop.hbase.util.Counter;
181 import org.apache.hadoop.hbase.util.EncryptionTest;
182 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
183 import org.apache.hadoop.hbase.util.FSUtils;
184 import org.apache.hadoop.hbase.util.HashedBytes;
185 import org.apache.hadoop.hbase.util.Pair;
186 import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil;
187 import org.apache.hadoop.hbase.util.Threads;
188 import org.apache.hadoop.hbase.wal.WAL;
189 import org.apache.hadoop.hbase.wal.WALFactory;
190 import org.apache.hadoop.hbase.wal.WALKey;
191 import org.apache.hadoop.hbase.wal.WALSplitter;
192 import org.apache.hadoop.hbase.wal.WALSplitter.MutationReplay;
193 import org.apache.hadoop.io.MultipleIOException;
194 import org.apache.hadoop.util.StringUtils;
195 import org.apache.htrace.Trace;
196 import org.apache.htrace.TraceScope;
197
198
199 @SuppressWarnings("deprecation")
200 @InterfaceAudience.Private
201 public class HRegion implements HeapSize, PropagatingConfigurationObserver, Region {
202   private static final Log LOG = LogFactory.getLog(HRegion.class);
203
204   public static final String LOAD_CFS_ON_DEMAND_CONFIG_KEY =
205     "hbase.hregion.scan.loadColumnFamiliesOnDemand";
206
207   /**
208    * This is the global default value for durability. All tables/mutations not
209    * defining a durability or using USE_DEFAULT will default to this value.
210    */
211   private static final Durability DEFAULT_DURABILITY = Durability.SYNC_WAL;
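  // Illustrative sketch (not part of the original source): a table opts out of this default
  // through its descriptor; any setting other than USE_DEFAULT takes precedence over
  // DEFAULT_DURABILITY. The table name below is hypothetical.
  //   HTableDescriptor htd = new HTableDescriptor(TableName.valueOf("example_table"));
  //   htd.setDurability(Durability.ASYNC_WAL);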
212
213   final AtomicBoolean closed = new AtomicBoolean(false);
214
215   /* Closing can take some time; use the closing flag if there is stuff we don't
216    * want to do while in closing state; e.g. offering this region up to the
217    * master as a region to close if the carrying regionserver is overloaded.
218    * Once set, it is never cleared.
219    */
220   final AtomicBoolean closing = new AtomicBoolean(false);
221
222   /**
223    * The max sequence id of flushed data on this region. There is no edit in memory that is
224   * less than this sequence id.
225    */
226   private volatile long maxFlushedSeqId = HConstants.NO_SEQNUM;
227
228   /**
229    * Record the sequence id of last flush operation. Can be in advance of
230    * {@link #maxFlushedSeqId} when flushing a single column family. In this case,
231    * {@link #maxFlushedSeqId} will be older than the oldest edit in memory.
232    */
233   private volatile long lastFlushOpSeqId = HConstants.NO_SEQNUM;
234
235   /**
236    * The sequence id of the last replayed open region event from the primary region. This is used
237    * to skip edits with an earlier sequence id, since replayed edits may arrive out of order
238    * from replication.
239    */
240   protected volatile long lastReplayedOpenRegionSeqId = -1L;
241   protected volatile long lastReplayedCompactionSeqId = -1L;
242
243   //////////////////////////////////////////////////////////////////////////////
244   // Members
245   //////////////////////////////////////////////////////////////////////////////
246
247   // map from a locked row to the context for that lock including:
248   // - CountDownLatch for threads waiting on that row
249   // - the thread that owns the lock (allow reentrancy)
250   // - reference count of (reentrant) locks held by the thread
251   // - the row itself
252   private final ConcurrentHashMap<HashedBytes, RowLockContext> lockedRows =
253       new ConcurrentHashMap<HashedBytes, RowLockContext>();
254
255   protected final Map<byte[], Store> stores = new ConcurrentSkipListMap<byte[], Store>(
256       Bytes.BYTES_RAWCOMPARATOR);
257
258   // TODO: account for each registered handler in HeapSize computation
259   private Map<String, Service> coprocessorServiceHandlers = Maps.newHashMap();
260
261   private final AtomicLong memstoreSize = new AtomicLong(0);
262   private final RegionServicesForStores regionServicesForStores = new RegionServicesForStores(this);
263
264   // Debug possible data loss due to WAL off
265   final Counter numMutationsWithoutWAL = new Counter();
266   final Counter dataInMemoryWithoutWAL = new Counter();
267
268   // Debug why CAS operations are taking a while.
269   final Counter checkAndMutateChecksPassed = new Counter();
270   final Counter checkAndMutateChecksFailed = new Counter();
271
272   // Number of requests
273   final Counter readRequestsCount = new Counter();
274   final Counter filteredReadRequestsCount = new Counter();
275   final Counter writeRequestsCount = new Counter();
276
277   // Number of requests blocked by memstore size.
278   private final Counter blockedRequestsCount = new Counter();
279
280   // Compaction counters
281   final AtomicLong compactionsFinished = new AtomicLong(0L);
282   final AtomicLong compactionNumFilesCompacted = new AtomicLong(0L);
283   final AtomicLong compactionNumBytesCompacted = new AtomicLong(0L);
284
285   private final WAL wal;
286   private final HRegionFileSystem fs;
287   protected final Configuration conf;
288   private final Configuration baseConf;
289   private final int rowLockWaitDuration;
290   private CompactedHFilesDischarger compactedFileDischarger;
291   static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000;
292
293   // The internal wait duration to acquire a lock before read/update
294   // from the region. It is not per row. The purpose of this wait time
295   // is to avoid waiting a long time while the region is busy, so that
296   // we can release the IPC handler soon enough to improve the
297   // availability of the region server. It can be adjusted by
298   // tuning configuration "hbase.busy.wait.duration".
299   final long busyWaitDuration;
300   static final long DEFAULT_BUSY_WAIT_DURATION = HConstants.DEFAULT_HBASE_RPC_TIMEOUT;
301
302   // If updating multiple rows in one call, wait longer,
303   // i.e. waiting for busyWaitDuration * # of rows. However,
304   // we can limit the max multiplier.
305   final int maxBusyWaitMultiplier;
306
307   // Max busy wait duration. There is no point in waiting longer than the RPC
308   // purge timeout, after which an RPC call will be terminated by the RPC engine.
309   final long maxBusyWaitDuration;
310
311   // negative number indicates infinite timeout
312   static final long DEFAULT_ROW_PROCESSOR_TIMEOUT = 60 * 1000L;
313   final ExecutorService rowProcessorExecutor = Executors.newCachedThreadPool();
314
315   private final ConcurrentHashMap<RegionScanner, Long> scannerReadPoints;
316
317   /**
318    * The sequence ID that was encountered when this region was opened.
319    */
320   private long openSeqNum = HConstants.NO_SEQNUM;
321
322   /**
323    * The default setting for whether to enable on-demand CF loading for
324    * scan requests to this region. Requests can override it.
325    */
326   private boolean isLoadingCfsOnDemandDefault = false;
327
328   private final AtomicInteger majorInProgress = new AtomicInteger(0);
329   private final AtomicInteger minorInProgress = new AtomicInteger(0);
330
331   //
332   // Context: During replay we want to ensure that we do not lose any data. So, we
333   // have to be conservative in how we replay wals. For each store, we calculate
334   // the maxSeqId up to which the store was flushed. And, skip the edits which
335   // are equal to or lower than maxSeqId for each store.
336   // The following map is populated when opening the region
337   Map<byte[], Long> maxSeqIdInStores = new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
338
339   /** Saved state from replaying prepare flush cache */
340   private PrepareFlushResult prepareFlushResult = null;
341
342   /**
343    * Config setting for whether to allow writes when a region is in recovering or not.
344    */
345   private boolean disallowWritesInRecovering = false;
346
347   // When a region is in recovering state, it can only accept writes, not reads
348   private volatile boolean recovering = false;
349
350   private volatile Optional<ConfigurationManager> configurationManager;
351
352   /**
353    * @return The smallest mvcc readPoint across all the scanners in this
354    * region. Writes older than this readPoint are included in every
355    * read operation.
356    */
357   public long getSmallestReadPoint() {
358     long minimumReadPoint;
359     // We need to ensure that while we are calculating the smallestReadPoint
360     // no new RegionScanners can grab a readPoint that we are unaware of.
361     // We achieve this by synchronizing on the scannerReadPoints object.
362     synchronized(scannerReadPoints) {
363       minimumReadPoint = mvcc.getReadPoint();
364       for (Long readPoint: this.scannerReadPoints.values()) {
365         if (readPoint < minimumReadPoint) {
366           minimumReadPoint = readPoint;
367         }
368       }
369     }
370     return minimumReadPoint;
371   }
372
373   /*
374    * Data structure of write state flags used to coordinate flushes,
375    * compactions and closes.
376    */
377   static class WriteState {
378     // Set while a memstore flush is happening.
379     volatile boolean flushing = false;
380     // Set when a flush has been requested.
381     volatile boolean flushRequested = false;
382     // Number of compactions running.
383     AtomicInteger compacting = new AtomicInteger(0);
384     // Gets set in close. If set, cannot compact or flush again.
385     volatile boolean writesEnabled = true;
386     // Set if region is read-only
387     volatile boolean readOnly = false;
388     // whether reads are enabled. This is different from readOnly, because readOnly is
389     // static in the lifetime of the region, while readsEnabled is dynamic
390     volatile boolean readsEnabled = true;
391
392     /**
393      * Set flags that make this region read-only.
394      *
395      * @param onOff flip value for region r/o setting
396      */
397     synchronized void setReadOnly(final boolean onOff) {
398       this.writesEnabled = !onOff;
399       this.readOnly = onOff;
400     }
401
402     boolean isReadOnly() {
403       return this.readOnly;
404     }
405
406     boolean isFlushRequested() {
407       return this.flushRequested;
408     }
409
410     void setReadsEnabled(boolean readsEnabled) {
411       this.readsEnabled = readsEnabled;
412     }
413
414     static final long HEAP_SIZE = ClassSize.align(
415         ClassSize.OBJECT + 5 * Bytes.SIZEOF_BOOLEAN);
416   }
417
418   /**
419    * Objects from this class are created when flushing to describe all the different states that
420    * that method ends up in. The Result enum describes those states. The sequence id should only
421    * be specified if the flush was successful, and the failure message should only be specified
422    * if it didn't flush.
423    */
424   public static class FlushResultImpl implements FlushResult {
425     final Result result;
426     final String failureReason;
427     final long flushSequenceId;
428     final boolean wroteFlushWalMarker;
429
430     /**
431      * Convenience constructor to use when the flush is successful; the failure message is set to
432      * null.
433      * @param result Expecting FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
434      * @param flushSequenceId Generated sequence id that comes right after the edits in the
435      *                        memstores.
436      */
437     FlushResultImpl(Result result, long flushSequenceId) {
438       this(result, flushSequenceId, null, false);
439       assert result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
440           .FLUSHED_COMPACTION_NEEDED;
441     }
442
443     /**
444      * Convenience constructor to use when we cannot flush.
445      * @param result Expecting CANNOT_FLUSH_MEMSTORE_EMPTY or CANNOT_FLUSH.
446      * @param failureReason Reason why we couldn't flush.
447      */
448     FlushResultImpl(Result result, String failureReason, boolean wroteFlushMarker) {
449       this(result, -1, failureReason, wroteFlushMarker);
450       assert result == Result.CANNOT_FLUSH_MEMSTORE_EMPTY || result == Result.CANNOT_FLUSH;
451     }
452
453     /**
454      * Constructor with all the parameters.
455      * @param result Any of the Result.
456      * @param flushSequenceId Generated sequence id if the memstores were flushed else -1.
457      * @param failureReason Reason why we couldn't flush, or null.
458      */
459     FlushResultImpl(Result result, long flushSequenceId, String failureReason,
460       boolean wroteFlushMarker) {
461       this.result = result;
462       this.flushSequenceId = flushSequenceId;
463       this.failureReason = failureReason;
464       this.wroteFlushWalMarker = wroteFlushMarker;
465     }
466
467     /**
468      * Convenience method, the equivalent of checking if result is
469    * FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
470      * @return true if the memstores were flushed, else false.
471      */
472     @Override
473     public boolean isFlushSucceeded() {
474       return result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
475           .FLUSHED_COMPACTION_NEEDED;
476     }
477
478     /**
479      * Convenience method, the equivalent of checking if result is FLUSHED_COMPACTION_NEEDED.
480      * @return True if the flush requested a compaction, else false (doesn't even mean it flushed).
481      */
482     @Override
483     public boolean isCompactionNeeded() {
484       return result == Result.FLUSHED_COMPACTION_NEEDED;
485     }
486
487     @Override
488     public String toString() {
489       return new StringBuilder()
490         .append("flush result:").append(result).append(", ")
491         .append("failureReason:").append(failureReason).append(",")
492         .append("flush seq id").append(flushSequenceId).toString();
493     }
494
495     @Override
496     public Result getResult() {
497       return result;
498     }
499   }
500
501   /** A result object from prepare flush cache stage */
502   @VisibleForTesting
503   static class PrepareFlushResult {
504     final FlushResult result; // indicating a failure result from prepare
505     final TreeMap<byte[], StoreFlushContext> storeFlushCtxs;
506     final TreeMap<byte[], List<Path>> committedFiles;
507     final TreeMap<byte[], Long> storeFlushableSize;
508     final long startTime;
509     final long flushOpSeqId;
510     final long flushedSeqId;
511     final long totalFlushableSize;
512
513     /** Constructs an early exit case */
514     PrepareFlushResult(FlushResult result, long flushSeqId) {
515       this(result, null, null, null, Math.max(0, flushSeqId), 0, 0, 0);
516     }
517
518     /** Constructs a successful prepare flush result */
519     PrepareFlushResult(
520       TreeMap<byte[], StoreFlushContext> storeFlushCtxs,
521       TreeMap<byte[], List<Path>> committedFiles,
522       TreeMap<byte[], Long> storeFlushableSize, long startTime, long flushSeqId,
523       long flushedSeqId, long totalFlushableSize) {
524       this(null, storeFlushCtxs, committedFiles, storeFlushableSize, startTime,
525         flushSeqId, flushedSeqId, totalFlushableSize);
526     }
527
528     private PrepareFlushResult(
529       FlushResult result,
530       TreeMap<byte[], StoreFlushContext> storeFlushCtxs,
531       TreeMap<byte[], List<Path>> committedFiles,
532       TreeMap<byte[], Long> storeFlushableSize, long startTime, long flushSeqId,
533       long flushedSeqId, long totalFlushableSize) {
534       this.result = result;
535       this.storeFlushCtxs = storeFlushCtxs;
536       this.committedFiles = committedFiles;
537       this.storeFlushableSize = storeFlushableSize;
538       this.startTime = startTime;
539       this.flushOpSeqId = flushSeqId;
540       this.flushedSeqId = flushedSeqId;
541       this.totalFlushableSize = totalFlushableSize;
542     }
543
544     public FlushResult getResult() {
545       return this.result;
546     }
547   }
548
549   final WriteState writestate = new WriteState();
550
551   long memstoreFlushSize;
552   final long timestampSlop;
553   final long rowProcessorTimeout;
554
555   // Last flush time for each Store. Useful when we are flushing individual column families.
556   private final ConcurrentMap<Store, Long> lastStoreFlushTimeMap =
557       new ConcurrentHashMap<Store, Long>();
558
559   final RegionServerServices rsServices;
560   private RegionServerAccounting rsAccounting;
561   private long flushCheckInterval;
562   // flushPerChanges is to prevent too many changes in memstore
563   private long flushPerChanges;
564   private long blockingMemStoreSize;
565   final long threadWakeFrequency;
566   // Used to guard closes
567   final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
568
569   // Stop updates lock
570   private final ReentrantReadWriteLock updatesLock = new ReentrantReadWriteLock();
571   private boolean splitRequest;
572   private byte[] explicitSplitPoint = null;
573
574   private final MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl();
575
576   // Coprocessor host
577   private RegionCoprocessorHost coprocessorHost;
578
579   private HTableDescriptor htableDescriptor = null;
580   private RegionSplitPolicy splitPolicy;
581   private FlushPolicy flushPolicy;
582
583   private final MetricsRegion metricsRegion;
584   private final MetricsRegionWrapperImpl metricsRegionWrapper;
585   private final Durability durability;
586   private final boolean regionStatsEnabled;
587   // Stores the replication scope of the various column families of the table
588   // that have a non-default scope
589   private final NavigableMap<byte[], Integer> replicationScope = new TreeMap<byte[], Integer>(
590       Bytes.BYTES_COMPARATOR);
591
592   /**
593    * HRegion constructor. This constructor should only be used for testing and
594    * extensions.  Instances of HRegion should be instantiated with the
595    * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
596    *
597    * @param tableDir qualified path of directory where region should be located,
598    * usually the table directory.
599    * @param wal The WAL is the outbound log for any updates to the HRegion.
600    * The wal file is a logfile from the previous execution that's
601    * custom-computed for this HRegion. The HRegionServer computes and sorts the
602    * appropriate wal info for this HRegion. If there is a previous wal file
603    * (implying that the HRegion has been written-to before), then read it from
604    * the supplied path.
605    * @param fs is the filesystem.
606    * @param confParam is global configuration settings.
607    * @param regionInfo - HRegionInfo that describes the region
609    * @param htd the table descriptor
610    * @param rsServices reference to {@link RegionServerServices} or null
611    * @deprecated Use other constructors.
612    */
613   @Deprecated
614   @VisibleForTesting
615   public HRegion(final Path tableDir, final WAL wal, final FileSystem fs,
616       final Configuration confParam, final HRegionInfo regionInfo,
617       final HTableDescriptor htd, final RegionServerServices rsServices) {
618     this(new HRegionFileSystem(confParam, fs, tableDir, regionInfo),
619       wal, confParam, htd, rsServices);
620   }
621
622   /**
623    * HRegion constructor. This constructor should only be used for testing and
624    * extensions.  Instances of HRegion should be instantiated with the
625    * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
626    *
627    * @param fs is the filesystem.
628    * @param wal The WAL is the outbound log for any updates to the HRegion.
629    * The wal file is a logfile from the previous execution that's
630    * custom-computed for this HRegion. The HRegionServer computes and sorts the
631    * appropriate wal info for this HRegion. If there is a previous wal file
632    * (implying that the HRegion has been written-to before), then read it from
633    * the supplied path.
634    * @param confParam is global configuration settings.
635    * @param htd the table descriptor
636    * @param rsServices reference to {@link RegionServerServices} or null
637    */
638   public HRegion(final HRegionFileSystem fs, final WAL wal, final Configuration confParam,
639       final HTableDescriptor htd, final RegionServerServices rsServices) {
640     if (htd == null) {
641       throw new IllegalArgumentException("Need table descriptor");
642     }
643
644     if (confParam instanceof CompoundConfiguration) {
645       throw new IllegalArgumentException("Need original base configuration");
646     }
647
648     this.wal = wal;
649     this.fs = fs;
650
651     // 'conf' renamed to 'confParam' b/c we use this.conf in the constructor
652     this.baseConf = confParam;
653     this.conf = new CompoundConfiguration()
654       .add(confParam)
655       .addStringMap(htd.getConfiguration())
656       .addBytesMap(htd.getValues());
657     this.flushCheckInterval = conf.getInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL,
658         DEFAULT_CACHE_FLUSH_INTERVAL);
659     this.flushPerChanges = conf.getLong(MEMSTORE_FLUSH_PER_CHANGES, DEFAULT_FLUSH_PER_CHANGES);
660     if (this.flushPerChanges > MAX_FLUSH_PER_CHANGES) {
661       throw new IllegalArgumentException(MEMSTORE_FLUSH_PER_CHANGES + " can not exceed "
662           + MAX_FLUSH_PER_CHANGES);
663     }
664     this.rowLockWaitDuration = conf.getInt("hbase.rowlock.wait.duration",
665                     DEFAULT_ROWLOCK_WAIT_DURATION);
666
667     this.isLoadingCfsOnDemandDefault = conf.getBoolean(LOAD_CFS_ON_DEMAND_CONFIG_KEY, true);
668     this.htableDescriptor = htd;
669     Set<byte[]> families = this.htableDescriptor.getFamiliesKeys();
670     for (byte[] family : families) {
671       if (!replicationScope.containsKey(family)) {
672         int scope = htd.getFamily(family).getScope();
673         // Only store those families that have a NON-DEFAULT scope
674         if (scope != REPLICATION_SCOPE_LOCAL) {
675           // Do a copy before storing it here.
676           replicationScope.put(Bytes.copy(family), scope);
677         }
678       }
679     }
680     this.rsServices = rsServices;
681     this.threadWakeFrequency = conf.getLong(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000);
682     setHTableSpecificConf();
683     this.scannerReadPoints = new ConcurrentHashMap<RegionScanner, Long>();
684
685     this.busyWaitDuration = conf.getLong(
686       "hbase.busy.wait.duration", DEFAULT_BUSY_WAIT_DURATION);
687     this.maxBusyWaitMultiplier = conf.getInt("hbase.busy.wait.multiplier.max", 2);
688     if (busyWaitDuration * maxBusyWaitMultiplier <= 0L) {
689       throw new IllegalArgumentException("Invalid hbase.busy.wait.duration ("
690         + busyWaitDuration + ") or hbase.busy.wait.multiplier.max ("
691         + maxBusyWaitMultiplier + "). Their product should be positive");
692     }
693     this.maxBusyWaitDuration = conf.getLong("hbase.ipc.client.call.purge.timeout",
694       2 * HConstants.DEFAULT_HBASE_RPC_TIMEOUT);
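    // Worked example (a sketch, not part of the original source): with the defaults read above,
    // a single-row call may be told to back off for up to busyWaitDuration, a multi-row call for
    // up to busyWaitDuration * min(rows, maxBusyWaitMultiplier), and never longer than
    // maxBusyWaitDuration. A sample override in hbase-site.xml:
    //   <property><name>hbase.busy.wait.duration</name><value>30000</value></property>
    //   <property><name>hbase.busy.wait.multiplier.max</name><value>3</value></property>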
695
696     /*
697      * timestamp.slop provides a server-side constraint on the timestamp. This
698      * assumes that you base your TS around currentTimeMillis(). In this case,
699      * throw an error to the user if the user-specified TS is newer than now +
700      * slop. LATEST_TIMESTAMP == don't use this functionality
701      */
702     this.timestampSlop = conf.getLong(
703         "hbase.hregion.keyvalue.timestamp.slop.millisecs",
704         HConstants.LATEST_TIMESTAMP);
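    // Illustrative example (not part of the original source): with the slop set to 2000 as below,
    // a mutation whose user-supplied timestamp is more than 2 seconds ahead of the server clock
    // fails the sanity check; the default of LATEST_TIMESTAMP disables the check entirely.
    //   <property>
    //     <name>hbase.hregion.keyvalue.timestamp.slop.millisecs</name>
    //     <value>2000</value>
    //   </property>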
705
706     /**
707      * Timeout for the process time in processRowsWithLocks().
708      * Use -1 to switch off time bound.
709      */
710     this.rowProcessorTimeout = conf.getLong(
711         "hbase.hregion.row.processor.timeout", DEFAULT_ROW_PROCESSOR_TIMEOUT);
712     this.durability = htd.getDurability() == Durability.USE_DEFAULT
713         ? DEFAULT_DURABILITY
714         : htd.getDurability();
715     if (rsServices != null) {
716       this.rsAccounting = this.rsServices.getRegionServerAccounting();
717       // don't initialize coprocessors if not running within a regionserver
718       // TODO: revisit if coprocessors should load in other cases
719       this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf);
720       this.metricsRegionWrapper = new MetricsRegionWrapperImpl(this);
721       this.metricsRegion = new MetricsRegion(this.metricsRegionWrapper);
722
723       Map<String, Region> recoveringRegions = rsServices.getRecoveringRegions();
724       String encodedName = getRegionInfo().getEncodedName();
725       if (recoveringRegions != null && recoveringRegions.containsKey(encodedName)) {
726         this.recovering = true;
727         recoveringRegions.put(encodedName, this);
728       }
729     } else {
730       this.metricsRegionWrapper = null;
731       this.metricsRegion = null;
732     }
733     if (LOG.isDebugEnabled()) {
734       // Write out region name as string and its encoded name.
735       LOG.debug("Instantiated " + this);
736     }
737
738     // by default, we allow writes against a region while it is recovering
739     this.disallowWritesInRecovering =
740         conf.getBoolean(HConstants.DISALLOW_WRITES_IN_RECOVERING,
741           HConstants.DEFAULT_DISALLOW_WRITES_IN_RECOVERING_CONFIG);
742     configurationManager = Optional.absent();
743
744     // disable stats tracking for system tables, but check the config for everything else
745     this.regionStatsEnabled = htd.getTableName().getNamespaceAsString().equals(
746         NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR) ?
747           false :
748           conf.getBoolean(HConstants.ENABLE_CLIENT_BACKPRESSURE,
749               HConstants.DEFAULT_ENABLE_CLIENT_BACKPRESSURE);
750   }
751
752   void setHTableSpecificConf() {
753     if (this.htableDescriptor == null) return;
754     long flushSize = this.htableDescriptor.getMemStoreFlushSize();
755
756     if (flushSize <= 0) {
757       flushSize = conf.getLong(HConstants.HREGION_MEMSTORE_FLUSH_SIZE,
758         HTableDescriptor.DEFAULT_MEMSTORE_FLUSH_SIZE);
759     }
760     this.memstoreFlushSize = flushSize;
761     this.blockingMemStoreSize = this.memstoreFlushSize *
762         conf.getLong(HConstants.HREGION_MEMSTORE_BLOCK_MULTIPLIER,
763                 HConstants.DEFAULT_HREGION_MEMSTORE_BLOCK_MULTIPLIER);
764   }
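  // Worked example (a sketch, not part of the original source): with the common defaults of a
  // 128 MB memstore flush size and a block multiplier of 4, updates to this region are blocked
  // once its memstore reaches 128 MB * 4 = 512 MB, until flushes bring the size back down.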
765
766   /**
767    * Initialize this region.
768    * Used only by tests and SplitTransaction to reopen the region.
769    * You should use createHRegion() or openHRegion()
770    * @return What the next sequence (edit) id should be.
771    * @throws IOException e
772    * @deprecated use HRegion.createHRegion() or HRegion.openHRegion()
773    */
774   @Deprecated
775   public long initialize() throws IOException {
776     return initialize(null);
777   }
778
779   /**
780    * Initialize this region.
781    *
782    * @param reporter Tickle every so often if initialize is taking a while.
783    * @return What the next sequence (edit) id should be.
784    * @throws IOException e
785    */
786   private long initialize(final CancelableProgressable reporter) throws IOException {
787
788     // Refuse to open the region if there is no column family in the table
789     if (htableDescriptor.getColumnFamilies().length == 0) {
790       throw new DoNotRetryIOException("Table " + htableDescriptor.getNameAsString() +
791           " should have at least one column family.");
792     }
793
794     MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this);
795     long nextSeqId = -1;
796     try {
797       nextSeqId = initializeRegionInternals(reporter, status);
798       return nextSeqId;
799     } finally {
800       // nextSeqId will be -1 if the initialization fails;
801       // otherwise it will be at least 0.
802       if (nextSeqId == -1) {
803         status.abort("Exception during region " + getRegionInfo().getRegionNameAsString() +
804           " initialization.");
805       }
806     }
807   }
808
809   private long initializeRegionInternals(final CancelableProgressable reporter,
810       final MonitoredTask status) throws IOException {
811     if (coprocessorHost != null) {
812       status.setStatus("Running coprocessor pre-open hook");
813       coprocessorHost.preOpen();
814     }
815
816     // Write HRI to a file in case we need to recover hbase:meta
817     status.setStatus("Writing region info on filesystem");
818     fs.checkRegionInfoOnFilesystem();
819
820     // Initialize all the HStores
821     status.setStatus("Initializing all the Stores");
822     long maxSeqId = initializeStores(reporter, status);
823     this.mvcc.advanceTo(maxSeqId);
824     if (ServerRegionReplicaUtil.shouldReplayRecoveredEdits(this)) {
825       // Recover any edits if available.
826       maxSeqId = Math.max(maxSeqId,
827         replayRecoveredEditsIfAny(this.fs.getRegionDir(), maxSeqIdInStores, reporter, status));
828       // Make sure mvcc is up to max.
829       this.mvcc.advanceTo(maxSeqId);
830     }
831     this.lastReplayedOpenRegionSeqId = maxSeqId;
832
833     this.writestate.setReadOnly(ServerRegionReplicaUtil.isReadOnly(this));
834     this.writestate.flushRequested = false;
835     this.writestate.compacting.set(0);
836
837     if (this.writestate.writesEnabled) {
838       // Remove temporary data left over from old regions
839       status.setStatus("Cleaning up temporary data from old regions");
840       fs.cleanupTempDir();
841     }
842
843     if (this.writestate.writesEnabled) {
844       status.setStatus("Cleaning up detritus from prior splits");
845       // Get rid of any splits or merges that were lost in-progress.  Clean out
846       // these directories here on open.  We may be opening a region that was
847       // being split but we crashed in the middle of it all.
848       fs.cleanupAnySplitDetritus();
849       fs.cleanupMergesDir();
850     }
851
852     // Initialize split policy
853     this.splitPolicy = RegionSplitPolicy.create(this, conf);
854
855     // Initialize flush policy
856     this.flushPolicy = FlushPolicyFactory.create(this, conf);
857
858     long lastFlushTime = EnvironmentEdgeManager.currentTime();
859     for (Store store: stores.values()) {
860       this.lastStoreFlushTimeMap.put(store, lastFlushTime);
861     }
862
863     // Use the maximum of the wal sequence id and the one found in the stores
864     // (if there are no recovered edits, the seqid will be -1).
865     long nextSeqid = maxSeqId;
866
867     // In distributedLogReplay mode, we don't know the last change sequence number because region
868     // is opened before recovery completes. So we add a safety bumper so that new sequence
869     // numbers do not overlap sequence numbers that are already in use.
870     if (this.writestate.writesEnabled) {
871       nextSeqid = WALSplitter.writeRegionSequenceIdFile(this.fs.getFileSystem(), this.fs
872           .getRegionDir(), nextSeqid, (this.recovering ? (this.flushPerChanges + 10000000) : 1));
873     } else {
874       nextSeqid++;
875     }
876
877     LOG.info("Onlined " + this.getRegionInfo().getShortNameToLog() +
878       "; next sequenceid=" + nextSeqid);
879
880     // A region can be reopened if it failed a split; reset flags
881     this.closing.set(false);
882     this.closed.set(false);
883
884     if (coprocessorHost != null) {
885       status.setStatus("Running coprocessor post-open hooks");
886       coprocessorHost.postOpen();
887     }
888
889     status.markComplete("Region opened successfully");
890     return nextSeqid;
891   }
892
893   /**
894    * Open all Stores.
895    * @param reporter
896    * @param status
897    * @return Highest sequenceId found out in a Store.
898    * @throws IOException
899    */
900   private long initializeStores(final CancelableProgressable reporter, MonitoredTask status)
901   throws IOException {
902     // Load in all the HStores.
903
904     long maxSeqId = -1;
905     // initialized to -1 so that we pick up MemstoreTS from column families
906     long maxMemstoreTS = -1;
907
908     if (!htableDescriptor.getFamilies().isEmpty()) {
909       // initialize the thread pool for opening stores in parallel.
910       ThreadPoolExecutor storeOpenerThreadPool =
911         getStoreOpenAndCloseThreadPool("StoreOpener-" + this.getRegionInfo().getShortNameToLog());
912       CompletionService<HStore> completionService =
913         new ExecutorCompletionService<HStore>(storeOpenerThreadPool);
914
915       // initialize each store in parallel
916       for (final HColumnDescriptor family : htableDescriptor.getFamilies()) {
917         status.setStatus("Instantiating store for column family " + family);
918         completionService.submit(new Callable<HStore>() {
919           @Override
920           public HStore call() throws IOException {
921             return instantiateHStore(family);
922           }
923         });
924       }
925       boolean allStoresOpened = false;
926       boolean hasSloppyStores = false;
927       try {
928         for (int i = 0; i < htableDescriptor.getFamilies().size(); i++) {
929           Future<HStore> future = completionService.take();
930           HStore store = future.get();
931           this.stores.put(store.getFamily().getName(), store);
932           MemStore memStore = store.getMemStore();
933           if(memStore != null && memStore.isSloppy()) {
934             hasSloppyStores = true;
935           }
936
937           long storeMaxSequenceId = store.getMaxSequenceId();
938           maxSeqIdInStores.put(store.getColumnFamilyName().getBytes(),
939               storeMaxSequenceId);
940           if (maxSeqId == -1 || storeMaxSequenceId > maxSeqId) {
941             maxSeqId = storeMaxSequenceId;
942           }
943           long maxStoreMemstoreTS = store.getMaxMemstoreTS();
944           if (maxStoreMemstoreTS > maxMemstoreTS) {
945             maxMemstoreTS = maxStoreMemstoreTS;
946           }
947         }
948         allStoresOpened = true;
949         if(hasSloppyStores) {
950           htableDescriptor.setFlushPolicyClassName(FlushNonSloppyStoresFirstPolicy.class
951               .getName());
952           LOG.info("Setting FlushNonSloppyStoresFirstPolicy for the region=" + this);
953         }
954       } catch (InterruptedException e) {
955         throw (InterruptedIOException)new InterruptedIOException().initCause(e);
956       } catch (ExecutionException e) {
957         throw new IOException(e.getCause());
958       } finally {
959         storeOpenerThreadPool.shutdownNow();
960         if (!allStoresOpened) {
961           // something went wrong, close all opened stores
962           LOG.error("Could not initialize all stores for the region=" + this);
963           for (Store store : this.stores.values()) {
964             try {
965               store.close();
966             } catch (IOException e) {
967               LOG.warn(e.getMessage());
968             }
969           }
970         }
971       }
972     }
973     return Math.max(maxSeqId, maxMemstoreTS + 1);
974   }
975
976   private void initializeWarmup(final CancelableProgressable reporter) throws IOException {
977     MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this);
978     // Initialize all the HStores
979     status.setStatus("Warming up all the Stores");
980     try {
981       initializeStores(reporter, status);
982     } finally {
983       status.markComplete("Done warming up.");
984     }
985   }
986
987   /**
988    * @return Map of StoreFiles by column family
989    */
990   private NavigableMap<byte[], List<Path>> getStoreFiles() {
991     NavigableMap<byte[], List<Path>> allStoreFiles =
992       new TreeMap<byte[], List<Path>>(Bytes.BYTES_COMPARATOR);
993     for (Store store: getStores()) {
994       Collection<StoreFile> storeFiles = store.getStorefiles();
995       if (storeFiles == null) continue;
996       List<Path> storeFileNames = new ArrayList<Path>();
997       for (StoreFile storeFile: storeFiles) {
998         storeFileNames.add(storeFile.getPath());
999       }
1000       allStoreFiles.put(store.getFamily().getName(), storeFileNames);
1001     }
1002     return allStoreFiles;
1003   }
1004
1005   private void writeRegionOpenMarker(WAL wal, long openSeqId) throws IOException {
1006     Map<byte[], List<Path>> storeFiles = getStoreFiles();
1007     RegionEventDescriptor regionOpenDesc = ProtobufUtil.toRegionEventDescriptor(
1008       RegionEventDescriptor.EventType.REGION_OPEN, getRegionInfo(), openSeqId,
1009       getRegionServerServices().getServerName(), storeFiles);
1010     WALUtil.writeRegionEventMarker(wal, getReplicationScope(), getRegionInfo(), regionOpenDesc,
1011         mvcc);
1012   }
1013
1014   private void writeRegionCloseMarker(WAL wal) throws IOException {
1015     Map<byte[], List<Path>> storeFiles = getStoreFiles();
1016     RegionEventDescriptor regionEventDesc = ProtobufUtil.toRegionEventDescriptor(
1017       RegionEventDescriptor.EventType.REGION_CLOSE, getRegionInfo(), mvcc.getReadPoint(),
1018       getRegionServerServices().getServerName(), storeFiles);
1019     WALUtil.writeRegionEventMarker(wal, getReplicationScope(), getRegionInfo(), regionEventDesc,
1020         mvcc);
1021
1022     // Store SeqId in HDFS when a region closes
1023     // The check that the region folder exists is there because many tests delete the table
1024     // folder while the table is still online.
1025     if (this.fs.getFileSystem().exists(this.fs.getRegionDir())) {
1026       WALSplitter.writeRegionSequenceIdFile(this.fs.getFileSystem(), this.fs.getRegionDir(),
1027         mvcc.getReadPoint(), 0);
1028     }
1029   }
1030
1031   /**
1032    * @return True if this region has references.
1033    */
1034   public boolean hasReferences() {
1035     for (Store store : this.stores.values()) {
1036       if (store.hasReferences()) return true;
1037     }
1038     return false;
1039   }
1040
1041   public void blockUpdates() {
1042     this.updatesLock.writeLock().lock();
1043   }
1044
1045   public void unblockUpdates() {
1046     this.updatesLock.writeLock().unlock();
1047   }
1048
1049   @Override
1050   public HDFSBlocksDistribution getHDFSBlocksDistribution() {
1051     HDFSBlocksDistribution hdfsBlocksDistribution =
1052       new HDFSBlocksDistribution();
1053     synchronized (this.stores) {
1054       for (Store store : this.stores.values()) {
1055         Collection<StoreFile> storeFiles = store.getStorefiles();
1056         if (storeFiles == null) continue;
1057         for (StoreFile sf : storeFiles) {
1058           HDFSBlocksDistribution storeFileBlocksDistribution =
1059             sf.getHDFSBlockDistribution();
1060           hdfsBlocksDistribution.add(storeFileBlocksDistribution);
1061         }
1062       }
1063     }
1064     return hdfsBlocksDistribution;
1065   }
1066
1067   /**
1068    * This is a helper function to compute HDFS block distribution on demand
1069    * @param conf configuration
1070    * @param tableDescriptor HTableDescriptor of the table
1071    * @param regionInfo HRegionInfo describing the region
1072    * @return The HDFS blocks distribution for the given region.
1073    * @throws IOException
1074    */
1075   public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf,
1076       final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo) throws IOException {
1077     Path tablePath = FSUtils.getTableDir(FSUtils.getRootDir(conf), tableDescriptor.getTableName());
1078     return computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo, tablePath);
1079   }
1080
1081   /**
1082    * This is a helper function to compute HDFS block distribution on demand
1083    * @param conf configuration
1084    * @param tableDescriptor HTableDescriptor of the table
1085    * @param regionInfo HRegionInfo describing the region
1086    * @param tablePath the table directory
1087    * @return The HDFS blocks distribution for the given region.
1088    * @throws IOException
1089    */
1090   public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf,
1091       final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo,  Path tablePath)
1092       throws IOException {
1093     HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution();
1094     FileSystem fs = tablePath.getFileSystem(conf);
1095
1096     HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tablePath, regionInfo);
1097     for (HColumnDescriptor family: tableDescriptor.getFamilies()) {
1098       Collection<StoreFileInfo> storeFiles = regionFs.getStoreFiles(family.getNameAsString());
1099       if (storeFiles == null) continue;
1100       for (StoreFileInfo storeFileInfo : storeFiles) {
1101         try {
1102           hdfsBlocksDistribution.add(storeFileInfo.computeHDFSBlocksDistribution(fs));
1103         } catch (IOException ioe) {
1104           LOG.warn("Error getting hdfs block distribution for " + storeFileInfo);
1105         }
1106       }
1107     }
1108     return hdfsBlocksDistribution;
1109   }
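  // Usage sketch (not part of the original source); the variable names below are hypothetical.
  // A caller that wants the HDFS locality of a region without opening it can do roughly:
  //   Configuration conf = HBaseConfiguration.create();
  //   HDFSBlocksDistribution dist =
  //       HRegion.computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo);
  //   float locality = dist.getBlockLocalityIndex(serverHostname);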
1110
1111   /**
1112    * Increase the size of the memstore in this region and the size of the global
1113    * memstore.
1114    * @return the size of memstore in this region
1115    */
1116   public long addAndGetGlobalMemstoreSize(long memStoreSize) {
1117     if (this.rsAccounting != null) {
1118       rsAccounting.addAndGetGlobalMemstoreSize(memStoreSize);
1119     }
1120     long size = this.memstoreSize.addAndGet(memStoreSize);
1121     // This is extremely bad if we make memstoreSize negative. Log as much info on the offending
1122     // caller as possible. (memStoreSize might be a negative value already -- freeing memory)
1123     if (size < 0) {
1124       LOG.error("Asked to modify this region's (" + this.toString()
1125       + ") memstoreSize to a negative value which is incorrect. Current memstoreSize="
1126       + (size-memStoreSize) + ", delta=" + memStoreSize, new Exception());
1127     }
1128     return size;
1129   }
1130
1131   @Override
1132   public HRegionInfo getRegionInfo() {
1133     return this.fs.getRegionInfo();
1134   }
1135
1136   /**
1137    * @return Instance of {@link RegionServerServices} used by this HRegion.
1138    * Can be null.
1139    */
1140   RegionServerServices getRegionServerServices() {
1141     return this.rsServices;
1142   }
1143
1144   @Override
1145   public long getReadRequestsCount() {
1146     return readRequestsCount.get();
1147   }
1148
1149   @Override
1150   public void updateReadRequestsCount(long i) {
1151     readRequestsCount.add(i);
1152   }
1153
1154   @Override
1155   public long getFilteredReadRequestsCount() {
1156     return filteredReadRequestsCount.get();
1157   }
1158
1159   @Override
1160   public long getWriteRequestsCount() {
1161     return writeRequestsCount.get();
1162   }
1163
1164   @Override
1165   public void updateWriteRequestsCount(long i) {
1166     writeRequestsCount.add(i);
1167   }
1168
1169   @Override
1170   public long getMemstoreSize() {
1171     return memstoreSize.get();
1172   }
1173
1174   @Override
1175   public RegionServicesForStores getRegionServicesForStores() {
1176     return regionServicesForStores;
1177   }
1178
1179   @Override
1180   public long getNumMutationsWithoutWAL() {
1181     return numMutationsWithoutWAL.get();
1182   }
1183
1184   @Override
1185   public long getDataInMemoryWithoutWAL() {
1186     return dataInMemoryWithoutWAL.get();
1187   }
1188
1189   @Override
1190   public long getBlockedRequestsCount() {
1191     return blockedRequestsCount.get();
1192   }
1193
1194   @Override
1195   public long getCheckAndMutateChecksPassed() {
1196     return checkAndMutateChecksPassed.get();
1197   }
1198
1199   @Override
1200   public long getCheckAndMutateChecksFailed() {
1201     return checkAndMutateChecksFailed.get();
1202   }
1203
1204   @Override
1205   public MetricsRegion getMetrics() {
1206     return metricsRegion;
1207   }
1208
1209   @Override
1210   public boolean isClosed() {
1211     return this.closed.get();
1212   }
1213
1214   @Override
1215   public boolean isClosing() {
1216     return this.closing.get();
1217   }
1218
1219   @Override
1220   public boolean isReadOnly() {
1221     return this.writestate.isReadOnly();
1222   }
1223
1224   /**
1225    * Reset recovering state of current region
1226    */
1227   public void setRecovering(boolean newState) {
1228     boolean wasRecovering = this.recovering;
1229     // Before we flip the recovering switch (enabling reads) we should write the region open
1230     // event to WAL if needed
1231     if (wal != null && getRegionServerServices() != null && !writestate.readOnly
1232         && wasRecovering && !newState) {
1233
1234       // force a flush only if region replication is set up for this region. Otherwise no need.
1235       boolean forceFlush = getTableDesc().getRegionReplication() > 1;
1236
1237       MonitoredTask status = TaskMonitor.get().createStatus("Recovering region " + this);
1238
1239       try {
1240         // force a flush first
1241         if (forceFlush) {
1242           status.setStatus("Flushing region " + this + " because recovery is finished");
1243           internalFlushcache(status);
1244         }
1245
1246         status.setStatus("Writing region open event marker to WAL because recovery is finished");
1247         try {
1248           long seqId = openSeqNum;
1249           // obtain a new seqId because we possibly have writes and flushes on top of openSeqNum
1250           if (wal != null) {
1251             seqId = getNextSequenceId(wal);
1252           }
1253           writeRegionOpenMarker(wal, seqId);
1254         } catch (IOException e) {
1255           // We cannot rethrow this exception since we are being called from the zk thread. The
1256           // region has already opened. In this case we log the error, but continue
1257           LOG.warn(getRegionInfo().getEncodedName() + " : was not able to write region opening "
1258               + "event to WAL, continuing", e);
1259         }
1260       } catch (IOException ioe) {
1261         // Distributed log replay semantics do not necessarily require a flush, since the replayed
1262         // data is already written again in the WAL. So a failed flush should be fine.
1263         LOG.warn(getRegionInfo().getEncodedName() + " : was not able to flush "
1264             + "event to WAL, continuing", ioe);
1265       } finally {
1266         status.cleanup();
1267       }
1268     }
1269
1270     this.recovering = newState;
1271     if (wasRecovering && !recovering) {
1272       // Call only when wal replay is over.
1273       coprocessorHost.postLogReplay();
1274     }
1275   }
1276
1277   @Override
1278   public boolean isRecovering() {
1279     return this.recovering;
1280   }
1281
1282   @Override
1283   public boolean isAvailable() {
1284     return !isClosed() && !isClosing();
1285   }
1286
1287   /** @return true if region is splittable */
1288   public boolean isSplittable() {
1289     return isAvailable() && !hasReferences();
1290   }
1291
1292   /**
1293    * @return true if region is mergeable
1294    */
1295   public boolean isMergeable() {
1296     if (!isAvailable()) {
1297       LOG.debug("Region " + getRegionInfo().getRegionNameAsString()
1298           + " is not mergeable because it is closing or closed");
1299       return false;
1300     }
1301     if (hasReferences()) {
1302       LOG.debug("Region " + getRegionInfo().getRegionNameAsString()
1303           + " is not mergeable because it has references");
1304       return false;
1305     }
1306
1307     return true;
1308   }
1309
1310   public boolean areWritesEnabled() {
1311     synchronized(this.writestate) {
1312       return this.writestate.writesEnabled;
1313     }
1314   }
1315
1316   @VisibleForTesting
1317   public MultiVersionConcurrencyControl getMVCC() {
1318     return mvcc;
1319   }
1320
1321   @Override
1322   public long getMaxFlushedSeqId() {
1323     return maxFlushedSeqId;
1324   }
1325
1326   @Override
1327   public long getReadPoint(IsolationLevel isolationLevel) {
1328     if (isolationLevel != null && isolationLevel == IsolationLevel.READ_UNCOMMITTED) {
1329       // This scan can read even uncommitted transactions
1330       return Long.MAX_VALUE;
1331     }
1332     return mvcc.getReadPoint();
1333   }
1334
1335   @Override
1336   public long getReadpoint(IsolationLevel isolationLevel) {
1337     return getReadPoint(isolationLevel);
1338   }
1339
1340   @Override
1341   public boolean isLoadingCfsOnDemandDefault() {
1342     return this.isLoadingCfsOnDemandDefault;
1343   }
1344
1345   /**
1346    * Close down this HRegion.  Flush the cache, shut down each HStore, don't
1347    * service any more calls.
1348    *
1349    * <p>This method could take some time to execute, so don't call it from a
1350    * time-sensitive thread.
1351    *
1352    * @return Map of all the storage files that the HRegion's component
1353    * HStores make use of, keyed by column family.  The values are lists of HStoreFile objects.
1354    * Returns an empty map if already closed and null if it is judged that the region should not close.
1355    *
1356    * @throws IOException e
1357    * @throws DroppedSnapshotException Thrown when replay of wal is required
1358    * because a Snapshot was not properly persisted. The region is put in closing mode, and the
1359    * caller MUST abort after this.
1360    */
1361   public Map<byte[], List<StoreFile>> close() throws IOException {
1362     return close(false);
1363   }
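       // Hedged usage sketch (the 'region' and 'regionServerServices' names are illustrative,
       // not from this file): per the Javadoc above, DroppedSnapshotException means WAL replay
       // is required, so the caller must abort, e.g.
       //   try {
       //     Map<byte[], List<StoreFile>> storeFiles = region.close();
       //   } catch (DroppedSnapshotException dse) {
       //     regionServerServices.abort("Replay of WAL required. Forcing server shutdown", dse);
       //   } // other IOExceptions still propagate to the caller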
1364
1365   private final Object closeLock = new Object();
1366
1367   /** Conf key for the periodic flush interval */
1368   public static final String MEMSTORE_PERIODIC_FLUSH_INTERVAL =
1369       "hbase.regionserver.optionalcacheflushinterval";
1370   /** Default interval for the memstore flush */
1371   public static final int DEFAULT_CACHE_FLUSH_INTERVAL = 3600000;
1372   /** Default interval for System tables memstore flush */
1373   public static final int SYSTEM_CACHE_FLUSH_INTERVAL = 300000; // 5 minutes
1374
1375   /** Conf key to force a flush if there are already enough changes for one region in memstore */
1376   public static final String MEMSTORE_FLUSH_PER_CHANGES =
1377       "hbase.regionserver.flush.per.changes";
1378   public static final long DEFAULT_FLUSH_PER_CHANGES = 30000000; // 30 million
1379   /**
1380    * The following MAX_FLUSH_PER_CHANGES is large enough because each KeyValue has 20+ bytes of
1381    * overhead. Therefore, even 1G (one billion) empty KVs would occupy at least 20GB of memstore for a single region.
1382    */
1383   public static final long MAX_FLUSH_PER_CHANGES = 1000000000; // 1G
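       // Illustrative tuning sketch (a minimal example, not code from this class; the values
       // shown are simply the defaults declared above):
       //   conf.setInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL, DEFAULT_CACHE_FLUSH_INTERVAL);  // 1 hour
       //   conf.setLong(MEMSTORE_FLUSH_PER_CHANGES, DEFAULT_FLUSH_PER_CHANGES);          // 30 million edits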
1384
1385   /**
1386    * Close down this HRegion.  Flush the cache unless the abort parameter is true,
1387    * shut down each HStore, and don't service any more calls.
1388    *
1389    * This method could take some time to execute, so don't call it from a
1390    * time-sensitive thread.
1391    *
1392    * @param abort true if server is aborting (only during testing)
1393    * @return Map of all the storage files that the HRegion's component
1394    * HStores make use of, keyed by column family.  The values are lists of HStoreFile objects.
1395    * Can be null if we are not to close at this time or we are already closed.
1396    *
1397    * @throws IOException e
1398    * @throws DroppedSnapshotException Thrown when replay of wal is required
1399    * because a Snapshot was not properly persisted. The region is put in closing mode, and the
1400    * caller MUST abort after this.
1401    */
1402   public Map<byte[], List<StoreFile>> close(final boolean abort) throws IOException {
1403     // Only allow one thread to close at a time. Serialize them so dual
1404     // threads attempting to close will run up against each other.
1405     MonitoredTask status = TaskMonitor.get().createStatus(
1406         "Closing region " + this +
1407         (abort ? " due to abort" : ""));
1408
1409     status.setStatus("Waiting for close lock");
1410     try {
1411       synchronized (closeLock) {
1412         return doClose(abort, status);
1413       }
1414     } finally {
1415       status.cleanup();
1416     }
1417   }
1418
1419   /**
1420    * Exposed for some very specific unit tests.
1421    */
1422   @VisibleForTesting
1423   public void setClosing(boolean closing) {
1424     this.closing.set(closing);
1425   }
1426
1427   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UL_UNRELEASED_LOCK_EXCEPTION_PATH",
1428       justification="I think FindBugs is confused")
1429   private Map<byte[], List<StoreFile>> doClose(final boolean abort, MonitoredTask status)
1430       throws IOException {
1431     if (isClosed()) {
1432       LOG.warn("Region " + this + " already closed");
1433       return null;
1434     }
1435
1436     if (coprocessorHost != null) {
1437       status.setStatus("Running coprocessor pre-close hooks");
1438       this.coprocessorHost.preClose(abort);
1439     }
1440
1441     status.setStatus("Disabling compacts and flushes for region");
1442     boolean canFlush = true;
1443     synchronized (writestate) {
1444       // Disable compacting and flushing by background threads for this
1445       // region.
1446       canFlush = !writestate.readOnly;
1447       writestate.writesEnabled = false;
1448       LOG.debug("Closing " + this + ": disabling compactions & flushes");
1449       waitForFlushesAndCompactions();
1450     }
1451     // If we were not just flushing, is it worth doing a preflush...one
1452     // that will clear out the bulk of the memstore before we put up
1453     // the close flag?
1454     if (!abort && worthPreFlushing() && canFlush) {
1455       status.setStatus("Pre-flushing region before close");
1456       LOG.info("Running close preflush of " + getRegionInfo().getRegionNameAsString());
1457       try {
1458         internalFlushcache(status);
1459       } catch (IOException ioe) {
1460         // Failed to flush the region. Keep going.
1461         status.setStatus("Failed pre-flush " + this + "; " + ioe.getMessage());
1462       }
1463     }
1464
1465     // block waiting for the lock for closing
1466     lock.writeLock().lock(); // FindBugs: Complains UL_UNRELEASED_LOCK_EXCEPTION_PATH but seems fine
1467     this.closing.set(true);
1468     status.setStatus("Disabling writes for close");
1469     try {
1470       if (this.isClosed()) {
1471         status.abort("Already got closed by another process");
1472         // SplitTransaction handles the null
1473         return null;
1474       }
1475       LOG.debug("Updates disabled for region " + this);
1476       // Don't flush the cache if we are aborting
1477       if (!abort && canFlush) {
1478         int failedFlushCount = 0;
1479         int flushCount = 0;
1480         long tmp = 0;
1481         long remainingSize = this.memstoreSize.get();
1482         while (remainingSize > 0) {
1483           try {
1484             internalFlushcache(status);
1485             if (flushCount > 0) {
1486               LOG.info("Running extra flush, " + flushCount +
1487                   " (carrying snapshot?) " + this);
1488             }
1489             flushCount++;
1490             tmp = this.memstoreSize.get();
1491             if (tmp >= remainingSize) {
1492               failedFlushCount++;
1493             }
1494             remainingSize = tmp;
1495             if (failedFlushCount > 5) {
1496               // If we failed 5 times and are unable to clear memory, abort
1497               // so we do not lose data
1498               throw new DroppedSnapshotException("Failed clearing memory after " +
1499                   flushCount + " attempts on region: " +
1500                   Bytes.toStringBinary(getRegionInfo().getRegionName()));
1501             }
1502           } catch (IOException ioe) {
1503             status.setStatus("Failed flush " + this + ", putting online again");
1504             synchronized (writestate) {
1505               writestate.writesEnabled = true;
1506             }
1507             // Have to throw to upper layers.  I can't abort server from here.
1508             throw ioe;
1509           }
1510         }
1511       }
1512
1513       Map<byte[], List<StoreFile>> result =
1514         new TreeMap<byte[], List<StoreFile>>(Bytes.BYTES_COMPARATOR);
1515       if (!stores.isEmpty()) {
1516         // initialize the thread pool for closing stores in parallel.
1517         ThreadPoolExecutor storeCloserThreadPool =
1518           getStoreOpenAndCloseThreadPool("StoreCloserThread-" +
1519             getRegionInfo().getRegionNameAsString());
1520         CompletionService<Pair<byte[], Collection<StoreFile>>> completionService =
1521           new ExecutorCompletionService<Pair<byte[], Collection<StoreFile>>>(storeCloserThreadPool);
1522
1523         // close each store in parallel
1524         for (final Store store : stores.values()) {
1525           long flushableSize = store.getFlushableSize();
1526           if (!(abort || flushableSize == 0 || writestate.readOnly)) {
1527             if (getRegionServerServices() != null) {
1528               getRegionServerServices().abort("Assertion failed while closing store "
1529                 + getRegionInfo().getRegionNameAsString() + " " + store
1530                 + ". flushableSize expected=0, actual= " + flushableSize
1531                 + ". Current memstoreSize=" + getMemstoreSize() + ". Maybe a coprocessor "
1532                 + "operation failed and left the memstore in a partially updated state.", null);
1533             }
1534           }
1535           completionService
1536               .submit(new Callable<Pair<byte[], Collection<StoreFile>>>() {
1537                 @Override
1538                 public Pair<byte[], Collection<StoreFile>> call() throws IOException {
1539                   return new Pair<byte[], Collection<StoreFile>>(
1540                     store.getFamily().getName(), store.close());
1541                 }
1542               });
1543         }
1544         try {
1545           for (int i = 0; i < stores.size(); i++) {
1546             Future<Pair<byte[], Collection<StoreFile>>> future = completionService.take();
1547             Pair<byte[], Collection<StoreFile>> storeFiles = future.get();
1548             List<StoreFile> familyFiles = result.get(storeFiles.getFirst());
1549             if (familyFiles == null) {
1550               familyFiles = new ArrayList<StoreFile>();
1551               result.put(storeFiles.getFirst(), familyFiles);
1552             }
1553             familyFiles.addAll(storeFiles.getSecond());
1554           }
1555         } catch (InterruptedException e) {
1556           throw (InterruptedIOException)new InterruptedIOException().initCause(e);
1557         } catch (ExecutionException e) {
1558           throw new IOException(e.getCause());
1559         } finally {
1560           storeCloserThreadPool.shutdownNow();
1561         }
1562       }
1563
1564       status.setStatus("Writing region close event to WAL");
1565       if (!abort && wal != null && getRegionServerServices() != null && !writestate.readOnly) {
1566         writeRegionCloseMarker(wal);
1567       }
1568
1569       this.closed.set(true);
1570       if (!canFlush) {
1571         addAndGetGlobalMemstoreSize(-memstoreSize.get());
1572       } else if (memstoreSize.get() != 0) {
1573         LOG.error("Memstore size is " + memstoreSize.get());
1574       }
1575       if (coprocessorHost != null) {
1576         status.setStatus("Running coprocessor post-close hooks");
1577         this.coprocessorHost.postClose(abort);
1578       }
1579       if (this.metricsRegion != null) {
1580         this.metricsRegion.close();
1581       }
1582       if (this.metricsRegionWrapper != null) {
1583         Closeables.closeQuietly(this.metricsRegionWrapper);
1584       }
1585       // stop the Compacted hfile discharger
1586       if (this.compactedFileDischarger != null) this.compactedFileDischarger.cancel(true);
1587
1588       status.markComplete("Closed");
1589       LOG.info("Closed " + this);
1590       return result;
1591     } finally {
1592       lock.writeLock().unlock();
1593     }
1594   }
1595
1596   @Override
1597   public void waitForFlushesAndCompactions() {
1598     synchronized (writestate) {
1599       if (this.writestate.readOnly) {
1600         // we should not wait for replayed flushes if we are read only (for example in case the
1601         // region is a secondary replica).
1602         return;
1603       }
1604       boolean interrupted = false;
1605       try {
1606         while (writestate.compacting.get() > 0 || writestate.flushing) {
1607           LOG.debug("waiting for " + writestate.compacting + " compactions"
1608             + (writestate.flushing ? " & cache flush" : "") + " to complete for region " + this);
1609           try {
1610             writestate.wait();
1611           } catch (InterruptedException iex) {
1612             // essentially ignore and propagate the interrupt back up
1613             LOG.warn("Interrupted while waiting");
1614             interrupted = true;
1615           }
1616         }
1617       } finally {
1618         if (interrupted) {
1619           Thread.currentThread().interrupt();
1620         }
1621       }
1622     }
1623   }
1624
1625   protected ThreadPoolExecutor getStoreOpenAndCloseThreadPool(
1626       final String threadNamePrefix) {
1627     int numStores = Math.max(1, this.htableDescriptor.getFamilies().size());
1628     int maxThreads = Math.min(numStores,
1629         conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1630             HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX));
1631     return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1632   }
1633
1634   protected ThreadPoolExecutor getStoreFileOpenAndCloseThreadPool(
1635       final String threadNamePrefix) {
1636     int numStores = Math.max(1, this.htableDescriptor.getFamilies().size());
1637     int maxThreads = Math.max(1,
1638         conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1639             HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX)
1640             / numStores);
1641     return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1642   }
1643
1644   static ThreadPoolExecutor getOpenAndCloseThreadPool(int maxThreads,
1645       final String threadNamePrefix) {
1646     return Threads.getBoundedCachedThreadPool(maxThreads, 30L, TimeUnit.SECONDS,
1647       new ThreadFactory() {
1648         private int count = 1;
1649
1650         @Override
1651         public Thread newThread(Runnable r) {
1652           return new Thread(r, threadNamePrefix + "-" + count++);
1653         }
1654       });
1655   }
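       // Sketch: store open/close parallelism is bounded by the
       // HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX setting read above; an operator could,
       // for example, raise it via plain Configuration usage (illustrative only):
       //   conf.setInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX, 8);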
1656
1657   /**
1658    * @return True if it's worth doing a flush before we put up the close flag.
1659    */
1660   private boolean worthPreFlushing() {
1661     return this.memstoreSize.get() >
1662       this.conf.getLong("hbase.hregion.preclose.flush.size", 1024 * 1024 * 5);
1663   }
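       // Sketch: the pre-close flush threshold defaults to 5 MB (see the getLong call above);
       // raising it is an ordinary Configuration call, for example (illustrative only):
       //   conf.setLong("hbase.hregion.preclose.flush.size", 64L * 1024 * 1024);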
1664
1665   //////////////////////////////////////////////////////////////////////////////
1666   // HRegion accessors
1667   //////////////////////////////////////////////////////////////////////////////
1668
1669   @Override
1670   public HTableDescriptor getTableDesc() {
1671     return this.htableDescriptor;
1672   }
1673
1674   /** @return WAL in use for this region */
1675   public WAL getWAL() {
1676     return this.wal;
1677   }
1678
1679   /**
1680    * @return split policy for this region.
1681    */
1682   public RegionSplitPolicy getSplitPolicy() {
1683     return this.splitPolicy;
1684   }
1685
1686   /**
1687    * A split takes the config from the parent region & passes it to the daughter
1688    * region's constructor. If 'conf' was passed, you would end up using the HTD
1689    * of the parent region in addition to the new daughter HTD. Pass 'baseConf'
1690    * to the daughter regions to avoid this tricky dedupe problem.
1691    * @return Configuration object
1692    */
1693   Configuration getBaseConf() {
1694     return this.baseConf;
1695   }
1696
1697   /** @return {@link FileSystem} being used by this region */
1698   public FileSystem getFilesystem() {
1699     return fs.getFileSystem();
1700   }
1701
1702   /** @return the {@link HRegionFileSystem} used by this region */
1703   public HRegionFileSystem getRegionFileSystem() {
1704     return this.fs;
1705   }
1706
1707   @Override
1708   public long getEarliestFlushTimeForAllStores() {
1709     return Collections.min(lastStoreFlushTimeMap.values());
1710   }
1711
1712   @Override
1713   public long getOldestHfileTs(boolean majorCompactionOnly) throws IOException {
1714     long result = Long.MAX_VALUE;
1715     for (Store store : getStores()) {
1716       Collection<StoreFile> storeFiles = store.getStorefiles();
1717       if (storeFiles == null) continue;
1718       for (StoreFile file : storeFiles) {
1719         StoreFileReader sfReader = file.getReader();
1720         if (sfReader == null) continue;
1721         HFile.Reader reader = sfReader.getHFileReader();
1722         if (reader == null) continue;
1723         if (majorCompactionOnly) {
1724           byte[] val = reader.loadFileInfo().get(StoreFile.MAJOR_COMPACTION_KEY);
1725           if (val == null) continue;
1726           if (!Bytes.toBoolean(val)) {
1727             continue;
1728           }
1729         }
1730         result = Math.min(result, reader.getFileContext().getFileCreateTime());
1731       }
1732     }
1733     return result == Long.MAX_VALUE ? 0 : result;
1734   }
1735
1736   RegionLoad.Builder setCompleteSequenceId(RegionLoad.Builder regionLoadBldr) {
1737     long lastFlushOpSeqIdLocal = this.lastFlushOpSeqId;
1738     byte[] encodedRegionName = this.getRegionInfo().getEncodedNameAsBytes();
1739     regionLoadBldr.clearStoreCompleteSequenceId();
1740     for (byte[] familyName : this.stores.keySet()) {
1741       long earliest = this.wal.getEarliestMemstoreSeqNum(encodedRegionName, familyName);
1742       // Subtract 1 to go earlier than the current oldest, unflushed edit in memstore; this will
1743       // give us a sequence id that is for sure flushed. We want edit replay to start after this
1744       // sequence id in this region. If NO_SEQNUM, use the region's maximum flush id.
1745       long csid = (earliest == HConstants.NO_SEQNUM)? lastFlushOpSeqIdLocal: earliest - 1;
1746       regionLoadBldr.addStoreCompleteSequenceId(StoreSequenceId.
1747         newBuilder().setFamilyName(ByteString.copyFrom(familyName)).setSequenceId(csid).build());
1748     }
1749     return regionLoadBldr.setCompleteSequenceId(getMaxFlushedSeqId());
1750   }
1751
1752   //////////////////////////////////////////////////////////////////////////////
1753   // HRegion maintenance.
1754   //
1755   // These methods are meant to be called periodically by the HRegionServer for
1756   // upkeep.
1757   //////////////////////////////////////////////////////////////////////////////
1758
1759   /** @return returns size of largest HStore. */
1760   public long getLargestHStoreSize() {
1761     long size = 0;
1762     for (Store h : stores.values()) {
1763       long storeSize = h.getSize();
1764       if (storeSize > size) {
1765         size = storeSize;
1766       }
1767     }
1768     return size;
1769   }
1770
1771   /*
1772    * Do preparation for pending compaction.
1773    * @throws IOException
1774    */
1775   protected void doRegionCompactionPrep() throws IOException {
1776   }
1777
1778   @Override
1779   public void triggerMajorCompaction() throws IOException {
1780     for (Store s : getStores()) {
1781       s.triggerMajorCompaction();
1782     }
1783   }
1784
1785   @Override
1786   public void compact(final boolean majorCompaction) throws IOException {
1787     if (majorCompaction) {
1788       triggerMajorCompaction();
1789     }
1790     for (Store s : getStores()) {
1791       CompactionContext compaction = s.requestCompaction();
1792       if (compaction != null) {
1793         ThroughputController controller = null;
1794         if (rsServices != null) {
1795           controller = CompactionThroughputControllerFactory.create(rsServices, conf);
1796         }
1797         if (controller == null) {
1798           controller = NoLimitThroughputController.INSTANCE;
1799         }
1800         compact(compaction, s, controller, null);
1801       }
1802     }
1803   }
1804
1805   /**
1806    * This is a helper function that compacts all the stores synchronously.
1807    * It is used by utilities and testing.
1808    *
1809    * @throws IOException e
1810    */
1811   public void compactStores() throws IOException {
1812     for (Store s : getStores()) {
1813       CompactionContext compaction = s.requestCompaction();
1814       if (compaction != null) {
1815         compact(compaction, s, NoLimitThroughputController.INSTANCE, null);
1816       }
1817     }
1818   }
1819
1820   /**
1821    * This is a helper function that compacts the given store.
1822    * It is used by utilities and testing.
1823    *
1824    * @throws IOException e
1825    */
1826   @VisibleForTesting
1827   void compactStore(byte[] family, ThroughputController throughputController)
1828       throws IOException {
1829     Store s = getStore(family);
1830     CompactionContext compaction = s.requestCompaction();
1831     if (compaction != null) {
1832       compact(compaction, s, throughputController, null);
1833     }
1834   }
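       // Hedged test-only usage sketch ("cf" is a hypothetical column family name):
       //   region.compactStore(Bytes.toBytes("cf"), NoLimitThroughputController.INSTANCE);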
1835
1836   /*
1837    * Called by compaction thread and after region is opened to compact the
1838    * HStores if necessary.
1839    *
1840    * <p>This operation could block for a long time, so don't call it from a
1841    * time-sensitive thread.
1842    *
1843    * Note that no locking is necessary at this level because compaction only
1844    * conflicts with a region split, and that cannot happen because the region
1845    * server does them sequentially and not in parallel.
1846    *
1847    * @param compaction Compaction details, obtained by requestCompaction()
1848    * @param throughputController
1849    * @return whether the compaction completed
1850    */
1851   public boolean compact(CompactionContext compaction, Store store,
1852       ThroughputController throughputController) throws IOException {
1853     return compact(compaction, store, throughputController, null);
1854   }
1855
1856   public boolean compact(CompactionContext compaction, Store store,
1857       ThroughputController throughputController, User user) throws IOException {
1858     assert compaction != null && compaction.hasSelection();
1859     assert !compaction.getRequest().getFiles().isEmpty();
1860     if (this.closing.get() || this.closed.get()) {
1861       LOG.debug("Skipping compaction on " + this + " because closing/closed");
1862       store.cancelRequestedCompaction(compaction);
1863       return false;
1864     }
1865     MonitoredTask status = null;
1866     boolean requestNeedsCancellation = true;
1867     /*
1868      * We are trying to remove / relax the region read lock for compaction.
1869      * Let's see what are the potential race conditions among the operations (user scan,
1870      * region split, region close and region bulk load).
1871      *
1872      *  user scan ---> region read lock
1873      *  region split --> region close first --> region write lock
1874      *  region close --> region write lock
1875      *  region bulk load --> region write lock
1876      *
1877      * read lock is compatible with read lock. ---> no problem with user scan/read
1878      * region bulk load does not cause problem for compaction (no consistency problem, store lock
1879      *  will help the store file accounting).
1880      * They can run almost concurrently at the region level.
1881      *
1882      * The only remaining race condition is between the region close and compaction.
1883      * So we will evaluate, below, how region close interferes with compaction if compaction does
1884      * not acquire region read lock.
1885      *
1886      * Here are the steps for compaction:
1887      * 1. obtain list of StoreFile's
1888      * 2. create StoreFileScanner's based on list from #1
1889      * 3. perform compaction and save resulting files under tmp dir
1890      * 4. swap in compacted files
1891      *
1892      * #1 is guarded by store lock. This patch does not change this --> no worse or better
1893      * For #2, we obtain smallest read point (for region) across all the Scanners (for both default
1894      * compactor and stripe compactor).
1895      * The read points are for user scans. Region keeps the read points for all currently open
1896      * user scanners.
1897      * Compaction needs to know the smallest read point so that during re-write of the hfiles,
1898      * it can remove the mvcc points for the cells if their mvccs are older than the smallest
1899      * since they are not needed anymore.
1900      * This will not conflict with compaction.
1901      * For #3, it can be performed in parallel to other operations.
1902      * For #4 bulk load and compaction don't conflict with each other on the region level
1903      *   (for multi-family atomicity).
1904      * Region close and compaction are guarded pretty well by the 'writestate'.
1905      * In HRegion#doClose(), we have :
1906      * synchronized (writestate) {
1907      *   // Disable compacting and flushing by background threads for this
1908      *   // region.
1909      *   canFlush = !writestate.readOnly;
1910      *   writestate.writesEnabled = false;
1911      *   LOG.debug("Closing " + this + ": disabling compactions & flushes");
1912      *   waitForFlushesAndCompactions();
1913      * }
1914      * waitForFlushesAndCompactions() would wait for writestate.compacting to come down to 0.
1915      * and in HRegion.compact()
1916      *  try {
1917      *    synchronized (writestate) {
1918      *    if (writestate.writesEnabled) {
1919      *      wasStateSet = true;
1920      *      ++writestate.compacting;
1921      *    } else {
1922      *      String msg = "NOT compacting region " + this + ". Writes disabled.";
1923      *      LOG.info(msg);
1924      *      status.abort(msg);
1925      *      return false;
1926      *    }
1927      *  }
1928      * Also in compactor.performCompaction():
1929      * check periodically to see if a system stop is requested
1930      * if (closeCheckInterval > 0) {
1931      *   bytesWritten += len;
1932      *   if (bytesWritten > closeCheckInterval) {
1933      *     bytesWritten = 0;
1934      *     if (!store.areWritesEnabled()) {
1935      *       progress.cancel();
1936      *       return false;
1937      *     }
1938      *   }
1939      * }
1940      */
1941     try {
1942       byte[] cf = Bytes.toBytes(store.getColumnFamilyName());
1943       if (stores.get(cf) != store) {
1944         LOG.warn("Store " + store.getColumnFamilyName() + " on region " + this
1945             + " has been re-instantiated, cancel this compaction request. "
1946             + " It may be caused by the roll back of split transaction");
1947         return false;
1948       }
1949
1950       status = TaskMonitor.get().createStatus("Compacting " + store + " in " + this);
1951       if (this.closed.get()) {
1952         String msg = "Skipping compaction on " + this + " because closed";
1953         LOG.debug(msg);
1954         status.abort(msg);
1955         return false;
1956       }
1957       boolean wasStateSet = false;
1958       try {
1959         synchronized (writestate) {
1960           if (writestate.writesEnabled) {
1961             wasStateSet = true;
1962             writestate.compacting.incrementAndGet();
1963           } else {
1964             String msg = "NOT compacting region " + this + ". Writes disabled.";
1965             LOG.info(msg);
1966             status.abort(msg);
1967             return false;
1968           }
1969         }
1970         LOG.info("Starting compaction on " + store + " in region " + this
1971             + (compaction.getRequest().isOffPeak()?" as an off-peak compaction":""));
1972         doRegionCompactionPrep();
1973         try {
1974           status.setStatus("Compacting store " + store);
1975           // We no longer need to cancel the request on the way out of this
1976           // method because Store#compact will clean up unconditionally
1977           requestNeedsCancellation = false;
1978           store.compact(compaction, throughputController, user);
1979         } catch (InterruptedIOException iioe) {
1980           String msg = "compaction interrupted";
1981           LOG.info(msg, iioe);
1982           status.abort(msg);
1983           return false;
1984         }
1985       } finally {
1986         if (wasStateSet) {
1987           synchronized (writestate) {
1988             writestate.compacting.decrementAndGet();
1989             if (writestate.compacting.get() <= 0) {
1990               writestate.notifyAll();
1991             }
1992           }
1993         }
1994       }
1995       status.markComplete("Compaction complete");
1996       return true;
1997     } finally {
1998       if (requestNeedsCancellation) store.cancelRequestedCompaction(compaction);
1999       if (status != null) status.cleanup();
2000     }
2001   }
2002
2003   @Override
2004   public FlushResult flush(boolean force) throws IOException {
2005     return flushcache(force, false);
2006   }
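       // Usage note (sketch): flush(true) forces every store to flush, while flush(false) defers
       // to the FlushPolicy (flushPolicy.selectStoresToFlush()) inside flushcache below, which may
       // select only a subset of stores.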
2007
2008   /**
2009    * Flush the cache.
2010    *
2011    * When this method is called the cache will be flushed unless:
2012    * <ol>
2013    *   <li>the cache is empty</li>
2014    *   <li>the region is closed.</li>
2015    *   <li>a flush is already in progress</li>
2016    *   <li>writes are disabled</li>
2017    * </ol>
2018    *
2019    * <p>This method may block for some time, so it should not be called from a
2020    * time-sensitive thread.
2021    * @param forceFlushAllStores whether we want to flush all stores
2022    * @param writeFlushRequestWalMarker whether to write the flush request marker to WAL
2023    * @return whether the flush succeeded and whether the region needs compacting
2024    *
2025    * @throws IOException general io exceptions
2026    * @throws DroppedSnapshotException Thrown when replay of wal is required
2027    * because a Snapshot was not properly persisted. The region is put in closing mode, and the
2028    * caller MUST abort after this.
2029    */
2030   public FlushResult flushcache(boolean forceFlushAllStores, boolean writeFlushRequestWalMarker)
2031       throws IOException {
2032     // fail-fast instead of waiting on the lock
2033     if (this.closing.get()) {
2034       String msg = "Skipping flush on " + this + " because closing";
2035       LOG.debug(msg);
2036       return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
2037     }
2038     MonitoredTask status = TaskMonitor.get().createStatus("Flushing " + this);
2039     status.setStatus("Acquiring readlock on region");
2040     // block waiting for the lock for flushing cache
2041     lock.readLock().lock();
2042     try {
2043       if (this.closed.get()) {
2044         String msg = "Skipping flush on " + this + " because closed";
2045         LOG.debug(msg);
2046         status.abort(msg);
2047         return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
2048       }
2049       if (coprocessorHost != null) {
2050         status.setStatus("Running coprocessor pre-flush hooks");
2051         coprocessorHost.preFlush();
2052       }
2053       // TODO: this should be managed within memstore with the snapshot, updated only after flush
2054       // successful
2055       if (numMutationsWithoutWAL.get() > 0) {
2056         numMutationsWithoutWAL.set(0);
2057         dataInMemoryWithoutWAL.set(0);
2058       }
2059       synchronized (writestate) {
2060         if (!writestate.flushing && writestate.writesEnabled) {
2061           this.writestate.flushing = true;
2062         } else {
2063           if (LOG.isDebugEnabled()) {
2064             LOG.debug("NOT flushing memstore for region " + this
2065                 + ", flushing=" + writestate.flushing + ", writesEnabled="
2066                 + writestate.writesEnabled);
2067           }
2068           String msg = "Not flushing since "
2069               + (writestate.flushing ? "already flushing"
2070               : "writes not enabled");
2071           status.abort(msg);
2072           return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
2073         }
2074       }
2075
2076       try {
2077         Collection<Store> specificStoresToFlush =
2078             forceFlushAllStores ? stores.values() : flushPolicy.selectStoresToFlush();
2079         FlushResult fs = internalFlushcache(specificStoresToFlush,
2080           status, writeFlushRequestWalMarker);
2081
2082         if (coprocessorHost != null) {
2083           status.setStatus("Running post-flush coprocessor hooks");
2084           coprocessorHost.postFlush();
2085         }
2086
2087         status.markComplete("Flush successful");
2088         return fs;
2089       } finally {
2090         synchronized (writestate) {
2091           writestate.flushing = false;
2092           this.writestate.flushRequested = false;
2093           writestate.notifyAll();
2094         }
2095       }
2096     } finally {
2097       lock.readLock().unlock();
2098       status.cleanup();
2099     }
2100   }
2101
2102   /**
2103    * Should the store be flushed because it is old enough.
2104    * <p>
2105    * Every FlushPolicy should call this to determine whether a store is old enough to flush
2106    * (except policies that always flush all stores). Otherwise the policy would effectively
2107    * always return true and generate a lot of flush requests.
2108    */
2109   boolean shouldFlushStore(Store store) {
2110     long earliest = this.wal.getEarliestMemstoreSeqNum(getRegionInfo().getEncodedNameAsBytes(),
2111       store.getFamily().getName()) - 1;
2112     if (earliest > 0 && earliest + flushPerChanges < mvcc.getReadPoint()) {
2113       if (LOG.isDebugEnabled()) {
2114         LOG.debug("Flush column family " + store.getColumnFamilyName() + " of " +
2115           getRegionInfo().getEncodedName() + " because unflushed sequenceid=" + earliest +
2116           " is more than " + this.flushPerChanges + " behind current=" + mvcc.getReadPoint());
2117       }
2118       return true;
2119     }
2120     if (this.flushCheckInterval <= 0) {
2121       return false;
2122     }
2123     long now = EnvironmentEdgeManager.currentTime();
2124     if (store.timeOfOldestEdit() < now - this.flushCheckInterval) {
2125       if (LOG.isDebugEnabled()) {
2126         LOG.debug("Flush column family: " + store.getColumnFamilyName() + " of " +
2127           getRegionInfo().getEncodedName() + " because time of oldest edit=" +
2128             store.timeOfOldestEdit() + " is more than " + this.flushCheckInterval + " ms before now=" + now);
2129       }
2130       return true;
2131     }
2132     return false;
2133   }
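       // Sketch of how a FlushPolicy might consult the age check above (hypothetical policy
       // code, not part of this class):
       //   List<Store> picked = new ArrayList<Store>();
       //   for (Store s : region.stores.values()) {
       //     if (region.shouldFlushStore(s)) picked.add(s);
       //   }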
2134
2135   /**
2136    * Should the memstore be flushed now
2137    */
2138   boolean shouldFlush(final StringBuffer whyFlush) {
2139     whyFlush.setLength(0);
2140     // This is a rough measure.
2141     if (this.maxFlushedSeqId > 0
2142           && (this.maxFlushedSeqId + this.flushPerChanges < this.mvcc.getReadPoint())) {
2143       whyFlush.append("more than max edits, " + this.flushPerChanges + ", since last flush");
2144       return true;
2145     }
2146     long modifiedFlushCheckInterval = flushCheckInterval;
2147     if (getRegionInfo().isSystemTable() &&
2148         getRegionInfo().getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
2149       modifiedFlushCheckInterval = SYSTEM_CACHE_FLUSH_INTERVAL;
2150     }
2151     if (modifiedFlushCheckInterval <= 0) { //disabled
2152       return false;
2153     }
2154     long now = EnvironmentEdgeManager.currentTime();
2155     //if we flushed in the recent past, we don't need to do again now
2156     if ((now - getEarliestFlushTimeForAllStores() < modifiedFlushCheckInterval)) {
2157       return false;
2158     }
2159     //since we didn't flush in the recent past, flush now if certain conditions
2160     //are met. Return true on first such memstore hit.
2161     for (Store s : getStores()) {
2162       if (s.timeOfOldestEdit() < now - modifiedFlushCheckInterval) {
2163         // we have an old enough edit in the memstore, flush
2164         whyFlush.append(s.toString() + " has an old edit so flush to free WALs");
2165         return true;
2166       }
2167     }
2168     return false;
2169   }
2170
2171   /**
2172    * Flushing all stores.
2173    *
2174    * @see #internalFlushcache(Collection, MonitoredTask, boolean)
2175    */
2176   private FlushResult internalFlushcache(MonitoredTask status)
2177       throws IOException {
2178     return internalFlushcache(stores.values(), status, false);
2179   }
2180
2181   /**
2182    * Flushing given stores.
2183    *
2184    * @see #internalFlushcache(WAL, long, Collection, MonitoredTask, boolean)
2185    */
2186   private FlushResult internalFlushcache(final Collection<Store> storesToFlush,
2187       MonitoredTask status, boolean writeFlushWalMarker) throws IOException {
2188     return internalFlushcache(this.wal, HConstants.NO_SEQNUM, storesToFlush,
2189         status, writeFlushWalMarker);
2190   }
2191
2192   /**
2193    * Flush the memstore. Flushing the memstore is a little tricky. We have a lot
2194    * of updates in the memstore, all of which have also been written to the wal.
2195    * We need to write those updates in the memstore out to disk, while being
2196    * able to process reads/writes as much as possible during the flush
2197    * operation.
2198    * <p>
2199    * This method may block for some time. Every time you call it, we up the
2200    * region's sequence id even if we don't flush; i.e. the returned sequence id
2201    * will be at least one larger than that of the last edit applied to this region. The
2202    * returned id does not refer to an actual edit. The returned id can be used
2203    * for, say, installing a bulk loaded file just ahead of the last hfile that was
2204    * the result of this flush, etc.
2205    *
2206    * @param wal Null if we're NOT to go via wal.
2207    * @param myseqid The seqid to use if <code>wal</code> is null writing out flush file.
2208    * @param storesToFlush The list of stores to flush.
2209    * @return object describing the flush's state
2210    * @throws IOException general io exceptions
2211    * @throws DroppedSnapshotException Thrown when replay of WAL is required.
2212    */
2213   protected FlushResult internalFlushcache(final WAL wal, final long myseqid,
2214       final Collection<Store> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker)
2215           throws IOException {
2216     PrepareFlushResult result
2217       = internalPrepareFlushCache(wal, myseqid, storesToFlush, status, writeFlushWalMarker);
2218     if (result.result == null) {
2219       return internalFlushCacheAndCommit(wal, status, result, storesToFlush);
2220     } else {
2221       return result.result; // early exit due to failure from prepare stage
2222     }
2223   }
2224
2225   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="DLS_DEAD_LOCAL_STORE",
2226       justification="FindBugs seems confused about trxId")
2227   protected PrepareFlushResult internalPrepareFlushCache(final WAL wal, final long myseqid,
2228       final Collection<Store> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker)
2229   throws IOException {
2230     if (this.rsServices != null && this.rsServices.isAborted()) {
2231       // Don't flush when server aborting, it's unsafe
2232       throw new IOException("Aborting flush because server is aborted...");
2233     }
2234     final long startTime = EnvironmentEdgeManager.currentTime();
2235     // If nothing to flush, return, but return with a valid unused sequenceId.
2236     // It's needed by bulk upload, IIRC. It flushes until there are no edits in memory so it can insert a
2237     // bulk loaded file between memory and existing hfiles. It wants a good sequenceId that belongs
2238     // to no other edit, which it can use to associate with the bulk load. Hence this little dance below
2239     // to go get one.
2240     if (this.memstoreSize.get() <= 0) {
2241       // Take an update lock so no edits can come into memory just yet.
2242       this.updatesLock.writeLock().lock();
2243       WriteEntry writeEntry = null;
2244       try {
2245         if (this.memstoreSize.get() <= 0) {
2246           // Presume that if there are still no edits in the memstore, then there are no edits for
2247           // this region out in the WAL subsystem so no need to do any trickery clearing out
2248           // edits in the WAL sub-system. Up the sequence number so the resulting flush id is for
2249           // sure just beyond the last appended region edit and not associated with any edit
2250           // (useful as marker when bulk loading, etc.).
2251           FlushResult flushResult = null;
2252           if (wal != null) {
2253             writeEntry = mvcc.begin();
2254             long flushOpSeqId = writeEntry.getWriteNumber();
2255             flushResult = new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY,
2256               flushOpSeqId, "Nothing to flush",
2257             writeFlushRequestMarkerToWAL(wal, writeFlushWalMarker));
2258             mvcc.completeAndWait(writeEntry);
2259             // Set to null so we don't complete it again down in finally block.
2260             writeEntry = null;
2261             return new PrepareFlushResult(flushResult, myseqid);
2262           } else {
2263             return new PrepareFlushResult(new FlushResultImpl(
2264               FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, "Nothing to flush", false), myseqid);
2265           }
2266         }
2267       } finally {
2268         if (writeEntry != null) {
2269           // If writeEntry is non-null, this operation failed; the mvcc transaction failed...
2270           // but complete it anyways so it doesn't block the mvcc queue.
2271           mvcc.complete(writeEntry);
2272         }
2273         this.updatesLock.writeLock().unlock();
2274       }
2275     }
2276     logFatLineOnFlush(storesToFlush, myseqid);
2277     // Stop updates while we snapshot the memstore of all of this region's stores. We only have
2278     // to do this for a moment.  It is quick. We also set the memstore size to zero here before we
2279     // allow updates again so its value will represent the size of the updates received
2280     // during the flush.
2281
2282     // We have to take an update lock during the snapshot, or else a write could end up in both the
2283     // snapshot and the memstore (which would make atomic rows difficult).
2284     status.setStatus("Obtaining lock to block concurrent updates");
2285     // block waiting for the lock for internal flush
2286     this.updatesLock.writeLock().lock();
2287     status.setStatus("Preparing flush snapshotting stores in " + getRegionInfo().getEncodedName());
2288     long totalFlushableSizeOfFlushableStores = 0;
2289
2290     Set<byte[]> flushedFamilyNames = new HashSet<byte[]>();
2291     for (Store store: storesToFlush) {
2292       flushedFamilyNames.add(store.getFamily().getName());
2293     }
2294
2295     TreeMap<byte[], StoreFlushContext> storeFlushCtxs
2296       = new TreeMap<byte[], StoreFlushContext>(Bytes.BYTES_COMPARATOR);
2297     TreeMap<byte[], List<Path>> committedFiles = new TreeMap<byte[], List<Path>>(
2298         Bytes.BYTES_COMPARATOR);
2299     TreeMap<byte[], Long> storeFlushableSize
2300         = new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
2301     // The sequence id of this flush operation which is used to log FlushMarker and pass to
2302     // createFlushContext to use as the store file's sequence id. It can be in advance of edits
2303     // still in the memstore, edits that are in other column families yet to be flushed.
2304     long flushOpSeqId = HConstants.NO_SEQNUM;
2305     // The max flushed sequence id after this flush operation completes. All edits in memstore
2306     // will be in advance of this sequence id.
2307     long flushedSeqId = HConstants.NO_SEQNUM;
2308     byte[] encodedRegionName = getRegionInfo().getEncodedNameAsBytes();
2309     try {
2310       if (wal != null) {
2311         Long earliestUnflushedSequenceIdForTheRegion =
2312             wal.startCacheFlush(encodedRegionName, flushedFamilyNames);
2313         if (earliestUnflushedSequenceIdForTheRegion == null) {
2314           // This should never happen. This is how startCacheFlush signals flush cannot proceed.
2315           String msg = this.getRegionInfo().getEncodedName() + " flush aborted; WAL closing.";
2316           status.setStatus(msg);
2317           return new PrepareFlushResult(
2318               new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false),
2319               myseqid);
2320         }
2321         flushOpSeqId = getNextSequenceId(wal);
2322         // Back up 1: subtract 1 from the oldest sequence id in memstore to get the last 'flushed' edit
2323         flushedSeqId =
2324             earliestUnflushedSequenceIdForTheRegion.longValue() == HConstants.NO_SEQNUM?
2325                 flushOpSeqId: earliestUnflushedSequenceIdForTheRegion.longValue() - 1;
2326       } else {
2327         // use the provided sequence Id as WAL is not being used for this flush.
2328         flushedSeqId = flushOpSeqId = myseqid;
2329       }
2330
2331       for (Store s : storesToFlush) {
2332         totalFlushableSizeOfFlushableStores += s.getFlushableSize();
2333         storeFlushCtxs.put(s.getFamily().getName(), s.createFlushContext(flushOpSeqId));
2334         committedFiles.put(s.getFamily().getName(), null); // for writing stores to WAL
2335         storeFlushableSize.put(s.getFamily().getName(), s.getFlushableSize());
2336       }
2337
2338       // write the snapshot start to WAL
2339       if (wal != null && !writestate.readOnly) {
2340         FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.START_FLUSH,
2341             getRegionInfo(), flushOpSeqId, committedFiles);
2342         // No sync. Sync is below where no updates lock and we do FlushAction.COMMIT_FLUSH
2343         WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, false,
2344             mvcc);
2345       }
2346
2347       // Prepare flush (take a snapshot)
2348       for (StoreFlushContext flush : storeFlushCtxs.values()) {
2349         flush.prepare();
2350       }
2351     } catch (IOException ex) {
2352       doAbortFlushToWAL(wal, flushOpSeqId, committedFiles);
2353       throw ex;
2354     } finally {
2355       this.updatesLock.writeLock().unlock();
2356     }
2357     String s = "Finished memstore snapshotting " + this + ", syncing WAL and waiting on mvcc, " +
2358         "flushsize=" + totalFlushableSizeOfFlushableStores;
2359     status.setStatus(s);
2360     doSyncOfUnflushedWALChanges(wal, getRegionInfo());
2361     return new PrepareFlushResult(storeFlushCtxs, committedFiles, storeFlushableSize, startTime,
2362         flushOpSeqId, flushedSeqId, totalFlushableSizeOfFlushableStores);
2363   }
2364
2365   /**
2366    * Utility method broken out of internalPrepareFlushCache so that method is smaller.
2367    */
2368   private void logFatLineOnFlush(final Collection<Store> storesToFlush, final long sequenceId) {
2369     if (!LOG.isInfoEnabled()) {
2370       return;
2371     }
2372     // Log a fat line detailing what is being flushed.
2373     StringBuilder perCfExtras = null;
2374     if (!isAllFamilies(storesToFlush)) {
2375       perCfExtras = new StringBuilder();
2376       for (Store store: storesToFlush) {
2377         perCfExtras.append("; ").append(store.getColumnFamilyName());
2378         perCfExtras.append("=").append(StringUtils.byteDesc(store.getFlushableSize()));
2379       }
2380     }
2381     LOG.info("Flushing " + storesToFlush.size() + "/" + stores.size() +
2382         " column families, memstore=" + StringUtils.byteDesc(this.memstoreSize.get()) +
2383         ((perCfExtras != null && perCfExtras.length() > 0)? perCfExtras.toString(): "") +
2384         ((wal != null) ? "" : "; WAL is null, using passed sequenceid=" + sequenceId));
2385   }
2386
2387   private void doAbortFlushToWAL(final WAL wal, final long flushOpSeqId,
2388       final Map<byte[], List<Path>> committedFiles) {
2389     if (wal == null) return;
2390     try {
2391       FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH,
2392           getRegionInfo(), flushOpSeqId, committedFiles);
2393       WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, false,
2394           mvcc);
2395     } catch (Throwable t) {
2396       LOG.warn("Received unexpected exception trying to write ABORT_FLUSH marker to WAL: " +
2397           StringUtils.stringifyException(t));
2398       // ignore this since we will be aborting the RS with DSE.
2399     }
2400     // we have called wal.startCacheFlush(), now we have to abort it
2401     wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2402   }
2403
2404   /**
2405    * Sync unflushed WAL changes. See HBASE-8208 for details
2406    */
2407   private static void doSyncOfUnflushedWALChanges(final WAL wal, final HRegionInfo hri)
2408   throws IOException {
2409     if (wal == null) {
2410       return;
2411     }
2412     try {
2413       wal.sync(); // ensure that flush marker is sync'ed
2414     } catch (IOException ioe) {
2415       wal.abortCacheFlush(hri.getEncodedNameAsBytes());
2416       throw ioe;
2417     }
2418   }
2419
2420   /**
2421    * @return True if passed Set is all families in the region.
2422    */
2423   private boolean isAllFamilies(final Collection<Store> families) {
2424     return families == null || this.stores.size() == families.size();
2425   }
2426
2427   /**
2428    * Writes a marker to WAL indicating a flush is requested but cannot be complete due to various
2429    * reasons. Ignores exceptions from WAL. Returns whether the write succeeded.
2430    * @param wal
2431    * @return whether WAL write was successful
2432    */
2433   private boolean writeFlushRequestMarkerToWAL(WAL wal, boolean writeFlushWalMarker) {
2434     if (writeFlushWalMarker && wal != null && !writestate.readOnly) {
2435       FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.CANNOT_FLUSH,
2436         getRegionInfo(), -1, new TreeMap<byte[], List<Path>>(Bytes.BYTES_COMPARATOR));
2437       try {
2438         WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, true,
2439             mvcc);
2440         return true;
2441       } catch (IOException e) {
2442         LOG.warn(getRegionInfo().getEncodedName() + " : "
2443             + "Received exception while trying to write the flush request to wal", e);
2444       }
2445     }
2446     return false;
2447   }
2448
2449   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY",
2450       justification="Intentional; notify is about completed flush")
2451   protected FlushResult internalFlushCacheAndCommit(
2452         final WAL wal, MonitoredTask status, final PrepareFlushResult prepareResult,
2453         final Collection<Store> storesToFlush)
2454     throws IOException {
2455
2456     // prepare flush context is carried via PrepareFlushResult
2457     TreeMap<byte[], StoreFlushContext> storeFlushCtxs = prepareResult.storeFlushCtxs;
2458     TreeMap<byte[], List<Path>> committedFiles = prepareResult.committedFiles;
2459     long startTime = prepareResult.startTime;
2460     long flushOpSeqId = prepareResult.flushOpSeqId;
2461     long flushedSeqId = prepareResult.flushedSeqId;
2462     long totalFlushableSizeOfFlushableStores = prepareResult.totalFlushableSize;
2463
2464     String s = "Flushing stores of " + this;
2465     status.setStatus(s);
2466     if (LOG.isTraceEnabled()) LOG.trace(s);
2467
2468     // Any failure from here on out will be catastrophic, requiring a server
2469     // restart so the wal content can be replayed and put back into the memstore.
2470     // Otherwise, the snapshot content, while backed up in the wal, will not
2471     // be part of the current running server's state.
2472     boolean compactionRequested = false;
2473     long flushedOutputFileSize = 0;
2474     try {
2475       // A.  Flush memstore to all the HStores.
2476       // Keep running vector of all store files that includes both old and the
2477       // just-made new flush store file. The new flushed file is still in the
2478       // tmp directory.
2479
2480       for (StoreFlushContext flush : storeFlushCtxs.values()) {
2481         flush.flushCache(status);
2482       }
2483
2484       // Switch snapshot (in memstore) -> new hfile (thus causing
2485       // all the store scanners to reset/reseek).
2486       Iterator<Store> it = storesToFlush.iterator();
2487       // stores.values() and storeFlushCtxs have same order
2488       for (StoreFlushContext flush : storeFlushCtxs.values()) {
2489         boolean needsCompaction = flush.commit(status);
2490         if (needsCompaction) {
2491           compactionRequested = true;
2492         }
2493         byte[] storeName = it.next().getFamily().getName();
2494         List<Path> storeCommittedFiles = flush.getCommittedFiles();
2495         committedFiles.put(storeName, storeCommittedFiles);
2496         // The flush committed no files, indicating the flush was empty or was canceled
2497         if (storeCommittedFiles == null || storeCommittedFiles.isEmpty()) {
2498           totalFlushableSizeOfFlushableStores -= prepareResult.storeFlushableSize.get(storeName);
2499         }
2500         flushedOutputFileSize += flush.getOutputFileSize();
2501       }
2502       storeFlushCtxs.clear();
2503
2504       // Set down the memstore size by amount of flush.
2505       this.addAndGetGlobalMemstoreSize(-totalFlushableSizeOfFlushableStores);
2506
2507       if (wal != null) {
2508         // write flush marker to WAL. If fail, we should throw DroppedSnapshotException
2509         FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.COMMIT_FLUSH,
2510           getRegionInfo(), flushOpSeqId, committedFiles);
2511         WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, true,
2512             mvcc);
2513       }
2514     } catch (Throwable t) {
2515       // An exception here means that the snapshot was not persisted.
2516       // The wal needs to be replayed so its content is restored to memstore.
2517       // Currently, only a server restart will do this.
2518       // We used to only catch IOEs but it's possible that we'd get other
2519       // exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch
2520       // all and sundry.
2521       if (wal != null) {
2522         try {
2523           FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH,
2524             getRegionInfo(), flushOpSeqId, committedFiles);
2525           WALUtil.writeFlushMarker(wal, this.replicationScope, getRegionInfo(), desc, false, mvcc);
2526         } catch (Throwable ex) {
2527           LOG.warn(getRegionInfo().getEncodedName() + " : "
2528               + "failed writing ABORT_FLUSH marker to WAL", ex);
2529           // ignore this since we will be aborting the RS with DSE.
2530         }
2531         wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2532       }
2533       DroppedSnapshotException dse = new DroppedSnapshotException("region: " +
2534           Bytes.toStringBinary(getRegionInfo().getRegionName()));
2535       dse.initCause(t);
2536       status.abort("Flush failed: " + StringUtils.stringifyException(t));
2537
2538       // Callers for flushcache() should catch DroppedSnapshotException and abort the region server.
2539       // However, since we may have the region read lock, we cannot call close(true) here since
2540       // we cannot promote to a write lock. Instead we are setting closing so that all other region
2541       // operations except for close will be rejected.
2542       this.closing.set(true);
2543
2544       if (rsServices != null) {
2545         // This is a safeguard against the case where the caller fails to explicitly handle aborting
2546         rsServices.abort("Replay of WAL required. Forcing server shutdown", dse);
2547       }
2548
2549       throw dse;
2550     }
2551
2552     // If we get to here, the HStores have been written.
2553     for(Store storeToFlush :storesToFlush) {
2554       storeToFlush.finalizeFlush();
2555     }
2556     if (wal != null) {
2557       wal.completeCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2558     }
2559
2560     // Record latest flush time
2561     for (Store store: storesToFlush) {
2562       this.lastStoreFlushTimeMap.put(store, startTime);
2563     }
2564
2565     this.maxFlushedSeqId = flushedSeqId;
2566     this.lastFlushOpSeqId = flushOpSeqId;
2567
2568     // C. Finally notify anyone waiting on memstore to clear:
2569     // e.g. checkResources().
2570     synchronized (this) {
2571       notifyAll(); // FindBugs NN_NAKED_NOTIFY
2572     }
2573
2574     long time = EnvironmentEdgeManager.currentTime() - startTime;
2575     long memstoresize = this.memstoreSize.get();
2576     String msg = "Finished memstore flush of ~"
2577         + StringUtils.byteDesc(totalFlushableSizeOfFlushableStores) + "/"
2578         + totalFlushableSizeOfFlushableStores + ", currentsize="
2579         + StringUtils.byteDesc(memstoresize) + "/" + memstoresize
2580         + " for region " + this + " in " + time + "ms, sequenceid="
2581         + flushOpSeqId +  ", compaction requested=" + compactionRequested
2582         + ((wal == null) ? "; wal=null" : "");
2583     LOG.info(msg);
2584     status.setStatus(msg);
2585
2586     if (rsServices != null && rsServices.getMetrics() != null) {
2587       rsServices.getMetrics().updateFlush(time,
2588         totalFlushableSizeOfFlushableStores, flushedOutputFileSize);
2589     }
2590
2591     return new FlushResultImpl(compactionRequested ?
2592         FlushResult.Result.FLUSHED_COMPACTION_NEEDED :
2593           FlushResult.Result.FLUSHED_NO_COMPACTION_NEEDED, flushOpSeqId);
2594   }
2595
2596   /**
2597    * Method to safely get the next sequence number.
2598    * @return Next sequence number unassociated with any actual edit.
2599    * @throws IOException
2600    */
2601   @VisibleForTesting
2602   protected long getNextSequenceId(final WAL wal) throws IOException {
2603     WriteEntry we = mvcc.begin();
2604     mvcc.completeAndWait(we);
2605     return we.getWriteNumber();
2606   }
2607
2608   //////////////////////////////////////////////////////////////////////////////
2609   // get() methods for client use.
2610   //////////////////////////////////////////////////////////////////////////////
2611
2612   @Override
2613   public RegionScanner getScanner(Scan scan) throws IOException {
2614    return getScanner(scan, null);
2615   }
2616
2617   @Override
2618   public RegionScanner getScanner(Scan scan, List<KeyValueScanner> additionalScanners)
2619       throws IOException {
2620     startRegionOperation(Operation.SCAN);
2621     try {
2622       // Verify families are all valid
2623       if (!scan.hasFamilies()) {
2624         // Adding all families to scanner
2625         for (byte[] family : this.htableDescriptor.getFamiliesKeys()) {
2626           scan.addFamily(family);
2627         }
2628       } else {
2629         for (byte[] family : scan.getFamilyMap().keySet()) {
2630           checkFamily(family);
2631         }
2632       }
2633       return instantiateRegionScanner(scan, additionalScanners);
2634     } finally {
2635       closeRegionOperation(Operation.SCAN);
2636     }
2637   }
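  // [Editorial sketch, not part of the original source] Typical server-side use of the
  // getScanner() API, e.g. from a coprocessor. 'region' is an assumed, already-open HRegion
  // and "cf" an assumed existing column family:
  //
  //   Scan scan = new Scan();
  //   scan.addFamily(Bytes.toBytes("cf"));
  //   try (RegionScanner scanner = region.getScanner(scan)) {
  //     List<Cell> cells = new ArrayList<Cell>();
  //     boolean moreRows;
  //     do {
  //       cells.clear();
  //       moreRows = scanner.next(cells);   // fills 'cells' with one row's worth of Cells
  //       // ... process cells ...
  //     } while (moreRows);
  //   }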
2638
2639   protected RegionScanner instantiateRegionScanner(Scan scan,
2640       List<KeyValueScanner> additionalScanners) throws IOException {
2641     if (scan.isReversed()) {
2642       if (scan.getFilter() != null) {
2643         scan.getFilter().setReversed(true);
2644       }
2645       return new ReversedRegionScannerImpl(scan, additionalScanners, this);
2646     }
2647     return new RegionScannerImpl(scan, additionalScanners, this);
2648   }
2649
2650   @Override
2651   public void prepareDelete(Delete delete) throws IOException {
2652     // Check to see if this is a deleteRow insert
2653     if(delete.getFamilyCellMap().isEmpty()){
2654       for(byte [] family : this.htableDescriptor.getFamiliesKeys()){
2655         // Don't eat the timestamp
2656         delete.addFamily(family, delete.getTimeStamp());
2657       }
2658     } else {
2659       for(byte [] family : delete.getFamilyCellMap().keySet()) {
2660         if(family == null) {
2661           throw new NoSuchColumnFamilyException("Empty family is invalid");
2662         }
2663         checkFamily(family);
2664       }
2665     }
2666   }
2667
2668   @Override
2669   public void delete(Delete delete) throws IOException {
2670     checkReadOnly();
2671     checkResources();
2672     startRegionOperation(Operation.DELETE);
2673     try {
2674       delete.getRow();
2675       // All edits for the given row (across all column families) must happen atomically.
2676       doBatchMutate(delete);
2677     } finally {
2678       closeRegionOperation(Operation.DELETE);
2679     }
2680   }
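  // [Editorial sketch, not part of the original source] How an "empty" Delete behaves: a Delete
  // with no families added is expanded by prepareDelete() to cover every column family, removing
  // the whole row. 'region' is an assumed, already-open HRegion:
  //
  //   Delete wholeRow = new Delete(Bytes.toBytes("row1"));   // no addFamily/addColumn calls
  //   region.delete(wholeRow);                               // deletes the row across all families
  //
  //   Delete oneFamily = new Delete(Bytes.toBytes("row1"));
  //   oneFamily.addFamily(Bytes.toBytes("cf"));              // scoped to a single, assumed family
  //   region.delete(oneFamily);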
2681
2682   /**
2683    * Row needed by the method below.
2684    */
2685   private static final byte [] FOR_UNIT_TESTS_ONLY = Bytes.toBytes("ForUnitTestsOnly");
2686
2687   /**
2688    * This is used only by unit tests. Not required to be a public API.
2689    * @param familyMap map of family to edits for the given family.
2690    * @throws IOException
2691    */
2692   void delete(NavigableMap<byte[], List<Cell>> familyMap,
2693       Durability durability) throws IOException {
2694     Delete delete = new Delete(FOR_UNIT_TESTS_ONLY);
2695     delete.setFamilyCellMap(familyMap);
2696     delete.setDurability(durability);
2697     doBatchMutate(delete);
2698   }
2699
2700   @Override
2701   public void prepareDeleteTimestamps(Mutation mutation, Map<byte[], List<Cell>> familyMap,
2702       byte[] byteNow) throws IOException {
2703     for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
2704
2705       byte[] family = e.getKey();
2706       List<Cell> cells = e.getValue();
2707       assert cells instanceof RandomAccess;
2708
2709       Map<byte[], Integer> kvCount = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
2710       int listSize = cells.size();
2711       for (int i=0; i < listSize; i++) {
2712         Cell cell = cells.get(i);
2713         //  Check if time is LATEST, change to time of most recent addition if so
2714         //  This is expensive.
2715         if (cell.getTimestamp() == HConstants.LATEST_TIMESTAMP && CellUtil.isDeleteType(cell)) {
2716           byte[] qual = CellUtil.cloneQualifier(cell);
2717           if (qual == null) qual = HConstants.EMPTY_BYTE_ARRAY;
2718
2719           Integer count = kvCount.get(qual);
2720           if (count == null) {
2721             kvCount.put(qual, 1);
2722           } else {
2723             kvCount.put(qual, count + 1);
2724           }
2725           count = kvCount.get(qual);
2726
2727           Get get = new Get(CellUtil.cloneRow(cell));
2728           get.setMaxVersions(count);
2729           get.addColumn(family, qual);
2730           if (coprocessorHost != null) {
2731             if (!coprocessorHost.prePrepareTimeStampForDeleteVersion(mutation, cell,
2732                 byteNow, get)) {
2733               updateDeleteLatestVersionTimeStamp(cell, get, count, byteNow);
2734             }
2735           } else {
2736             updateDeleteLatestVersionTimeStamp(cell, get, count, byteNow);
2737           }
2738         } else {
2739           CellUtil.updateLatestStamp(cell, byteNow, 0);
2740         }
2741       }
2742     }
2743   }
2744
2745   void updateDeleteLatestVersionTimeStamp(Cell cell, Get get, int count, byte[] byteNow)
2746       throws IOException {
2747     List<Cell> result = get(get, false);
2748
2749     if (result.size() < count) {
2750       // Nothing to delete
2751       CellUtil.updateLatestStamp(cell, byteNow, 0);
2752       return;
2753     }
2754     if (result.size() > count) {
2755       throw new RuntimeException("Unexpected size: " + result.size());
2756     }
2757     Cell getCell = result.get(count - 1);
2758     CellUtil.setTimestamp(cell, getCell.getTimestamp());
2759   }
2760
2761   @Override
2762   public void put(Put put) throws IOException {
2763     checkReadOnly();
2764
2765     // Do a rough check that we have resources to accept a write.  The check is
2766     // 'rough' in that between the resource check and the call to obtain a
2767     // read lock, resources may run out.  For now, the thought is that this
2768     // will be extremely rare; we'll deal with it when it happens.
2769     checkResources();
2770     startRegionOperation(Operation.PUT);
2771     try {
2772       // All edits for the given row (across all column families) must happen atomically.
2773       doBatchMutate(put);
2774     } finally {
2775       closeRegionOperation(Operation.PUT);
2776     }
2777   }
2778
2779   /**
2780    * Struct-like class that tracks the progress of a batch operation,
2781    * accumulating status codes and tracking the index at which processing
2782    * is proceeding.
2783    */
2784   private abstract static class BatchOperation<T> {
2785     T[] operations;
2786     int nextIndexToProcess = 0;
2787     OperationStatus[] retCodeDetails;
2788     WALEdit[] walEditsFromCoprocessors;
2789
2790     public BatchOperation(T[] operations) {
2791       this.operations = operations;
2792       this.retCodeDetails = new OperationStatus[operations.length];
2793       this.walEditsFromCoprocessors = new WALEdit[operations.length];
2794       Arrays.fill(this.retCodeDetails, OperationStatus.NOT_RUN);
2795     }
2796
2797     public abstract Mutation getMutation(int index);
2798     public abstract long getNonceGroup(int index);
2799     public abstract long getNonce(int index);
2800     /** This method is potentially expensive and should only be used for non-replay CP path. */
2801     public abstract Mutation[] getMutationsForCoprocs();
2802     public abstract boolean isInReplay();
2803     public abstract long getReplaySequenceId();
2804
2805     public boolean isDone() {
2806       return nextIndexToProcess == operations.length;
2807     }
2808   }
2809
2810   private static class MutationBatch extends BatchOperation<Mutation> {
2811     private long nonceGroup;
2812     private long nonce;
2813     public MutationBatch(Mutation[] operations, long nonceGroup, long nonce) {
2814       super(operations);
2815       this.nonceGroup = nonceGroup;
2816       this.nonce = nonce;
2817     }
2818
2819     @Override
2820     public Mutation getMutation(int index) {
2821       return this.operations[index];
2822     }
2823
2824     @Override
2825     public long getNonceGroup(int index) {
2826       return nonceGroup;
2827     }
2828
2829     @Override
2830     public long getNonce(int index) {
2831       return nonce;
2832     }
2833
2834     @Override
2835     public Mutation[] getMutationsForCoprocs() {
2836       return this.operations;
2837     }
2838
2839     @Override
2840     public boolean isInReplay() {
2841       return false;
2842     }
2843
2844     @Override
2845     public long getReplaySequenceId() {
2846       return 0;
2847     }
2848   }
2849
2850   private static class ReplayBatch extends BatchOperation<MutationReplay> {
2851     private long replaySeqId = 0;
2852     public ReplayBatch(MutationReplay[] operations, long seqId) {
2853       super(operations);
2854       this.replaySeqId = seqId;
2855     }
2856
2857     @Override
2858     public Mutation getMutation(int index) {
2859       return this.operations[index].mutation;
2860     }
2861
2862     @Override
2863     public long getNonceGroup(int index) {
2864       return this.operations[index].nonceGroup;
2865     }
2866
2867     @Override
2868     public long getNonce(int index) {
2869       return this.operations[index].nonce;
2870     }
2871
2872     @Override
2873     public Mutation[] getMutationsForCoprocs() {
2874       assert false;
2875       throw new RuntimeException("Should not be called for replay batch");
2876     }
2877
2878     @Override
2879     public boolean isInReplay() {
2880       return true;
2881     }
2882
2883     @Override
2884     public long getReplaySequenceId() {
2885       return this.replaySeqId;
2886     }
2887   }
2888
2889   @Override
2890   public OperationStatus[] batchMutate(Mutation[] mutations, long nonceGroup, long nonce)
2891       throws IOException {
2892     // As it stands, this is used for 3 things: batchMutate with a single mutation
2893     //  (a standalone put/delete), the same path invoked from checkAndMutate, and
2894     //  coprocessor calls (see ex. BulkDeleteEndpoint).
2895     // So nonces are not really ever used by HBase. They could be by coprocs, and checkAnd...
2896     return batchMutate(new MutationBatch(mutations, nonceGroup, nonce));
2897   }
2898
2899   public OperationStatus[] batchMutate(Mutation[] mutations) throws IOException {
2900     return batchMutate(mutations, HConstants.NO_NONCE, HConstants.NO_NONCE);
2901   }
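  // [Editorial sketch, not part of the original source] Submitting a small batch and checking the
  // per-operation result codes. 'region' is an assumed, already-open HRegion with family "cf":
  //
  //   Mutation[] batch = new Mutation[] {
  //       new Put(Bytes.toBytes("row1"))
  //           .addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("v")),
  //       new Delete(Bytes.toBytes("row2"))
  //   };
  //   OperationStatus[] statuses = region.batchMutate(batch);
  //   for (OperationStatus status : statuses) {
  //     if (status.getOperationStatusCode() != OperationStatusCode.SUCCESS) {
  //       // handle BAD_FAMILY, SANITY_CHECK_FAILURE, FAILURE, ...
  //     }
  //   }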
2902
2903   @Override
2904   public OperationStatus[] batchReplay(MutationReplay[] mutations, long replaySeqId)
2905       throws IOException {
2906     if (!RegionReplicaUtil.isDefaultReplica(getRegionInfo())
2907         && replaySeqId < lastReplayedOpenRegionSeqId) {
2908       // if it is a secondary replica we should ignore these entries silently
2909       // since they are coming out of order
2910       if (LOG.isTraceEnabled()) {
2911         LOG.trace(getRegionInfo().getEncodedName() + " : "
2912           + "Skipping " + mutations.length + " mutations with replaySeqId=" + replaySeqId
2913           + " which is < than lastReplayedOpenRegionSeqId=" + lastReplayedOpenRegionSeqId);
2914         for (MutationReplay mut : mutations) {
2915           LOG.trace(getRegionInfo().getEncodedName() + " : Skipping : " + mut.mutation);
2916         }
2917       }
2918
2919       OperationStatus[] statuses = new OperationStatus[mutations.length];
2920       for (int i = 0; i < statuses.length; i++) {
2921         statuses[i] = OperationStatus.SUCCESS;
2922       }
2923       return statuses;
2924     }
2925     return batchMutate(new ReplayBatch(mutations, replaySeqId));
2926   }
2927
2928   /**
2929    * Perform a batch of mutations.
2930    * It supports only Put and Delete mutations and will ignore other types passed.
2931    * @param batchOp contains the list of mutations
2932    * @return an array of OperationStatus which internally contains the
2933    *         OperationStatusCode and the exceptionMessage if any.
2934    * @throws IOException
2935    */
2936   OperationStatus[] batchMutate(BatchOperation<?> batchOp) throws IOException {
2937     boolean initialized = false;
2938     Operation op = batchOp.isInReplay() ? Operation.REPLAY_BATCH_MUTATE : Operation.BATCH_MUTATE;
2939     startRegionOperation(op);
2940     try {
2941       while (!batchOp.isDone()) {
2942         if (!batchOp.isInReplay()) {
2943           checkReadOnly();
2944         }
2945         checkResources();
2946
2947         if (!initialized) {
2948           this.writeRequestsCount.add(batchOp.operations.length);
2949           if (!batchOp.isInReplay()) {
2950             doPreBatchMutateHook(batchOp);
2951           }
2952           initialized = true;
2953         }
2954         doMiniBatchMutate(batchOp);
2955         long newSize = this.getMemstoreSize();
2956         requestFlushIfNeeded(newSize);
2957       }
2958     } finally {
2959       closeRegionOperation(op);
2960     }
2961     return batchOp.retCodeDetails;
2962   }
2963
2964   private void doPreBatchMutateHook(BatchOperation<?> batchOp)
2965       throws IOException {
2966     /* Run coprocessor pre hook outside of locks to avoid deadlock */
2967     WALEdit walEdit = new WALEdit();
2968     if (coprocessorHost != null) {
2969       for (int i = 0 ; i < batchOp.operations.length; i++) {
2970         Mutation m = batchOp.getMutation(i);
2971         if (m instanceof Put) {
2972           if (coprocessorHost.prePut((Put) m, walEdit, m.getDurability())) {
2973             // pre hook says skip this Put
2974             // mark as success and skip in doMiniBatchMutation
2975             batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
2976           }
2977         } else if (m instanceof Delete) {
2978           Delete curDel = (Delete) m;
2979           if (curDel.getFamilyCellMap().isEmpty()) {
2980             // handle deleting a row case
2981             prepareDelete(curDel);
2982           }
2983           if (coprocessorHost.preDelete(curDel, walEdit, m.getDurability())) {
2984             // pre hook says skip this Delete
2985             // mark as success and skip in doMiniBatchMutation
2986             batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
2987           }
2988         } else {
2989           // In case of passing Append mutations along with the Puts and Deletes in batchMutate
2990           // mark the operation return code as failure so that it will not be considered in
2991           // the doMiniBatchMutation
2992           batchOp.retCodeDetails[i] = new OperationStatus(OperationStatusCode.FAILURE,
2993               "Put/Delete mutations only supported in batchMutate() now");
2994         }
2995         if (!walEdit.isEmpty()) {
2996           batchOp.walEditsFromCoprocessors[i] = walEdit;
2997           walEdit = new WALEdit();
2998         }
2999       }
3000     }
3001   }
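  // [Editorial sketch, not part of the original source] The "pre hook says skip" branches above
  // are triggered by coprocessors that bypass the operation. A minimal RegionObserver doing so
  // might look like the following (assumes the HBase 1.x observer API):
  //
  //   public class SkipAllPutsObserver extends BaseRegionObserver {
  //     @Override
  //     public void prePut(ObserverContext<RegionCoprocessorEnvironment> c, Put put,
  //         WALEdit edit, Durability durability) throws IOException {
  //       c.bypass();   // the host's prePut() then returns true; the Put is marked SUCCESS and skipped
  //     }
  //   }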
3002
3003   /**
3004    * Called to do a piece of the batch that came in to {@link #batchMutate(Mutation[], long, long)}.
3005    * In here we also handle replay of edits on region recovery.
3006    * @return Change in size brought about by applying <code>batchOp</code>
3007    */
3008   @SuppressWarnings("unchecked")
3009   // TODO: This needs a rewrite. Doesn't have to be this long. St.Ack 20160120
3010   private long doMiniBatchMutate(BatchOperation<?> batchOp) throws IOException {
3011     boolean replay = batchOp.isInReplay();
3012     // Variable to note if all Put items are for the same CF -- metrics related
3013     boolean putsCfSetConsistent = true;
3014     // Variable to note if all Delete items are for the same CF -- metrics related
3015     boolean deletesCfSetConsistent = true;
3016     // The set of columnFamilies first seen for Put.
3017     Set<byte[]> putsCfSet = null;
3018     // The set of columnFamilies first seen for Delete.
3019     Set<byte[]> deletesCfSet = null;
3020     long currentNonceGroup = HConstants.NO_NONCE;
3021     long currentNonce = HConstants.NO_NONCE;
3022     WALEdit walEdit = null;
3023     boolean locked = false;
3024     // reference family maps directly so coprocessors can mutate them if desired
3025     Map<byte[], List<Cell>>[] familyMaps = new Map[batchOp.operations.length];
3026     // We try to set up a batch in the range [firstIndex,lastIndexExclusive)
3027     int firstIndex = batchOp.nextIndexToProcess;
3028     int lastIndexExclusive = firstIndex;
3029     boolean success = false;
3030     int noOfPuts = 0;
3031     int noOfDeletes = 0;
3032     WriteEntry writeEntry = null;
3033     int cellCount = 0;
3034     /** Keep track of the locks we hold so we can release them in finally clause */
3035     List<RowLock> acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.operations.length);
3036     long addedSize = 0;
3037     try {
3038       // STEP 1. Try to acquire as many locks as we can, and ensure we acquire at least one.
3039       int numReadyToWrite = 0;
3040       long now = EnvironmentEdgeManager.currentTime();
3041       while (lastIndexExclusive < batchOp.operations.length) {
3042         if (checkBatchOp(batchOp, lastIndexExclusive, familyMaps, now)) {
3043           lastIndexExclusive++;
3044           continue;
3045         }
3046         Mutation mutation = batchOp.getMutation(lastIndexExclusive);
3047         // If we haven't got any rows in our batch, we should block to get the next one.
3048         RowLock rowLock = null;
3049         try {
3050           rowLock = getRowLockInternal(mutation.getRow(), true);
3051         } catch (IOException ioe) {
3052           LOG.warn("Failed getting lock, row=" + Bytes.toStringBinary(mutation.getRow()), ioe);
3053         }
3054         if (rowLock == null) {
3055           // We failed to grab another lock
3056           break; // Stop acquiring more rows for this batch
3057         } else {
3058           acquiredRowLocks.add(rowLock);
3059         }
3060
3061         lastIndexExclusive++;
3062         numReadyToWrite++;
3063         if (replay) {
3064           for (List<Cell> cells : mutation.getFamilyCellMap().values()) {
3065             cellCount += cells.size();
3066           }
3067         }
3068         if (mutation instanceof Put) {
3069           // If Column Families stay consistent through out all of the
3070           // individual puts then metrics can be reported as a multiput across
3071           // column families in the first put.
3072           if (putsCfSet == null) {
3073             putsCfSet = mutation.getFamilyCellMap().keySet();
3074           } else {
3075             putsCfSetConsistent = putsCfSetConsistent
3076                 && mutation.getFamilyCellMap().keySet().equals(putsCfSet);
3077           }
3078         } else {
3079           if (deletesCfSet == null) {
3080             deletesCfSet = mutation.getFamilyCellMap().keySet();
3081           } else {
3082             deletesCfSetConsistent = deletesCfSetConsistent
3083                 && mutation.getFamilyCellMap().keySet().equals(deletesCfSet);
3084           }
3085         }
3086       }
3087
3088       // We've now grabbed as many mutations off the list as we can
3089
3090       // STEP 2. Update any LATEST_TIMESTAMP timestamps
3091       // We should record the timestamp only after we have acquired the rowLock,
3092       // otherwise, newer puts/deletes are not guaranteed to have a newer timestamp
3093       now = EnvironmentEdgeManager.currentTime();
3094       byte[] byteNow = Bytes.toBytes(now);
3095
3096       // Nothing to put/delete -- an exception in the above such as NoSuchColumnFamily?
3097       if (numReadyToWrite <= 0) {
3098         return 0L;
3099       }
3100
3101       for (int i = firstIndex; !replay && i < lastIndexExclusive; i++) {
3102         // skip invalid
3103         if (batchOp.retCodeDetails[i].getOperationStatusCode()
3104             != OperationStatusCode.NOT_RUN) {
3105           // lastIndexExclusive was incremented above.
3106           continue;
3107         }
3108
3109         Mutation mutation = batchOp.getMutation(i);
3110         if (mutation instanceof Put) {
3111           updateCellTimestamps(familyMaps[i].values(), byteNow);
3112           noOfPuts++;
3113         } else {
3114           prepareDeleteTimestamps(mutation, familyMaps[i], byteNow);
3115           noOfDeletes++;
3116         }
3117         rewriteCellTags(familyMaps[i], mutation);
3118         WALEdit fromCP = batchOp.walEditsFromCoprocessors[i];
3119         if (fromCP != null) {
3120           cellCount += fromCP.size();
3121         }
3122         for (List<Cell> cells : familyMaps[i].values()) {
3123           cellCount += cells.size();
3124         }
3125       }
3126       walEdit = new WALEdit(cellCount, replay);
3127       lock(this.updatesLock.readLock(), numReadyToWrite);
3128       locked = true;
3129
3130       // calling the pre CP hook for batch mutation
3131       if (!replay && coprocessorHost != null) {
3132         MiniBatchOperationInProgress<Mutation> miniBatchOp =
3133           new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
3134           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
3135         if (coprocessorHost.preBatchMutate(miniBatchOp)) {
3136           return 0L;
3137         } else {
3138           for (int i = firstIndex; i < lastIndexExclusive; i++) {
3139             if (batchOp.retCodeDetails[i].getOperationStatusCode() != OperationStatusCode.NOT_RUN) {
3140               // lastIndexExclusive was incremented above.
3141               continue;
3142             }
3143             // we pass (i - firstIndex) below since the call expects a relative index
3144             Mutation[] cpMutations = miniBatchOp.getOperationsFromCoprocessors(i - firstIndex);
3145             if (cpMutations == null) {
3146               continue;
3147             }
3148             // Else Coprocessor added more Mutations corresponding to the Mutation at this index.
3149             for (int j = 0; j < cpMutations.length; j++) {
3150               Mutation cpMutation = cpMutations[j];
3151               Map<byte[], List<Cell>> cpFamilyMap = cpMutation.getFamilyCellMap();
3152               checkAndPrepareMutation(cpMutation, replay, cpFamilyMap, now);
3153
3154               // Acquire row locks. If we cannot, the whole batch will fail.
3155               acquiredRowLocks.add(getRowLockInternal(cpMutation.getRow(), true));
3156
3157               if (cpMutation.getDurability() == Durability.SKIP_WAL) {
3158                 recordMutationWithoutWal(cpFamilyMap);
3159               }
3160
3161               // Returned mutations from coprocessor correspond to the Mutation at index i. We can
3162               // directly add the cells from those mutations to the familyMaps of this mutation.
3163               mergeFamilyMaps(familyMaps[i], cpFamilyMap); // will get added to the memstore later
3164             }
3165           }
3166         }
3167       }
3168
3169       // STEP 3. Build WAL edit
3170       Durability durability = Durability.USE_DEFAULT;
3171       for (int i = firstIndex; i < lastIndexExclusive; i++) {
3172         // Skip puts that were determined to be invalid during preprocessing
3173         if (batchOp.retCodeDetails[i].getOperationStatusCode() != OperationStatusCode.NOT_RUN) {
3174           continue;
3175         }
3176
3177         Mutation m = batchOp.getMutation(i);
3178         Durability tmpDur = getEffectiveDurability(m.getDurability());
3179         if (tmpDur.ordinal() > durability.ordinal()) {
3180           durability = tmpDur;
3181         }
3182         if (tmpDur == Durability.SKIP_WAL) {
3183           recordMutationWithoutWal(m.getFamilyCellMap());
3184           continue;
3185         }
3186
3187         long nonceGroup = batchOp.getNonceGroup(i);
3188         long nonce = batchOp.getNonce(i);
3189         // In replay, the batch may contain multiple nonces. If so, write WALEdit for each.
3190         // Given how nonces are originally written, these should be contiguous.
3191         // They don't have to be; it will still work, we just write more WALEdits than needed.
3192         if (nonceGroup != currentNonceGroup || nonce != currentNonce) {
3193           // Write what we have so far for nonces out to WAL
3194           appendCurrentNonces(m, replay, walEdit, now, currentNonceGroup, currentNonce);
3195           walEdit = new WALEdit(cellCount, replay);
3196           currentNonceGroup = nonceGroup;
3197           currentNonce = nonce;
3198         }
3199
3200         // Add WAL edits by CP
3201         WALEdit fromCP = batchOp.walEditsFromCoprocessors[i];
3202         if (fromCP != null) {
3203           for (Cell cell : fromCP.getCells()) {
3204             walEdit.add(cell);
3205           }
3206         }
3207         addFamilyMapToWALEdit(familyMaps[i], walEdit);
3208       }
3209
3210       // STEP 4. Append the final edit to WAL and sync.
3211       Mutation mutation = batchOp.getMutation(firstIndex);
3212       WALKey walKey = null;
3213       if (replay) {
3214         // use wal key from the original
3215         walKey = new ReplayHLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
3216           this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now,
3217           mutation.getClusterIds(), currentNonceGroup, currentNonce, mvcc);
3218         walKey.setOrigLogSeqNum(batchOp.getReplaySequenceId());
3219       }
3220       // Not sure what is going on here when replay is going on... does the below append get
3221       // called for replayed edits? Am afraid to change it without test.
3222       if (!walEdit.isEmpty()) {
3223         if (!replay) {
3224           // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
3225           walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
3226               this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now,
3227               mutation.getClusterIds(), currentNonceGroup, currentNonce, mvcc,
3228               this.getReplicationScope());
3229         }
3230         // TODO: Use the doAppend methods below... complicated by the replay stuff above.
3231         try {
3232           long txid = this.wal.append(this.getRegionInfo(), walKey,
3233               walEdit, true);
3234           if (txid != 0) sync(txid, durability);
3235           writeEntry = walKey.getWriteEntry();
3236         } catch (IOException ioe) {
3237           if (walKey != null) mvcc.complete(walKey.getWriteEntry());
3238           throw ioe;
3239         }
3240       }
3241       if (walKey == null) {
3242         // If there is no walKey, we are skipping the WAL or some such. This is still an mvcc transaction, so get a sequenceid.
3243         writeEntry = mvcc.begin();
3244       }
3245
3246       // STEP 5. Write back to memstore
3247       for (int i = firstIndex; i < lastIndexExclusive; i++) {
3248         if (batchOp.retCodeDetails[i].getOperationStatusCode() != OperationStatusCode.NOT_RUN) {
3249           continue;
3250         }
3251         addedSize += applyFamilyMapToMemstore(familyMaps[i], replay,
3252             replay? batchOp.getReplaySequenceId(): writeEntry.getWriteNumber());
3253       }
3254
3255       // STEP 6. Complete mvcc.
3256       if (replay) {
3257         this.mvcc.advanceTo(batchOp.getReplaySequenceId());
3258       } else if (writeEntry != null/*Can be null if in replay mode*/) {
3259         mvcc.completeAndWait(writeEntry);
3260         writeEntry = null;
3261       }
3262
3263       // STEP 7. Release row locks, etc.
3264       if (locked) {
3265         this.updatesLock.readLock().unlock();
3266         locked = false;
3267       }
3268       releaseRowLocks(acquiredRowLocks);
3269
3270       // calling the post CP hook for batch mutation
3271       if (!replay && coprocessorHost != null) {
3272         MiniBatchOperationInProgress<Mutation> miniBatchOp =
3273           new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
3274           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
3275         coprocessorHost.postBatchMutate(miniBatchOp);
3276       }
3277
3278       for (int i = firstIndex; i < lastIndexExclusive; i ++) {
3279         if (batchOp.retCodeDetails[i] == OperationStatus.NOT_RUN) {
3280           batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
3281         }
3282       }
3283
3284       // STEP 8. Run coprocessor post hooks. This should be done after the wal is
3285       // synced so that the coprocessor contract is adhered to.
3286       if (!replay && coprocessorHost != null) {
3287         for (int i = firstIndex; i < lastIndexExclusive; i++) {
3288           // only for successful puts
3289           if (batchOp.retCodeDetails[i].getOperationStatusCode()
3290               != OperationStatusCode.SUCCESS) {
3291             continue;
3292           }
3293           Mutation m = batchOp.getMutation(i);
3294           if (m instanceof Put) {
3295             coprocessorHost.postPut((Put) m, walEdit, m.getDurability());
3296           } else {
3297             coprocessorHost.postDelete((Delete) m, walEdit, m.getDurability());
3298           }
3299         }
3300       }
3301
3302       success = true;
3303       return addedSize;
3304     } finally {
3305       // Call complete rather than completeAndWait; if writeEntry is still non-null here we probably hit an error
3306       if (writeEntry != null) mvcc.complete(writeEntry);
3307       this.addAndGetGlobalMemstoreSize(addedSize);
3308       if (locked) {
3309         this.updatesLock.readLock().unlock();
3310       }
3311       releaseRowLocks(acquiredRowLocks);
3312
3313       // See if the column families were consistent through the whole thing.
3314       // if they were then keep them. If they were not then pass a null.
3315       // null will be treated as unknown.
3316       // Total time taken might be involving Puts and Deletes.
3317       // Split the time for puts and deletes based on the total number of Puts and Deletes.
3318
3319       if (noOfPuts > 0) {
3320         // There were some Puts in the batch.
3321         if (this.metricsRegion != null) {
3322           this.metricsRegion.updatePut();
3323         }
3324       }
3325       if (noOfDeletes > 0) {
3326         // There were some Deletes in the batch.
3327         if (this.metricsRegion != null) {
3328           this.metricsRegion.updateDelete();
3329         }
3330       }
3331       if (!success) {
3332         for (int i = firstIndex; i < lastIndexExclusive; i++) {
3333           if (batchOp.retCodeDetails[i].getOperationStatusCode() == OperationStatusCode.NOT_RUN) {
3334             batchOp.retCodeDetails[i] = OperationStatus.FAILURE;
3335           }
3336         }
3337       }
3338       if (coprocessorHost != null && !batchOp.isInReplay()) {
3339         // call the coprocessor hook to do any finalization steps
3340         // after the put is done
3341         MiniBatchOperationInProgress<Mutation> miniBatchOp =
3342           new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
3343           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
3344         coprocessorHost.postBatchMutateIndispensably(miniBatchOp, success);
3345       }
3346
3347       batchOp.nextIndexToProcess = lastIndexExclusive;
3348     }
3349   }
3350
3351   private void mergeFamilyMaps(Map<byte[], List<Cell>> familyMap,
3352       Map<byte[], List<Cell>> toBeMerged) {
3353     for (Map.Entry<byte[], List<Cell>> entry : toBeMerged.entrySet()) {
3354       List<Cell> cells = familyMap.get(entry.getKey());
3355       if (cells == null) {
3356         familyMap.put(entry.getKey(), entry.getValue());
3357       } else {
3358         cells.addAll(entry.getValue());
3359       }
3360     }
3361   }
3362
3363   private void appendCurrentNonces(final Mutation mutation, final boolean replay,
3364       final WALEdit walEdit, final long now, final long currentNonceGroup, final long currentNonce)
3365   throws IOException {
3366     if (walEdit.isEmpty()) return;
3367     if (!replay) throw new IOException("Multiple nonces per batch and not in replay");
3368     WALKey walKey = new WALKey(this.getRegionInfo().getEncodedNameAsBytes(),
3369         this.htableDescriptor.getTableName(), now, mutation.getClusterIds(),
3370         currentNonceGroup, currentNonce, mvcc, this.getReplicationScope());
3371     this.wal.append(this.getRegionInfo(), walKey, walEdit, true);
3372     // Complete the mvcc transaction started down in append else it will block others
3373     this.mvcc.complete(walKey.getWriteEntry());
3374   }
3375
3376   private boolean checkBatchOp(BatchOperation<?> batchOp, final int lastIndexExclusive,
3377       final Map<byte[], List<Cell>>[] familyMaps, final long now)
3378   throws IOException {
3379     boolean skip = false;
3380     // Skip anything that "ran" already
3381     if (batchOp.retCodeDetails[lastIndexExclusive].getOperationStatusCode()
3382         != OperationStatusCode.NOT_RUN) {
3383       return true;
3384     }
3385     Mutation mutation = batchOp.getMutation(lastIndexExclusive);
3386     Map<byte[], List<Cell>> familyMap = mutation.getFamilyCellMap();
3387     // store the family map reference to allow for mutations
3388     familyMaps[lastIndexExclusive] = familyMap;
3389
3390     try {
3391       checkAndPrepareMutation(mutation, batchOp.isInReplay(), familyMap, now);
3392     } catch (NoSuchColumnFamilyException nscf) {
3393       LOG.warn("No such column family in batch mutation", nscf);
3394       batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
3395           OperationStatusCode.BAD_FAMILY, nscf.getMessage());
3396       skip = true;
3397     } catch (FailedSanityCheckException fsce) {
3398       LOG.warn("Batch Mutation did not pass sanity check", fsce);
3399       batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
3400           OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage());
3401       skip = true;
3402     } catch (WrongRegionException we) {
3403       LOG.warn("Batch mutation had a row that does not belong to this region", we);
3404       batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
3405           OperationStatusCode.SANITY_CHECK_FAILURE, we.getMessage());
3406       skip = true;
3407     }
3408     return skip;
3409   }
3410
3411   private void checkAndPrepareMutation(Mutation mutation, boolean replay,
3412       final Map<byte[], List<Cell>> familyMap, final long now)
3413           throws IOException {
3414     if (mutation instanceof Put) {
3415       // Check the families in the put. If bad, skip this one.
3416       if (replay) {
3417         removeNonExistentColumnFamilyForReplay(familyMap);
3418       } else {
3419         checkFamilies(familyMap.keySet());
3420       }
3421       checkTimestamps(mutation.getFamilyCellMap(), now);
3422     } else {
3423       prepareDelete((Delete)mutation);
3424     }
3425     checkRow(mutation.getRow(), "doMiniBatchMutation");
3426   }
3427
3428   /**
3429    * During replay, there could exist column families which are removed between region server
3430    * During replay, column families may have been removed between the region server
3431    * failure and the replay.
3432   private void removeNonExistentColumnFamilyForReplay(final Map<byte[], List<Cell>> familyMap) {
3433     List<byte[]> nonExistentList = null;
3434     for (byte[] family : familyMap.keySet()) {
3435       if (!this.htableDescriptor.hasFamily(family)) {
3436         if (nonExistentList == null) {
3437           nonExistentList = new ArrayList<byte[]>();
3438         }
3439         nonExistentList.add(family);
3440       }
3441     }
3442     if (nonExistentList != null) {
3443       for (byte[] family : nonExistentList) {
3444         // Perhaps schema was changed between crash and replay
3445         LOG.info("No family for " + Bytes.toString(family) + ", omitting it from replay.");
3446         familyMap.remove(family);
3447       }
3448     }
3449   }
3450
3451   /**
3452    * Returns effective durability from the passed durability and
3453    * the table descriptor.
3454    */
3455   protected Durability getEffectiveDurability(Durability d) {
3456     return d == Durability.USE_DEFAULT ? this.durability : d;
3457   }
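  // [Editorial sketch, not part of the original source] Durability resolution: USE_DEFAULT falls
  // back to the table-level default held in this.durability, while any explicit value wins.
  // Assuming, for illustration, that the table default is ASYNC_WAL:
  //
  //   getEffectiveDurability(Durability.USE_DEFAULT);   // -> ASYNC_WAL (table default)
  //   getEffectiveDurability(Durability.SKIP_WAL);      // -> SKIP_WAL  (explicit value wins)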
3458
3459   @Override
3460   public boolean checkAndMutate(byte [] row, byte [] family, byte [] qualifier,
3461       CompareOp compareOp, ByteArrayComparable comparator, Mutation mutation,
3462       boolean writeToWAL)
3463   throws IOException{
3464     checkMutationType(mutation, row);
3465     return doCheckAndRowMutate(row, family, qualifier, compareOp, comparator, null,
3466       mutation, writeToWAL);
3467   }
3468
3469   @Override
3470   public boolean checkAndRowMutate(byte [] row, byte [] family, byte [] qualifier,
3471       CompareOp compareOp, ByteArrayComparable comparator, RowMutations rm,
3472       boolean writeToWAL)
3473   throws IOException {
3474     return doCheckAndRowMutate(row, family, qualifier, compareOp, comparator, rm, null,
3475       writeToWAL);
3476   }
3477
3478   /**
3479    * checkAndMutate and checkAndRowMutate are 90% the same. Rather than copy/paste, below has
3480    * switches in the few places where there is deviation.
3481    */
3482   private boolean doCheckAndRowMutate(byte [] row, byte [] family, byte [] qualifier,
3483       CompareOp compareOp, ByteArrayComparable comparator, RowMutations rowMutations,
3484       Mutation mutation, boolean writeToWAL)
3485   throws IOException {
3486     // Could do the below checks but seems wacky with two callers only. Just comment out for now.
3487     // One caller passes a Mutation, the other passes RowMutation. Presume all good so we don't
3488     // need these commented out checks.
3489     // if (rowMutations == null && mutation == null) throw new DoNotRetryIOException("Both null");
3490     // if (rowMutations != null && mutation != null) throw new DoNotRetryIOException("Both set");
3491     checkReadOnly();
3492     // TODO, add check for value length also move this check to the client
3493     checkResources();
3494     startRegionOperation();
3495     try {
3496       Get get = new Get(row);
3497       checkFamily(family);
3498       get.addColumn(family, qualifier);
3499       // Lock row - note that doBatchMutate will relock this row if called
3500       checkRow(row, "doCheckAndRowMutate");
3501       RowLock rowLock = getRowLockInternal(get.getRow(), false);
3502       try {
3503         if (mutation != null && this.getCoprocessorHost() != null) {
3504           // Call coprocessor.
3505           Boolean processed = null;
3506           if (mutation instanceof Put) {
3507             processed = this.getCoprocessorHost().preCheckAndPutAfterRowLock(row, family,
3508                 qualifier, compareOp, comparator, (Put)mutation);
3509           } else if (mutation instanceof Delete) {
3510             processed = this.getCoprocessorHost().preCheckAndDeleteAfterRowLock(row, family,
3511                 qualifier, compareOp, comparator, (Delete)mutation);
3512           }
3513           if (processed != null) {
3514             return processed;
3515           }
3516         }
3517         // NOTE: We used to wait here until mvcc caught up:  mvcc.await();
3518         // Supposition is that now all changes are done under row locks, then when we go to read,
3519         // we'll get the latest on this row.
3520         List<Cell> result = get(get, false);
3521         boolean valueIsNull = comparator.getValue() == null || comparator.getValue().length == 0;
3522         boolean matches = false;
3523         long cellTs = 0;
3524         if (result.size() == 0 && valueIsNull) {
3525           matches = true;
3526         } else if (result.size() > 0 && result.get(0).getValueLength() == 0 && valueIsNull) {
3527           matches = true;
3528           cellTs = result.get(0).getTimestamp();
3529         } else if (result.size() == 1 && !valueIsNull) {
3530           Cell kv = result.get(0);
3531           cellTs = kv.getTimestamp();
3532           int compareResult = CellComparator.compareValue(kv, comparator);
3533           matches = matches(compareOp, compareResult);
3534         }
3535         // If matches put the new put or delete the new delete
3536         if (matches) {
3537           // We have acquired the row lock already. If the system clock is NOT monotonically
3538           // non-decreasing (see HBASE-14070) we should make sure that the mutation has a
3539           // larger timestamp than what was observed via Get. doBatchMutate already does this, but
3540           // there is no way to pass the cellTs. See HBASE-14054.
3541           long now = EnvironmentEdgeManager.currentTime();
3542           long ts = Math.max(now, cellTs); // ensure write is not eclipsed
3543           byte[] byteTs = Bytes.toBytes(ts);
3544           if (mutation != null) {
3545             if (mutation instanceof Put) {
3546               updateCellTimestamps(mutation.getFamilyCellMap().values(), byteTs);
3547             }
3548             // For a Delete nothing is needed here: it already does a second get and sets the
3549             // timestamp from that get (see prepareDeleteTimestamps).
3550           } else {
3551             for (Mutation m: rowMutations.getMutations()) {
3552               if (m instanceof Put) {
3553                 updateCellTimestamps(m.getFamilyCellMap().values(), byteTs);
3554               }
3555             }
3556             // For a Delete nothing is needed here: it already does a second get and sets the
3557             // timestamp from that get (see prepareDeleteTimestamps).
3558           }
3559           // All edits for the given row (across all column families) must happen atomically.
3560           if (mutation != null) {
3561             doBatchMutate(mutation);
3562           } else {
3563             mutateRow(rowMutations);
3564           }
3565           this.checkAndMutateChecksPassed.increment();
3566           return true;
3567         }
3568         this.checkAndMutateChecksFailed.increment();
3569         return false;
3570       } finally {
3571         rowLock.release();
3572       }
3573     } finally {
3574       closeRegionOperation();
3575     }
3576   }
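  // [Editorial sketch, not part of the original source] Using checkAndMutate() to apply a Put only
  // when the current value of cf:q equals an expected value. 'region' is an assumed, already-open
  // HRegion; all names are illustrative:
  //
  //   byte[] row = Bytes.toBytes("row1");
  //   byte[] cf  = Bytes.toBytes("cf");
  //   byte[] q   = Bytes.toBytes("q");
  //   Put put = new Put(row).addColumn(cf, q, Bytes.toBytes("newValue"));
  //   boolean applied = region.checkAndMutate(row, cf, q, CompareOp.EQUAL,
  //       new BinaryComparator(Bytes.toBytes("expectedValue")), put, true);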
3577
3578   private void checkMutationType(final Mutation mutation, final byte [] row)
3579   throws DoNotRetryIOException {
3580     boolean isPut = mutation instanceof Put;
3581     if (!isPut && !(mutation instanceof Delete)) {
3582       throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action must be Put or Delete");
3583     }
3584     if (!Bytes.equals(row, mutation.getRow())) {
3585       throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action's getRow must match");
3586     }
3587   }
3588
3589   private boolean matches(final CompareOp compareOp, final int compareResult) {
3590     boolean matches = false;
3591     switch (compareOp) {
3592       case LESS:
3593         matches = compareResult < 0;
3594         break;
3595       case LESS_OR_EQUAL:
3596         matches = compareResult <= 0;
3597         break;
3598       case EQUAL:
3599         matches = compareResult == 0;
3600         break;
3601       case NOT_EQUAL:
3602         matches = compareResult != 0;
3603         break;
3604       case GREATER_OR_EQUAL:
3605         matches = compareResult >= 0;
3606         break;
3607       case GREATER:
3608         matches = compareResult > 0;
3609         break;
3610       default:
3611         throw new RuntimeException("Unknown Compare op " + compareOp.name());
3612     }
3613     return matches;
3614   }
3615
3616
3617   private void doBatchMutate(Mutation mutation) throws IOException {
3618     // Currently this is only called for puts and deletes, so no nonces.
3619     OperationStatus[] batchMutate = this.batchMutate(new Mutation[]{mutation});
3620     if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.SANITY_CHECK_FAILURE)) {
3621       throw new FailedSanityCheckException(batchMutate[0].getExceptionMsg());
3622     } else if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.BAD_FAMILY)) {
3623       throw new NoSuchColumnFamilyException(batchMutate[0].getExceptionMsg());
3624     }
3625   }
3626
3627   /**
3628    * Complete taking the snapshot on the region. Writes the region info and adds references to the
3629    * working snapshot directory.
3630    *
3631    * TODO for api consistency, consider adding another version with no {@link ForeignExceptionSnare}
3632    * arg.  (In the future other cancellable HRegion methods could eventually add a
3633    * {@link ForeignExceptionSnare}, or we could do something fancier).
3634    *
3635    * @param desc snapshot description object
3636    * @param exnSnare ForeignExceptionSnare that captures external exceptions in case we need to
3637    *   bail out.  This is allowed to be null and will just be ignored in that case.
3638    * @throws IOException if there is an external or internal error causing the snapshot to fail
3639    */
3640   public void addRegionToSnapshot(SnapshotDescription desc,
3641       ForeignExceptionSnare exnSnare) throws IOException {
3642     Path rootDir = FSUtils.getRootDir(conf);
3643     Path snapshotDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(desc, rootDir);
3644
3645     SnapshotManifest manifest = SnapshotManifest.create(conf, getFilesystem(),
3646             snapshotDir, desc, exnSnare);
3647     manifest.addRegion(this);
3648
3649     // The regionserver holding the first region of the table is responsible for taking the
3650     // manifest of the mob dir.
3651     if (!Bytes.equals(getRegionInfo().getStartKey(), HConstants.EMPTY_START_ROW))
3652       return;
3653
3654     // If any column family has MOB enabled, add the "mob region" to the manifest.
3655     List<Store> stores = getStores();
3656     for (Store store : stores) {
3657       boolean hasMobStore = store.getFamily().isMobEnabled();
3658       if (hasMobStore) {
3659         // use the .mob as the start key and 0 as the regionid
3660         HRegionInfo mobRegionInfo = MobUtils.getMobRegionInfo(this.getTableDesc().getTableName());
3661         mobRegionInfo.setOffline(true);
3662         manifest.addMobRegion(mobRegionInfo, this.getTableDesc().getColumnFamilies());
3663         return;
3664       }
3665     }
3666   }
3667
3668   @Override
3669   public void updateCellTimestamps(final Iterable<List<Cell>> cellItr, final byte[] now)
3670       throws IOException {
3671     for (List<Cell> cells: cellItr) {
3672       if (cells == null) continue;
3673       assert cells instanceof RandomAccess;
3674       int listSize = cells.size();
3675       for (int i = 0; i < listSize; i++) {
3676         CellUtil.updateLatestStamp(cells.get(i), now, 0);
3677       }
3678     }
3679   }
3680
3681   /**
3682    * Possibly rewrite incoming cell tags.
3683    */
3684   void rewriteCellTags(Map<byte[], List<Cell>> familyMap, final Mutation m) {
3685     // Check if we have any work to do and early out otherwise
3686     // Update these checks as more logic is added here
3687     if (m.getTTL() == Long.MAX_VALUE) {
3688       return;
3689     }
3690
3691     // From this point we know we have some work to do
3692     for (Map.Entry<byte[], List<Cell>> e: familyMap.entrySet()) {
3693       List<Cell> cells = e.getValue();
3694       assert cells instanceof RandomAccess;
3695       int listSize = cells.size();
3696       for (int i = 0; i < listSize; i++) {
3697         Cell cell = cells.get(i);
3698         List<Tag> newTags = TagUtil.carryForwardTags(null, cell);
3699         newTags = TagUtil.carryForwardTTLTag(newTags, m.getTTL());
3700         // Rewrite the cell with the updated set of tags
3701         cells.set(i, new TagRewriteCell(cell, TagUtil.fromList(newTags)));
3702       }
3703     }
3704   }
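  // [Editorial sketch, not part of the original source] rewriteCellTags() only does work when the
  // mutation carries a TTL; for example (illustrative names only):
  //
  //   Put put = new Put(Bytes.toBytes("row1"));
  //   put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("v"));
  //   put.setTTL(60000L);   // 1 minute; each cell is rewritten with a carried-forward TTL tag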
3705
3706   /*
3707    * Check whether we have the resources to support an update.
3708    *
3709    * We throw RegionTooBusyException if we are above the memstore limit
3710    * and expect the client to retry using some kind of backoff.
3711   */
3712   private void checkResources() throws RegionTooBusyException {
3713     // If catalog region, do not impose resource constraints or block updates.
3714     if (this.getRegionInfo().isMetaRegion()) return;
3715
3716     if (this.memstoreSize.get() > this.blockingMemStoreSize) {
3717       blockedRequestsCount.increment();
3718       requestFlush();
3719       throw new RegionTooBusyException("Above memstore limit, " +
3720           "regionName=" + (this.getRegionInfo() == null ? "unknown" :
3721           this.getRegionInfo().getRegionNameAsString()) +
3722           ", server=" + (this.getRegionServerServices() == null ? "unknown" :
3723           this.getRegionServerServices().getServerName()) +
3724           ", memstoreSize=" + memstoreSize.get() +
3725           ", blockingMemStoreSize=" + blockingMemStoreSize);
3726     }
3727   }
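  // [Editorial sketch, not part of the original source] How a caller might honor the retry/backoff
  // contract implied above (the HBase client does this internally; shown only for illustration,
  // with 'region', 'put' and 'maxRetries' assumed, and InterruptedException handling omitted):
  //
  //   long pauseMs = 100;
  //   for (int attempt = 0; attempt < maxRetries; attempt++) {
  //     try {
  //       region.put(put);
  //       break;
  //     } catch (RegionTooBusyException e) {
  //       Thread.sleep(pauseMs);                    // back off before retrying
  //       pauseMs = Math.min(pauseMs * 2, 10000);   // exponential backoff, capped
  //     }
  //   }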
3728
3729   /**
3730    * @throws IOException Throws exception if region is in read-only mode.
3731    */
3732   protected void checkReadOnly() throws IOException {
3733     if (isReadOnly()) {
3734       throw new DoNotRetryIOException("region is read only");
3735     }
3736   }
3737
3738   protected void checkReadsEnabled() throws IOException {
3739     if (!this.writestate.readsEnabled) {
3740       throw new IOException(getRegionInfo().getEncodedName()
3741         + ": The region's reads are disabled. Cannot serve the request");
3742     }
3743   }
3744
3745   public void setReadsEnabled(boolean readsEnabled) {
3746    if (readsEnabled && !this.writestate.readsEnabled) {
3747      LOG.info(getRegionInfo().getEncodedName() + " : Enabling reads for region.");
3748     }
3749     this.writestate.setReadsEnabled(readsEnabled);
3750   }
3751
3752   /**
3753    * Add updates first to the wal and then add values to memstore.
3754    * Warning: Assumption is caller has lock on passed in row.
3755    * @param edits Cell updates by column
3756    * @throws IOException
3757    */
3758   private void put(final byte [] row, byte [] family, List<Cell> edits)
3759   throws IOException {
3760     NavigableMap<byte[], List<Cell>> familyMap;
3761     familyMap = new TreeMap<byte[], List<Cell>>(Bytes.BYTES_COMPARATOR);
3762
3763     familyMap.put(family, edits);
3764     Put p = new Put(row);
3765     p.setFamilyCellMap(familyMap);
3766     doBatchMutate(p);
3767   }
3768
3769   /**
3770    * Atomically apply the given map of family->edits to the memstore.
3771    * This handles the consistency control on its own, but the caller
3772    * should already have locked updatesLock.readLock(). This also does
3773    * <b>not</b> check the families for validity.
3774    *
3775    * @param familyMap Map of Cells by family
3776    * @return the additional memory usage of the memstore caused by the new entries.
3777    */
3778   private long applyFamilyMapToMemstore(Map<byte[], List<Cell>> familyMap, boolean replay,
3779       long sequenceId)
3780   throws IOException {
3781     long size = 0;
3782     for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
3783       byte[] family = e.getKey();
3784       List<Cell> cells = e.getValue();
3785       assert cells instanceof RandomAccess;
3786       size += applyToMemstore(getStore(family), cells, false, replay, sequenceId);
3787     }
3788     return size;
3789   }
3790
3791   /**
3792    * @param delta If we are doing delta changes -- e.g. increment/append -- then this flag will be
3793    *  set; when set we will run operations that make sense in the increment/append scenario but
3794    *  that do not make sense otherwise.
3795    * @return Memstore change in size on insert of these Cells.
3796    * @see #applyToMemstore(Store, Cell, long)
3797    */
3798   private long applyToMemstore(final Store store, final List<Cell> cells,
3799       final boolean delta, boolean replay, long sequenceId)
3800   throws IOException {
3801     // Any change in how we update Store/MemStore needs to also be done in other applyToMemstore!!!!
3802     long size = 0;
3803     boolean upsert = delta && store.getFamily().getMaxVersions() == 1;
3804     int count = cells.size();
3805     if (upsert) {
3806       size += store.upsert(cells, getSmallestReadPoint());
3807     } else {
3808       for (int i = 0; i < count; i++) {
3809         Cell cell = cells.get(i);
3810         // TODO: This looks wrong.. checking for sequenceid of zero is expensive!!!!! St.Ack
3811         // When is it zero anyways? When replay? Then just rely on that flag.
3812         if (cell.getSequenceId() == 0 || replay) {
3813           CellUtil.setSequenceId(cell, sequenceId);
3814         }
3815         size += store.add(cell);
3816       }
3817     }
3818     return size;
3819   }
3820
3821   /**
3822    * @return Memstore change in size on insert of these Cells.
3823    * @see #applyToMemstore(Store, List, boolean, boolean, long)
3824    */
3825   private long applyToMemstore(final Store store, final Cell cell, long sequenceId)
3826   throws IOException {
3827     // Any change in how we update Store/MemStore needs to also be done in other applyToMemstore!!!!
3828     if (store == null) {
3829       checkFamily(CellUtil.cloneFamily(cell));
3830       // Unreachable because checkFamily will throw exception
3831     }
3832     return store.add(cell);
3833   }
3834
3835   @Override
3836   public void checkFamilies(Collection<byte[]> families) throws NoSuchColumnFamilyException {
3837     for (byte[] family : families) {
3838       checkFamily(family);
3839     }
3840   }
3841
3842   @Override
3843   public void checkTimestamps(final Map<byte[], List<Cell>> familyMap, long now)
3844       throws FailedSanityCheckException {
3845     if (timestampSlop == HConstants.LATEST_TIMESTAMP) {
3846       return;
3847     }
3848     long maxTs = now + timestampSlop;
3849     for (List<Cell> kvs : familyMap.values()) {
3850       assert kvs instanceof RandomAccess;
3851       int listSize  = kvs.size();
3852       for (int i=0; i < listSize; i++) {
3853         Cell cell = kvs.get(i);
3854         // see if the user-side TS is out of range. latest = server-side
3855         long ts = cell.getTimestamp();
3856         if (ts != HConstants.LATEST_TIMESTAMP && ts > maxTs) {
3857           throw new FailedSanityCheckException("Timestamp for KV out of range "
3858               + cell + " (too.new=" + timestampSlop + ")");
3859         }
3860       }
3861     }
3862   }
3863
3864   /**
3865    * Append the given map of family->edits to a WALEdit data structure.
3866    * This does not write to the WAL itself.
3867    * @param familyMap map of family->edits
3868    * @param walEdit the destination entry to append into
3869    */
3870   private void addFamilyMapToWALEdit(Map<byte[], List<Cell>> familyMap,
3871       WALEdit walEdit) {
3872     for (List<Cell> edits : familyMap.values()) {
3873       assert edits instanceof RandomAccess;
3874       int listSize = edits.size();
3875       for (int i=0; i < listSize; i++) {
3876         Cell cell = edits.get(i);
3877         walEdit.add(cell);
3878       }
3879     }
3880   }
3881
3882   private void requestFlushIfNeeded(long memstoreTotalSize) throws RegionTooBusyException {
3883     if (memstoreTotalSize > this.getMemstoreFlushSize()) {
3884       requestFlush();
3885     }
3886   }
3887
3888   private void requestFlush() {
3889     if (this.rsServices == null) {
3890       return;
3891     }
3892     synchronized (writestate) {
3893       if (this.writestate.isFlushRequested()) {
3894         return;
3895       }
3896       writestate.flushRequested = true;
3897     }
3898     // Make request outside of synchronize block; HBASE-818.
3899     this.rsServices.getFlushRequester().requestFlush(this, false);
3900     if (LOG.isDebugEnabled()) {
3901       LOG.debug("Flush requested on " + this.getRegionInfo().getEncodedName());
3902     }
3903   }
3904
3905   /*
3906    * @param size memstore size, in bytes
3907    * @return True if size is over the flush threshold
3908    */
3909   private boolean isFlushSize(final long size) {
3910     return size > this.memstoreFlushSize;
3911   }
3912
3913   /**
3914    * Read the edits put under this region by wal splitting process.  Put
3915    * the recovered edits back up into this region.
3916    *
3917    * <p>We can ignore any wal message that has a sequence ID that's equal to or
3918    * lower than minSeqId.  (Because we know such messages are already
3919    * reflected in the HFiles.)
3920    *
3921    * <p>While this is running we are putting pressure on memory yet we are
3922    * outside of our usual accounting because we are not yet an online region
3923    * (this stuff is being run as part of Region initialization).  This means
3924    * that if we're up against global memory limits, we'll not be flagged to flush
3925    * because we are not online. We can't be flushed by the usual mechanisms anyway;
3926    * we're not yet online so our relative sequenceids are not yet aligned with
3927    * WAL sequenceids -- not till we come up online, post processing of split
3928    * edits.
3929    *
3930    * <p>But to help relieve memory pressure, at least manage our own heap size by
3931    * flushing if we are in excess of per-region limits.  When flushing, though, we have
3932    * to be careful to avoid using the regionserver/wal sequenceid.  It is running
3933    * on a different line to what is going on here in this region context, so if we
3934    * crashed replaying these edits but in the midst had a flush that used the
3935    * regionserver wal with a sequenceid in excess of what is going on here
3936    * in this region and its split editlogs, then we could miss edits the
3937    * next time we go to recover. So, we have to flush inline, using seqids that
3938    * make sense in this single region context only -- until we come online.
3939    *
3940    * @param maxSeqIdInStores Any edit found in the split editlogs needs to be in excess of
3941    * the maxSeqId for the store to be applied, else it is skipped.
3942    * @return the sequence id of the last edit added to this region out of the
3943    * recovered edits log, or <code>minSeqIdForTheRegion</code> if nothing was added.
3944    * @throws IOException
3945    */
3946   protected long replayRecoveredEditsIfAny(final Path regiondir,
3947       Map<byte[], Long> maxSeqIdInStores,
3948       final CancelableProgressable reporter, final MonitoredTask status)
3949       throws IOException {
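         // minSeqIdForTheRegion is the smallest of the per-store maximum sequence ids; any
         // recovered edit at or below it is already persisted in an HFile and can be skipped.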
3950     long minSeqIdForTheRegion = -1;
3951     for (Long maxSeqIdInStore : maxSeqIdInStores.values()) {
3952       if (maxSeqIdInStore < minSeqIdForTheRegion || minSeqIdForTheRegion == -1) {
3953         minSeqIdForTheRegion = maxSeqIdInStore;
3954       }
3955     }
3956     long seqid = minSeqIdForTheRegion;
3957
3958     FileSystem fs = this.fs.getFileSystem();
3959     NavigableSet<Path> files = WALSplitter.getSplitEditFilesSorted(fs, regiondir);
3960     if (LOG.isDebugEnabled()) {
3961       LOG.debug("Found " + (files == null ? 0 : files.size())
3962         + " recovered edits file(s) under " + regiondir);
3963     }
3964
3965     if (files == null || files.isEmpty()) return seqid;
3966
3967     for (Path edits: files) {
3968       if (edits == null || !fs.exists(edits)) {
3969         LOG.warn("Null or non-existent edits file: " + edits);
3970         continue;
3971       }
3972       if (isZeroLengthThenDelete(fs, edits)) continue;
3973
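           // The recovered edits file name parses to a sequence id that bounds the edits inside,
           // so a whole file can be skipped cheaply when that bound is not beyond what the stores
           // already contain.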
3974       long maxSeqId;
3975       String fileName = edits.getName();
3976       maxSeqId = Math.abs(Long.parseLong(fileName));
3977       if (maxSeqId <= minSeqIdForTheRegion) {
3978         if (LOG.isDebugEnabled()) {
3979           String msg = "Maximum sequenceid for this wal is " + maxSeqId
3980             + " and minimum sequenceid for the region is " + minSeqIdForTheRegion
3981             + ", skipped the whole file, path=" + edits;
3982           LOG.debug(msg);
3983         }
3984         continue;
3985       }
3986
3987       try {
3988         // replay the edits. Replay can return -1 if everything is skipped, only update
3989         // if seqId is greater
3990         seqid = Math.max(seqid, replayRecoveredEdits(edits, maxSeqIdInStores, reporter));
3991       } catch (IOException e) {
3992         boolean skipErrors = conf.getBoolean(
3993             HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS,
3994             conf.getBoolean(
3995                 "hbase.skip.errors",
3996                 HConstants.DEFAULT_HREGION_EDITS_REPLAY_SKIP_ERRORS));
3997         if (conf.get("hbase.skip.errors") != null) {
3998           LOG.warn(
3999               "The property 'hbase.skip.errors' has been deprecated. Please use " +
4000               HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + " instead.");
4001         }
4002         if (skipErrors) {
4003           Path p = WALSplitter.moveAsideBadEditsFile(fs, edits);
4004           LOG.error(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS
4005               + "=true so continuing. Renamed " + edits +
4006               " as " + p, e);
4007         } else {
4008           throw e;
4009         }
4010       }
4011     }
4012     // The edits size added into rsAccounting during this replaying will not
4013     // be required any more. So just clear it.
4014     if (this.rsAccounting != null) {
4015       this.rsAccounting.clearRegionReplayEditsSize(getRegionInfo().getRegionName());
4016     }
4017     if (seqid > minSeqIdForTheRegion) {
4018       // Then we added some edits to memory. Flush and cleanup split edit files.
4019       internalFlushcache(null, seqid, stores.values(), status, false);
4020     }
4021     // Now delete the content of recovered edits.  We're done w/ them.
4022     if (files.size() > 0 && this.conf.getBoolean("hbase.region.archive.recovered.edits", false)) {
4023       // For debugging data loss issues!
4024       // If this flag is set, make use of the hfile archiving by making recovered.edits a fake
4025       // column family. Have to fake out file type too by casting our recovered.edits as storefiles
4026       String fakeFamilyName = WALSplitter.getRegionDirRecoveredEditsDir(regiondir).getName();
4027       Set<StoreFile> fakeStoreFiles = new HashSet<StoreFile>(files.size());
4028       for (Path file: files) {
4029         fakeStoreFiles.add(new StoreFile(getRegionFileSystem().getFileSystem(), file, this.conf,
4030           null, null));
4031       }
4032       getRegionFileSystem().removeStoreFiles(fakeFamilyName, fakeStoreFiles);
4033     } else {
4034       for (Path file: files) {
4035         if (!fs.delete(file, false)) {
4036           LOG.error("Failed delete of " + file);
4037         } else {
4038           LOG.debug("Deleted recovered.edits file=" + file);
4039         }
4040       }
4041     }
4042     return seqid;
4043   }
4044
4045   /*
4046    * @param edits File of recovered edits.
4047    * @param maxSeqIdInStores Maximum sequenceid found in each store.  Edits in wal
4048    * must be larger than this to be replayed for each store.
4049    * @param reporter
4050    * @return the sequence id of the last edit added to this region out of the
4051    * recovered edits log, or -1 if nothing could be read from the file.
4052    * @throws IOException
4053    */
4054   private long replayRecoveredEdits(final Path edits,
4055       Map<byte[], Long> maxSeqIdInStores, final CancelableProgressable reporter)
4056     throws IOException {
4057     String msg = "Replaying edits from " + edits;
4058     LOG.info(msg);
4059     MonitoredTask status = TaskMonitor.get().createStatus(msg);
4060     FileSystem fs = this.fs.getFileSystem();
4061
4062     status.setStatus("Opening recovered edits");
4063     WAL.Reader reader = null;
4064     try {
4065       reader = WALFactory.createReader(fs, edits, conf);
4066       long currentEditSeqId = -1;
4067       long currentReplaySeqId = -1;
4068       long firstSeqIdInLog = -1;
4069       long skippedEdits = 0;
4070       long editsCount = 0;
4071       long intervalEdits = 0;
4072       WAL.Entry entry;
4073       Store store = null;
4074       boolean reported_once = false;
4075       ServerNonceManager ng = this.rsServices == null ? null : this.rsServices.getNonceManager();
4076
4077       try {
4078         // How many edits seen before we check elapsed time
4079         int interval = this.conf.getInt("hbase.hstore.report.interval.edits", 2000);
4080         // How often to send a progress report (default 1/2 master timeout)
4081         int period = this.conf.getInt("hbase.hstore.report.period", 300000);
4082         long lastReport = EnvironmentEdgeManager.currentTime();
4083
4084         if (coprocessorHost != null) {
4085           coprocessorHost.preReplayWALs(this.getRegionInfo(), edits);
4086         }
4087
4088         while ((entry = reader.next()) != null) {
4089           WALKey key = entry.getKey();
4090           WALEdit val = entry.getEdit();
4091
4092           if (ng != null) { // ng is null in some tests, or when nonces are disabled
4093             ng.reportOperationFromWal(key.getNonceGroup(), key.getNonce(), key.getWriteTime());
4094           }
4095
4096           if (reporter != null) {
4097             intervalEdits += val.size();
4098             if (intervalEdits >= interval) {
4099               // Number of edits interval reached
4100               intervalEdits = 0;
4101               long cur = EnvironmentEdgeManager.currentTime();
4102               if (lastReport + period <= cur) {
4103                 status.setStatus("Replaying edits..." +
4104                     " skipped=" + skippedEdits +
4105                     " edits=" + editsCount);
4106                 // Report period elapsed
4107                 if(!reporter.progress()) {
4108                   msg = "Progressable reporter failed, stopping replay";
4109                   LOG.warn(msg);
4110                   status.abort(msg);
4111                   throw new IOException(msg);
4112                 }
4113                 reported_once = true;
4114                 lastReport = cur;
4115               }
4116             }
4117           }
4118
4119           if (firstSeqIdInLog == -1) {
4120             firstSeqIdInLog = key.getLogSeqNum();
4121           }
4122           if (currentEditSeqId > key.getLogSeqNum()) {
4123             // when this condition is true, it means we have a serious defect because we need to
4124             // maintain increasing SeqId for WAL edits per region
4125             LOG.error(getRegionInfo().getEncodedName() + " : "
4126                  + "Found decreasing SeqId. PreId=" + currentEditSeqId + " key=" + key
4127                 + "; edit=" + val);
4128           } else {
4129             currentEditSeqId = key.getLogSeqNum();
4130           }
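               // Prefer the original sequence number when the key carries one (e.g. an edit that
               // was itself written during an earlier replay); otherwise use this entry's own.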
4131           currentReplaySeqId = (key.getOrigLogSeqNum() > 0) ?
4132             key.getOrigLogSeqNum() : currentEditSeqId;
4133
4134           // Start coprocessor replay here. The coprocessor is for each WALEdit
4135           // instead of a KeyValue.
4136           if (coprocessorHost != null) {
4137             status.setStatus("Running pre-WAL-restore hook in coprocessors");
4138             if (coprocessorHost.preWALRestore(this.getRegionInfo(), key, val)) {
4139               // if bypass this wal entry, ignore it ...
4140               continue;
4141             }
4142           }
4143           boolean checkRowWithinBoundary = false;
4144           // Check this edit is for this region.
4145           if (!Bytes.equals(key.getEncodedRegionName(),
4146               this.getRegionInfo().getEncodedNameAsBytes())) {
4147             checkRowWithinBoundary = true;
4148           }
4149
4150           boolean flush = false;
4151           for (Cell cell: val.getCells()) {
4152             // Check this edit is for me. Also, guard against writing the special
4153             // METACOLUMN info such as HBASE::CACHEFLUSH entries
4154             if (CellUtil.matchingFamily(cell, WALEdit.METAFAMILY)) {
4155               // if region names don't match, skip replaying compaction marker
4156               if (!checkRowWithinBoundary) {
4157                 //this is a special edit, we should handle it
4158                 CompactionDescriptor compaction = WALEdit.getCompaction(cell);
4159                 if (compaction != null) {
4160                   //replay the compaction
4161                   replayWALCompactionMarker(compaction, false, true, Long.MAX_VALUE);
4162                 }
4163               }
4164               skippedEdits++;
4165               continue;
4166             }
4167             // Figure which store the edit is meant for.
4168             if (store == null || !CellUtil.matchingFamily(cell, store.getFamily().getName())) {
4169               store = getStore(cell);
4170             }
4171             if (store == null) {
4172               // This should never happen.  Perhaps schema was changed between
4173               // crash and redeploy?
4174               LOG.warn("No family for " + cell);
4175               skippedEdits++;
4176               continue;
4177             }
4178             if (checkRowWithinBoundary && !rowIsInRange(this.getRegionInfo(),
4179               cell.getRowArray(), cell.getRowOffset(), cell.getRowLength())) {
4180               LOG.warn("Row of " + cell + " is not within region boundary");
4181               skippedEdits++;
4182               continue;
4183             }
4184             // Now, figure if we should skip this edit.
4185             if (key.getLogSeqNum() <= maxSeqIdInStores.get(store.getFamily()
4186                 .getName())) {
4187               skippedEdits++;
4188               continue;
4189             }
4190             CellUtil.setSequenceId(cell, currentReplaySeqId);
4191
4192             // Once we are over the limit, restoreEdit will keep returning true to
4193             // flush -- but don't flush until we've played all the kvs that make up
4194             // the WALEdit.
4195             flush |= restoreEdit(store, cell);
4196             editsCount++;
4197           }
4198           if (flush) {
4199             internalFlushcache(null, currentEditSeqId, stores.values(), status, false);
4200           }
4201
4202           if (coprocessorHost != null) {
4203             coprocessorHost.postWALRestore(this.getRegionInfo(), key, val);
4204           }
4205         }
4206
4207         if (coprocessorHost != null) {
4208           coprocessorHost.postReplayWALs(this.getRegionInfo(), edits);
4209         }
4210       } catch (EOFException eof) {
4211         Path p = WALSplitter.moveAsideBadEditsFile(fs, edits);
4212         msg = "Encountered EOF. Most likely due to Master failure during " +
4213             "wal splitting, so we have this data in another edit.  " +
4214             "Continuing, but renaming " + edits + " as " + p;
4215         LOG.warn(msg, eof);
4216         status.abort(msg);
4217       } catch (IOException ioe) {
4218         // If the IOE resulted from bad file format,
4219         // then this problem is idempotent and retrying won't help
4220         if (ioe.getCause() instanceof ParseException) {
4221           Path p = WALSplitter.moveAsideBadEditsFile(fs, edits);
4222           msg = "File corruption encountered!  " +
4223               "Continuing, but renaming " + edits + " as " + p;
4224           LOG.warn(msg, ioe);
4225           status.setStatus(msg);
4226         } else {
4227           status.abort(StringUtils.stringifyException(ioe));
4228           // other IO errors may be transient (bad network connection,
4229           // checksum exception on one datanode, etc).  throw & retry
4230           throw ioe;
4231         }
4232       }
4233       if (reporter != null && !reported_once) {
4234         reporter.progress();
4235       }
4236       msg = "Applied " + editsCount + ", skipped " + skippedEdits +
4237         ", firstSequenceIdInLog=" + firstSeqIdInLog +
4238         ", maxSequenceIdInLog=" + currentEditSeqId + ", path=" + edits;
4239       status.markComplete(msg);
4240       LOG.debug(msg);
4241       return currentEditSeqId;
4242     } finally {
4243       status.cleanup();
4244       if (reader != null) {
4245          reader.close();
4246       }
4247     }
4248   }
4249
4250   /**
4251    * Call to complete a compaction. It is for the case where we find in the WAL a compaction
4252    * that was not finished.  We could find one recovering a WAL after a regionserver crash.
4253    * See HBASE-2331.
4254    */
4255   void replayWALCompactionMarker(CompactionDescriptor compaction, boolean pickCompactionFiles,
4256       boolean removeFiles, long replaySeqId)
4257       throws IOException {
4258     try {
4259       checkTargetRegion(compaction.getEncodedRegionName().toByteArray(),
4260         "Compaction marker from WAL ", compaction);
4261     } catch (WrongRegionException wre) {
4262       if (RegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4263         // skip the compaction marker since it is not for this region
4264         return;
4265       }
4266       throw wre;
4267     }
4268
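         // As in the other replay paths, writestate serves as a coarse-grained lock so replayed
         // events are applied one at a time and their sequence ids can be compared safely.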
4269     synchronized (writestate) {
4270       if (replaySeqId < lastReplayedOpenRegionSeqId) {
4271         LOG.warn(getRegionInfo().getEncodedName() + " : "
4272             + "Skipping replaying compaction event :" + TextFormat.shortDebugString(compaction)
4273             + " because its sequence id " + replaySeqId + " is smaller than this region's "
4274             + "lastReplayedOpenRegionSeqId of " + lastReplayedOpenRegionSeqId);
4275         return;
4276       }
4277       if (replaySeqId < lastReplayedCompactionSeqId) {
4278         LOG.warn(getRegionInfo().getEncodedName() + " : "
4279             + "Skipping replaying compaction event :" + TextFormat.shortDebugString(compaction)
4280             + " because its sequence id " + replaySeqId + " is smaller than this region's "
4281             + "lastReplayedCompactionSeqId of " + lastReplayedCompactionSeqId);
4282         return;
4283       } else {
4284         lastReplayedCompactionSeqId = replaySeqId;
4285       }
4286
4287       if (LOG.isDebugEnabled()) {
4288         LOG.debug(getRegionInfo().getEncodedName() + " : "
4289             + "Replaying compaction marker " + TextFormat.shortDebugString(compaction)
4290             + " with seqId=" + replaySeqId + " and lastReplayedOpenRegionSeqId="
4291             + lastReplayedOpenRegionSeqId);
4292       }
4293
4294       startRegionOperation(Operation.REPLAY_EVENT);
4295       try {
4296         Store store = this.getStore(compaction.getFamilyName().toByteArray());
4297         if (store == null) {
4298           LOG.warn(getRegionInfo().getEncodedName() + " : "
4299               + "Found Compaction WAL edit for deleted family:"
4300               + Bytes.toString(compaction.getFamilyName().toByteArray()));
4301           return;
4302         }
4303         store.replayCompactionMarker(compaction, pickCompactionFiles, removeFiles);
4304         logRegionFiles();
4305       } catch (FileNotFoundException ex) {
4306         LOG.warn(getRegionInfo().getEncodedName() + " : "
4307             + "At least one of the store files in compaction: "
4308             + TextFormat.shortDebugString(compaction)
4309             + " doesn't exist any more. Skip loading the file(s)", ex);
4310       } finally {
4311         closeRegionOperation(Operation.REPLAY_EVENT);
4312       }
4313     }
4314   }
4315
4316   void replayWALFlushMarker(FlushDescriptor flush, long replaySeqId) throws IOException {
4317     checkTargetRegion(flush.getEncodedRegionName().toByteArray(),
4318       "Flush marker from WAL ", flush);
4319
4320     if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4321       return; // if primary nothing to do
4322     }
4323
4324     if (LOG.isDebugEnabled()) {
4325       LOG.debug(getRegionInfo().getEncodedName() + " : "
4326           + "Replaying flush marker " + TextFormat.shortDebugString(flush));
4327     }
4328
4329     startRegionOperation(Operation.REPLAY_EVENT); // use region close lock to guard against close
4330     try {
4331       FlushAction action = flush.getAction();
4332       switch (action) {
4333       case START_FLUSH:
4334         replayWALFlushStartMarker(flush);
4335         break;
4336       case COMMIT_FLUSH:
4337         replayWALFlushCommitMarker(flush);
4338         break;
4339       case ABORT_FLUSH:
4340         replayWALFlushAbortMarker(flush);
4341         break;
4342       case CANNOT_FLUSH:
4343         replayWALFlushCannotFlushMarker(flush, replaySeqId);
4344         break;
4345       default:
4346         LOG.warn(getRegionInfo().getEncodedName() + " : " +
4347           "Received a flush event with unknown action, ignoring. " +
4348           TextFormat.shortDebugString(flush));
4349         break;
4350       }
4351
4352       logRegionFiles();
4353     } finally {
4354       closeRegionOperation(Operation.REPLAY_EVENT);
4355     }
4356   }
4357
4358   /** Replay the flush marker from primary region by creating a corresponding snapshot of
4359    * the store memstores, only if the memstores do not have a higher seqId from an earlier wal
4360    * edit (because the events may be coming out of order).
4361    */
4362   @VisibleForTesting
4363   PrepareFlushResult replayWALFlushStartMarker(FlushDescriptor flush) throws IOException {
4364     long flushSeqId = flush.getFlushSequenceNumber();
4365
4366     HashSet<Store> storesToFlush = new HashSet<Store>();
4367     for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) {
4368       byte[] family = storeFlush.getFamilyName().toByteArray();
4369       Store store = getStore(family);
4370       if (store == null) {
4371         LOG.warn(getRegionInfo().getEncodedName() + " : "
4372           + "Received a flush start marker from primary, but the family is not found. Ignoring"
4373           + " StoreFlushDescriptor:" + TextFormat.shortDebugString(storeFlush));
4374         continue;
4375       }
4376       storesToFlush.add(store);
4377     }
4378
4379     MonitoredTask status = TaskMonitor.get().createStatus("Preparing flush " + this);
4380
4381     // we will use writestate as a coarse-grain lock for all the replay events
4382     // (flush, compaction, region open etc)
4383     synchronized (writestate) {
4384       try {
4385         if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) {
4386           LOG.warn(getRegionInfo().getEncodedName() + " : "
4387               + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
4388               + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
4389               + " of " + lastReplayedOpenRegionSeqId);
4390           return null;
4391         }
4392         if (numMutationsWithoutWAL.get() > 0) {
4393           numMutationsWithoutWAL.set(0);
4394           dataInMemoryWithoutWAL.set(0);
4395         }
4396
4397         if (!writestate.flushing) {
4398           // we do not have an active snapshot or a corresponding this.prepareFlushResult. This means
4399           // we can just snapshot our memstores and continue as normal.
4400
4401           // invoke internalPrepareFlushCache. Send null as wal since we do not want the flush events in wal
4402           PrepareFlushResult prepareResult = internalPrepareFlushCache(null,
4403             flushSeqId, storesToFlush, status, false);
4404           if (prepareResult.result == null) {
4405             // save the PrepareFlushResult so that we can use it later from commit flush
4406             this.writestate.flushing = true;
4407             this.prepareFlushResult = prepareResult;
4408             status.markComplete("Flush prepare successful");
4409             if (LOG.isDebugEnabled()) {
4410               LOG.debug(getRegionInfo().getEncodedName() + " : "
4411                   + " Prepared flush with seqId:" + flush.getFlushSequenceNumber());
4412             }
4413           } else {
4414             // special case empty memstore. We will still save the flush result in this case, since
4415             // our memstore is empty, but the primary is still flushing
4416             if (prepareResult.getResult().getResult() ==
4417                   FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) {
4418               this.writestate.flushing = true;
4419               this.prepareFlushResult = prepareResult;
4420               if (LOG.isDebugEnabled()) {
4421                 LOG.debug(getRegionInfo().getEncodedName() + " : "
4422                   + " Prepared empty flush with seqId:" + flush.getFlushSequenceNumber());
4423               }
4424             }
4425             status.abort("Flush prepare failed with " + prepareResult.result);
4426             // nothing much to do. prepare flush failed for some reason.
4427           }
4428           return prepareResult;
4429         } else {
4430           // we already have an active snapshot.
4431           if (flush.getFlushSequenceNumber() == this.prepareFlushResult.flushOpSeqId) {
4432             // They define the same flush. Log and continue.
4433             LOG.warn(getRegionInfo().getEncodedName() + " : "
4434                 + "Received a flush prepare marker with the same seqId: "
4435                 + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4436                 + prepareFlushResult.flushOpSeqId + ". Ignoring");
4437             // ignore
4438           } else if (flush.getFlushSequenceNumber() < this.prepareFlushResult.flushOpSeqId) {
4439             // We received a flush with a smaller seqNum than what we have prepared. We can only
4440             // ignore this prepare flush request.
4441             LOG.warn(getRegionInfo().getEncodedName() + " : "
4442                 + "Received a flush prepare marker with a smaller seqId: "
4443                 + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4444                 + prepareFlushResult.flushOpSeqId + ". Ignoring");
4445             // ignore
4446           } else {
4447             // We received a flush with a larger seqNum than what we have prepared
4448             LOG.warn(getRegionInfo().getEncodedName() + " : "
4449                 + "Received a flush prepare marker with a larger seqId: "
4450                 + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4451                 + prepareFlushResult.flushOpSeqId + ". Ignoring");
4452             // We do not have multiple active snapshots in the memstore or a way to merge the current
4453             // memstore snapshot with the new contents and re-snapshot for now. We cannot take
4454             // another snapshot and drop the previous one because that would cause temporary
4455             // data loss in the secondary. So we ignore this for now, deferring the resolution
4456             // to happen when we see the corresponding flush commit marker. If we have a memstore
4457             // snapshot with x, and later receive another prepare snapshot with y (where x < y),
4458             // when we see the flush commit for y, we will drop the snapshot for x, and can also drop
4459             // all the memstore edits if everything in the memstore is < y. This is the usual case for
4460             // RS crash + recovery where we might see consecutive prepare flush wal markers.
4461             // Otherwise, this will cause more memory to be used in the secondary replica until a
4462             // further prepare + commit flush is seen and replayed.
4463           }
4464         }
4465       } finally {
4466         status.cleanup();
4467         writestate.notifyAll();
4468       }
4469     }
4470     return null;
4471   }
4472
4473   @VisibleForTesting
4474   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY",
4475     justification="Intentional; post memstore flush")
4476   void replayWALFlushCommitMarker(FlushDescriptor flush) throws IOException {
4477     MonitoredTask status = TaskMonitor.get().createStatus("Committing flush " + this);
4478
4479     // check whether we have the memstore snapshot with the corresponding seqId. Replays to
4480     // secondary region replicas are in order, except for when the region moves or when the
4481     // region server crashes. In those cases, we may receive replay requests out of order from
4482     // the original seqIds.
4483     synchronized (writestate) {
4484       try {
4485         if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) {
4486           LOG.warn(getRegionInfo().getEncodedName() + " : "
4487             + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
4488             + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
4489             + " of " + lastReplayedOpenRegionSeqId);
4490           return;
4491         }
4492
4493         if (writestate.flushing) {
4494           PrepareFlushResult prepareFlushResult = this.prepareFlushResult;
4495           if (flush.getFlushSequenceNumber() == prepareFlushResult.flushOpSeqId) {
4496             if (LOG.isDebugEnabled()) {
4497               LOG.debug(getRegionInfo().getEncodedName() + " : "
4498                   + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber()
4499                   + " and a previous prepared snapshot was found");
4500             }
4501             // This is the regular case where we received commit flush after prepare flush
4502             // corresponding to the same seqId.
4503             replayFlushInStores(flush, prepareFlushResult, true);
4504
4505             // Set down the memstore size by amount of flush.
4506             this.addAndGetGlobalMemstoreSize(-prepareFlushResult.totalFlushableSize);
4507
4508             this.prepareFlushResult = null;
4509             writestate.flushing = false;
4510           } else if (flush.getFlushSequenceNumber() < prepareFlushResult.flushOpSeqId) {
4511             // This should not happen normally. However, let's be safe and guard against these cases
4512             // we received a flush commit with a smaller seqId than what we have prepared
4513             // we will pick the flush file up from this commit (if we have not seen it), but we
4514             // will not drop the memstore
4515             LOG.warn(getRegionInfo().getEncodedName() + " : "
4516                 + "Received a flush commit marker with smaller seqId: "
4517                 + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: "
4518                 + prepareFlushResult.flushOpSeqId + ". Picking up new file, but not dropping"
4519                 + " prepared memstore snapshot");
4520             replayFlushInStores(flush, prepareFlushResult, false);
4521
4522             // snapshot is not dropped, so memstore sizes should not be decremented
4523             // we still have the prepared snapshot, flushing should still be true
4524           } else {
4525             // This should not happen normally. However, let's be safe and guard against these cases
4526             // we received a flush commit with a larger seqId than what we have prepared
4527             // we will pick the flush file for this. We will also obtain the updates lock and
4528             // look for contents of the memstore to see whether we have edits after this seqId.
4529             // If not, we will drop all the memstore edits and the snapshot as well.
4530             LOG.warn(getRegionInfo().getEncodedName() + " : "
4531                 + "Received a flush commit marker with larger seqId: "
4532                 + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: " +
4533                 prepareFlushResult.flushOpSeqId + ". Picking up new file and dropping prepared"
4534                 + " memstore snapshot");
4535
4536             replayFlushInStores(flush, prepareFlushResult, true);
4537
4538             // Set down the memstore size by amount of flush.
4539             this.addAndGetGlobalMemstoreSize(-prepareFlushResult.totalFlushableSize);
4540
4541             // Inspect the memstore contents to see whether the memstore contains only edits
4542             // with seqId smaller than the flush seqId. If so, we can discard those edits.
4543             dropMemstoreContentsForSeqId(flush.getFlushSequenceNumber(), null);
4544
4545             this.prepareFlushResult = null;
4546             writestate.flushing = false;
4547           }
4548           // If we were waiting to observe a flush or region opening event so as not to show
4549           // partial data after a secondary region crash, we can allow reads now. We can only be
4550           // sure that we are not showing partial data (for example skipping some previous edits)
4551           // once we observe a full flush start and flush commit. So if we were not able to find
4552           // a previous flush we will not enable reads now.
4553           this.setReadsEnabled(true);
4554         } else {
4555           LOG.warn(getRegionInfo().getEncodedName() + " : "
4556               + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber()
4557               + ", but no previous prepared snapshot was found");
4558           // There is no corresponding prepare snapshot from before.
4559           // We will pick up the new flushed file
4560           replayFlushInStores(flush, null, false);
4561
4562           // Inspect the memstore contents to see whether the memstore contains only edits
4563           // with seqId smaller than the flush seqId. If so, we can discard those edits.
4564           dropMemstoreContentsForSeqId(flush.getFlushSequenceNumber(), null);
4565         }
4566
4567         status.markComplete("Flush commit successful");
4568
4569         // Update the last flushed sequence id for region.
4570         this.maxFlushedSeqId = flush.getFlushSequenceNumber();
4571
4572         // advance the mvcc read point so that the new flushed file is visible.
4573         mvcc.advanceTo(flush.getFlushSequenceNumber());
4574
4575       } catch (FileNotFoundException ex) {
4576         LOG.warn(getRegionInfo().getEncodedName() + " : "
4577             + "At least one of the store files in flush: " + TextFormat.shortDebugString(flush)
4578             + " doesn't exist any more. Skip loading the file(s)", ex);
4579       }
4580       finally {
4581         status.cleanup();
4582         writestate.notifyAll();
4583       }
4584     }
4585
4586     // C. Finally notify anyone waiting on memstore to clear:
4587     // e.g. checkResources().
4588     synchronized (this) {
4589       notifyAll(); // FindBugs NN_NAKED_NOTIFY
4590     }
4591   }
4592
4593   /**
4594    * Replays the given flush descriptor by opening the flush files in stores and dropping the
4595    * memstore snapshots if requested.
4596    * @param flush the flush descriptor received from the primary
4597    * @param prepareFlushResult the locally prepared flush, or null if none was prepared
4598    * @param dropMemstoreSnapshot whether to drop the prepared memstore snapshot after replaying
4599    * @throws IOException
4600    */
4601   private void replayFlushInStores(FlushDescriptor flush, PrepareFlushResult prepareFlushResult,
4602       boolean dropMemstoreSnapshot)
4603       throws IOException {
4604     for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) {
4605       byte[] family = storeFlush.getFamilyName().toByteArray();
4606       Store store = getStore(family);
4607       if (store == null) {
4608         LOG.warn(getRegionInfo().getEncodedName() + " : "
4609             + "Received a flush commit marker from primary, but the family is not found."
4610             + " Ignoring StoreFlushDescriptor:" + storeFlush);
4611         continue;
4612       }
4613       List<String> flushFiles = storeFlush.getFlushOutputList();
4614       StoreFlushContext ctx = null;
4615       long startTime = EnvironmentEdgeManager.currentTime();
4616       if (prepareFlushResult == null || prepareFlushResult.storeFlushCtxs == null) {
4617         ctx = store.createFlushContext(flush.getFlushSequenceNumber());
4618       } else {
4619         ctx = prepareFlushResult.storeFlushCtxs.get(family);
4620         startTime = prepareFlushResult.startTime;
4621       }
4622
4623       if (ctx == null) {
4624         LOG.warn(getRegionInfo().getEncodedName() + " : "
4625             + "Unexpected: flush commit marker received from store "
4626             + Bytes.toString(family) + " but no associated flush context. Ignoring");
4627         continue;
4628       }
4629
4630       ctx.replayFlush(flushFiles, dropMemstoreSnapshot); // replay the flush
4631
4632       // Record latest flush time
4633       this.lastStoreFlushTimeMap.put(store, startTime);
4634     }
4635   }
4636
4637   /**
4638    * Drops the memstore contents after replaying a flush descriptor or region open event
4639    * if the memstore edits have seqNums smaller than the given seq id.
4640    * @throws IOException
4641    */
4642   private long dropMemstoreContentsForSeqId(long seqId, Store store) throws IOException {
4643     long totalFreedSize = 0;
4644     this.updatesLock.writeLock().lock();
4645     try {
4646
4647       long currentSeqId = mvcc.getReadPoint();
4648       if (seqId >= currentSeqId) {
4649         // then we can drop the memstore contents since everything is below this seqId
4650         LOG.info(getRegionInfo().getEncodedName() + " : "
4651             + "Dropping memstore contents as well since replayed flush seqId: "
4652             + seqId + " is greater than current seqId:" + currentSeqId);
4653
4654         // Prepare flush (take a snapshot) and then abort (drop the snapshot)
4655         if (store == null) {
4656           for (Store s : stores.values()) {
4657             totalFreedSize += doDropStoreMemstoreContentsForSeqId(s, currentSeqId);
4658           }
4659         } else {
4660           totalFreedSize += doDropStoreMemstoreContentsForSeqId(store, currentSeqId);
4661         }
4662       } else {
4663         LOG.info(getRegionInfo().getEncodedName() + " : "
4664             + "Not dropping memstore contents since replayed flush seqId: "
4665             + seqId + " is smaller than current seqId:" + currentSeqId);
4666       }
4667     } finally {
4668       this.updatesLock.writeLock().unlock();
4669     }
4670     return totalFreedSize;
4671   }
4672
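       /**
        * Drops whatever is currently flushable in the given store by taking a memstore snapshot
        * and then aborting it, and returns the heap size that was freed.
        */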
4673   private long doDropStoreMemstoreContentsForSeqId(Store s, long currentSeqId) throws IOException {
4674     long snapshotSize = s.getFlushableSize();
4675     this.addAndGetGlobalMemstoreSize(-snapshotSize);
4676     StoreFlushContext ctx = s.createFlushContext(currentSeqId);
4677     ctx.prepare();
4678     ctx.abort();
4679     return snapshotSize;
4680   }
4681
4682   private void replayWALFlushAbortMarker(FlushDescriptor flush) {
4683     // nothing to do for now. A flush abort will cause a RS abort which means that the region
4684     // will be opened somewhere else later. We will see the region open event soon, and replaying
4685     // that will drop the snapshot
4686   }
4687
4688   private void replayWALFlushCannotFlushMarker(FlushDescriptor flush, long replaySeqId) {
4689     synchronized (writestate) {
4690       if (this.lastReplayedOpenRegionSeqId > replaySeqId) {
4691         LOG.warn(getRegionInfo().getEncodedName() + " : "
4692           + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
4693           + " because its sequence id " + replaySeqId + " is smaller than this region's "
4694           + "lastReplayedOpenRegionSeqId of " + lastReplayedOpenRegionSeqId);
4695         return;
4696       }
4697
4698       // If we were waiting to observe a flush or region opening event so as not to show partial
4699       // data after a secondary region crash, we can allow reads now. This event means that the
4700       // primary was not able to flush because the memstore was empty when we requested the flush.
4701       // By the time we observe this, we are guaranteed to have an up-to-date seqId with our
4702       // previous assignment.
4703       this.setReadsEnabled(true);
4704     }
4705   }
4706
4707   @VisibleForTesting
4708   PrepareFlushResult getPrepareFlushResult() {
4709     return prepareFlushResult;
4710   }
4711
4712   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY",
4713       justification="Intentional; cleared the memstore")
4714   void replayWALRegionEventMarker(RegionEventDescriptor regionEvent) throws IOException {
4715     checkTargetRegion(regionEvent.getEncodedRegionName().toByteArray(),
4716       "RegionEvent marker from WAL ", regionEvent);
4717
4718     startRegionOperation(Operation.REPLAY_EVENT);
4719     try {
4720       if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4721         return; // if primary nothing to do
4722       }
4723
4724       if (regionEvent.getEventType() == EventType.REGION_CLOSE) {
4725         // nothing to do on REGION_CLOSE for now.
4726         return;
4727       }
4728       if (regionEvent.getEventType() != EventType.REGION_OPEN) {
4729         LOG.warn(getRegionInfo().getEncodedName() + " : "
4730             + "Unknown region event received, ignoring :"
4731             + TextFormat.shortDebugString(regionEvent));
4732         return;
4733       }
4734
4735       if (LOG.isDebugEnabled()) {
4736         LOG.debug(getRegionInfo().getEncodedName() + " : "
4737           + "Replaying region open event marker " + TextFormat.shortDebugString(regionEvent));
4738       }
4739
4740       // we will use writestate as a coarse-grain lock for all the replay events
4741       synchronized (writestate) {
4742         // Replication can deliver events out of order when primary region moves or the region
4743         // server crashes, since there is no coordination between replication of different wal files
4744         // belonging to different region servers. We have to safeguard against this case by using the
4745         // region open event's seqid. Since this is the first event that the region puts (after
4746         // possibly flushing recovered.edits), after seeing this event, we can ignore every edit
4747         // smaller than this seqId
4748         if (this.lastReplayedOpenRegionSeqId <= regionEvent.getLogSequenceNumber()) {
4749           this.lastReplayedOpenRegionSeqId = regionEvent.getLogSequenceNumber();
4750         } else {
4751           LOG.warn(getRegionInfo().getEncodedName() + " : "
4752             + "Skipping replaying region event :" + TextFormat.shortDebugString(regionEvent)
4753             + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
4754             + " of " + lastReplayedOpenRegionSeqId);
4755           return;
4756         }
4757
4758         // region open lists all the files that the region has at the time of the opening. Just pick
4759         // all the files and drop prepared flushes and empty memstores
4760         for (StoreDescriptor storeDescriptor : regionEvent.getStoresList()) {
4761           // stores of primary may be different now
4762           byte[] family = storeDescriptor.getFamilyName().toByteArray();
4763           Store store = getStore(family);
4764           if (store == null) {
4765             LOG.warn(getRegionInfo().getEncodedName() + " : "
4766                 + "Received a region open marker from primary, but the family is not found. "
4767                 + "Ignoring. StoreDescriptor:" + storeDescriptor);
4768             continue;
4769           }
4770
4771           long storeSeqId = store.getMaxSequenceId();
4772           List<String> storeFiles = storeDescriptor.getStoreFileList();
4773           try {
4774             store.refreshStoreFiles(storeFiles); // replace the files with the new ones
4775           } catch (FileNotFoundException ex) {
4776             LOG.warn(getRegionInfo().getEncodedName() + " : "
4777                     + "At least one of the store files: " + storeFiles
4778                     + " doesn't exist any more. Skip loading the file(s)", ex);
4779             continue;
4780           }
4781           if (store.getMaxSequenceId() != storeSeqId) {
4782             // Record latest flush time if we picked up new files
4783             lastStoreFlushTimeMap.put(store, EnvironmentEdgeManager.currentTime());
4784           }
4785
4786           if (writestate.flushing) {
4787             // only drop memstore snapshots if they are smaller than last flush for the store
4788             if (this.prepareFlushResult.flushOpSeqId <= regionEvent.getLogSequenceNumber()) {
4789               StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null ?
4790                   null : this.prepareFlushResult.storeFlushCtxs.get(family);
4791               if (ctx != null) {
4792                 long snapshotSize = store.getFlushableSize();
4793                 ctx.abort();
4794                 this.addAndGetGlobalMemstoreSize(-snapshotSize);
4795                 this.prepareFlushResult.storeFlushCtxs.remove(family);
4796               }
4797             }
4798           }
4799
4800           // Drop the memstore contents if they are now smaller than the latest seen flushed file
4801           dropMemstoreContentsForSeqId(regionEvent.getLogSequenceNumber(), store);
4802           if (storeSeqId > this.maxFlushedSeqId) {
4803             this.maxFlushedSeqId = storeSeqId;
4804           }
4805         }
4806
4807         // if all stores ended up dropping their snapshots, we can safely drop the
4808         // prepareFlushResult
4809         dropPrepareFlushIfPossible();
4810
4811         // advance the mvcc read point so that the new flushed file is visible.
4812         mvcc.await();
4813
4814         // If we were waiting for observing a flush or region opening event for not showing partial
4815         // data after a secondary region crash, we can allow reads now.
4816         this.setReadsEnabled(true);
4817
4818         // C. Finally notify anyone waiting on memstore to clear:
4819         // e.g. checkResources().
4820         synchronized (this) {
4821           notifyAll(); // FindBugs NN_NAKED_NOTIFY
4822         }
4823       }
4824       logRegionFiles();
4825     } finally {
4826       closeRegionOperation(Operation.REPLAY_EVENT);
4827     }
4828   }
4829
4830   void replayWALBulkLoadEventMarker(WALProtos.BulkLoadDescriptor bulkLoadEvent) throws IOException {
4831     checkTargetRegion(bulkLoadEvent.getEncodedRegionName().toByteArray(),
4832       "BulkLoad marker from WAL ", bulkLoadEvent);
4833
4834     if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4835       return; // if primary nothing to do
4836     }
4837
4838     if (LOG.isDebugEnabled()) {
4839       LOG.debug(getRegionInfo().getEncodedName() + " : "
4840               +  "Replaying bulkload event marker " + TextFormat.shortDebugString(bulkLoadEvent));
4841     }
4842     // check if multiple families involved
4843     boolean multipleFamilies = false;
4844     byte[] family = null;
4845     for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) {
4846       byte[] fam = storeDescriptor.getFamilyName().toByteArray();
4847       if (family == null) {
4848         family = fam;
4849       } else if (!Bytes.equals(family, fam)) {
4850         multipleFamilies = true;
4851         break;
4852       }
4853     }
4854
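         // Presumably a bulk load touching more than one family needs the stronger form of the
         // bulk region operation so that the load appears atomic across stores.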
4855     startBulkRegionOperation(multipleFamilies);
4856     try {
4857       // we will use writestate as a coarse-grain lock for all the replay events
4858       synchronized (writestate) {
4859         // Replication can deliver events out of order when primary region moves or the region
4860         // server crashes, since there is no coordination between replication of different wal files
4861         // belonging to different region servers. We have to safeguard against this case by using the
4862         // region open event's seqid. Since this is the first event that the region puts (after
4863         // possibly flushing recovered.edits), after seeing this event, we can ignore every edit
4864         // smaller than this seqId
4865         if (bulkLoadEvent.getBulkloadSeqNum() >= 0
4866             && this.lastReplayedOpenRegionSeqId >= bulkLoadEvent.getBulkloadSeqNum()) {
4867           LOG.warn(getRegionInfo().getEncodedName() + " : "
4868               + "Skipping replaying bulkload event :"
4869               + TextFormat.shortDebugString(bulkLoadEvent)
4870               + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
4871               + " =" + lastReplayedOpenRegionSeqId);
4872
4873           return;
4874         }
4875
4876         for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) {
4877           // stores of primary may be different now
4878           family = storeDescriptor.getFamilyName().toByteArray();
4879           Store store = getStore(family);
4880           if (store == null) {
4881             LOG.warn(getRegionInfo().getEncodedName() + " : "
4882                     + "Received a bulk load marker from primary, but the family is not found. "
4883                     + "Ignoring. StoreDescriptor:" + storeDescriptor);
4884             continue;
4885           }
4886
4887           List<String> storeFiles = storeDescriptor.getStoreFileList();
4888           for (String storeFile : storeFiles) {
4889             StoreFileInfo storeFileInfo = null;
4890             try {
4891               storeFileInfo = fs.getStoreFileInfo(Bytes.toString(family), storeFile);
4892               store.bulkLoadHFile(storeFileInfo);
4893             } catch(FileNotFoundException ex) {
4894               LOG.warn(getRegionInfo().getEncodedName() + " : "
4895                       + ((storeFileInfo != null) ? storeFileInfo.toString() :
4896                             (new Path(Bytes.toString(family), storeFile)).toString())
4897                       + " doesn't exist any more. Skip loading the file");
4898             }
4899           }
4900         }
4901       }
4902       if (bulkLoadEvent.getBulkloadSeqNum() > 0) {
4903         mvcc.advanceTo(bulkLoadEvent.getBulkloadSeqNum());
4904       }
4905     } finally {
4906       closeBulkRegionOperation();
4907     }
4908   }
4909
4910   /**
4911    * If all stores ended up dropping their snapshots, we can safely drop the prepareFlushResult
4912    */
4913   private void dropPrepareFlushIfPossible() {
4914     if (writestate.flushing) {
4915       boolean canDrop = true;
4916       if (prepareFlushResult.storeFlushCtxs != null) {
4917         for (Entry<byte[], StoreFlushContext> entry
4918             : prepareFlushResult.storeFlushCtxs.entrySet()) {
4919           Store store = getStore(entry.getKey());
4920           if (store == null) {
4921             continue;
4922           }
4923           if (store.getSnapshotSize() > 0) {
4924             canDrop = false;
4925             break;
4926           }
4927         }
4928       }
4929
4930       // this means that all the stores in the region have finished flushing, but the WAL marker
4931       // may not have been written or we did not receive it yet.
4932       if (canDrop) {
4933         writestate.flushing = false;
4934         this.prepareFlushResult = null;
4935       }
4936     }
4937   }
4938
4939   @Override
4940   public boolean refreshStoreFiles() throws IOException {
4941     return refreshStoreFiles(false);
4942   }
4943
4944   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY",
4945       justification="Notify is about post replay. Intentional")
4946   protected boolean refreshStoreFiles(boolean force) throws IOException {
4947     if (!force && ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4948       return false; // if primary nothing to do
4949     }
4950
4951     if (LOG.isDebugEnabled()) {
4952       LOG.debug(getRegionInfo().getEncodedName() + " : "
4953           + "Refreshing store files to see whether we can free up memstore");
4954     }
4955
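         // totalFreedSize accumulates the memstore/snapshot heap we manage to drop; a non-zero
         // total is what tells the caller that refreshing the store files actually freed memory.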
4956     long totalFreedSize = 0;
4957
4958     long smallestSeqIdInStores = Long.MAX_VALUE;
4959
4960     startRegionOperation(); // obtain region close lock
4961     try {
4962       synchronized (writestate) {
4963         for (Store store : getStores()) {
4964           // TODO: some stores might see new data from flush, while others do not which
4965           // MIGHT break atomic edits across column families.
4966           long maxSeqIdBefore = store.getMaxSequenceId();
4967
4968           // refresh the store files. This is similar to observing a region open wal marker.
4969           store.refreshStoreFiles();
4970
4971           long storeSeqId = store.getMaxSequenceId();
4972           if (storeSeqId < smallestSeqIdInStores) {
4973             smallestSeqIdInStores = storeSeqId;
4974           }
4975
4976           // see whether we can drop the memstore or the snapshot
4977           if (storeSeqId > maxSeqIdBefore) {
4978
4979             if (writestate.flushing) {
4980               // only drop memstore snapshots if they are smaller than last flush for the store
4981               if (this.prepareFlushResult.flushOpSeqId <= storeSeqId) {
4982                 StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null ?
4983                     null : this.prepareFlushResult.storeFlushCtxs.get(store.getFamily().getName());
4984                 if (ctx != null) {
4985                   long snapshotSize = store.getFlushableSize();
4986                   ctx.abort();
4987                   this.addAndGetGlobalMemstoreSize(-snapshotSize);
4988                   this.prepareFlushResult.storeFlushCtxs.remove(store.getFamily().getName());
4989                   totalFreedSize += snapshotSize;
4990                 }
4991               }
4992             }
4993
4994             // Drop the memstore contents if they are now smaller than the latest seen flushed file
4995             totalFreedSize += dropMemstoreContentsForSeqId(storeSeqId, store);
4996           }
4997         }
4998
4999         // if all stores ended up dropping their snapshots, we can safely drop the
5000         // prepareFlushResult
5001         dropPrepareFlushIfPossible();
5002
5003         // advance the mvcc read point so that the new flushed files are visible;
5004         // either greater than flush seq number or they were already picked up via flush.
5005         for (Store s : getStores()) {
5006           mvcc.advanceTo(s.getMaxMemstoreTS());
5007         }
5008
5009
5010         // smallestSeqIdInStores is the seqId that we have a corresponding hfile for. We can safely
5011         // skip any edit that is to be replayed in the future and has a smaller seqId
5012         // than this. We are updating lastReplayedOpenRegionSeqId so that we can skip all edits
5013         // that we have already picked the flush files for.
5014         if (this.lastReplayedOpenRegionSeqId < smallestSeqIdInStores) {
5015           this.lastReplayedOpenRegionSeqId = smallestSeqIdInStores;
5016         }
5017       }
5018       // C. Finally notify anyone waiting on memstore to clear:
5019       // e.g. checkResources().
5020       synchronized (this) {
5021         notifyAll(); // FindBugs NN_NAKED_NOTIFY
5022       }
5023       return totalFreedSize > 0;
5024     } finally {
5025       closeRegionOperation();
5026     }
5027   }
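
  // Illustrative usage sketch (not part of this class): on a secondary region replica, a
  // caller such as a periodic store file refresh task can pick up files flushed or compacted
  // by the primary and possibly free memstore memory. The name "region" is assumed to be an
  // HRegion reference in scope:
  //
  //   boolean freedMemory = region.refreshStoreFiles();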
5028
5029   private void logRegionFiles() {
5030     if (LOG.isTraceEnabled()) {
5031       LOG.trace(getRegionInfo().getEncodedName() + " : Store files for region: ");
5032       for (Store s : stores.values()) {
5033         Collection<StoreFile> storeFiles = s.getStorefiles();
5034         if (storeFiles == null) continue;
5035         for (StoreFile sf : storeFiles) {
5036           LOG.trace(getRegionInfo().getEncodedName() + " : " + sf);
5037         }
5038       }
5039     }
5040   }
5041
5042   /** Checks whether the given regionName either equals our region's name, or names the
5043    * primary region of the range that this secondary replica serves.
5044    */
5045   private void checkTargetRegion(byte[] encodedRegionName, String exceptionMsg, Object payload)
5046       throws WrongRegionException {
5047     if (Bytes.equals(this.getRegionInfo().getEncodedNameAsBytes(), encodedRegionName)) {
5048       return;
5049     }
5050
5051     if (!RegionReplicaUtil.isDefaultReplica(this.getRegionInfo()) &&
5052         Bytes.equals(encodedRegionName,
5053           this.fs.getRegionInfoForFS().getEncodedNameAsBytes())) {
5054       return;
5055     }
5056
5057     throw new WrongRegionException(exceptionMsg + payload
5058       + " targeted for region " + Bytes.toStringBinary(encodedRegionName)
5059       + " does not match this region: " + this.getRegionInfo());
5060   }
5061
5062   /**
5063    * Used by tests
5064    * @param s Store to add edit to.
5065    * @param cell Cell to add.
5066    * @return True if we should flush.
5067    */
5068   protected boolean restoreEdit(final Store s, final Cell cell) {
5069     long kvSize = s.add(cell);
5070     if (this.rsAccounting != null) {
5071       rsAccounting.addAndGetRegionReplayEditsSize(getRegionInfo().getRegionName(), kvSize);
5072     }
5073     return isFlushSize(this.addAndGetGlobalMemstoreSize(kvSize));
5074   }
5075
5076   /*
5077    * @param fs Filesystem the file lives on.
5078    * @param p File to check.
5079    * @return True if file was zero-length (and if so, we'll delete it in here).
5080    * @throws IOException
5081    */
5082   private static boolean isZeroLengthThenDelete(final FileSystem fs, final Path p)
5083       throws IOException {
5084     FileStatus stat = fs.getFileStatus(p);
5085     if (stat.getLen() > 0) return false;
5086     LOG.warn("File " + p + " is zero-length, deleting.");
5087     fs.delete(p, false);
5088     return true;
5089   }
5090
5091   protected HStore instantiateHStore(final HColumnDescriptor family) throws IOException {
5092     if (family.isMobEnabled()) {
5093       if (HFile.getFormatVersion(this.conf) < HFile.MIN_FORMAT_VERSION_WITH_TAGS) {
5094         throw new IOException("A minimum HFile version of "
5095             + HFile.MIN_FORMAT_VERSION_WITH_TAGS
5096             + " is required for MOB feature. Consider setting " + HFile.FORMAT_VERSION_KEY
5097             + " accordingly.");
5098       }
5099       return new HMobStore(this, family, this.conf);
5100     }
5101     return new HStore(this, family, this.conf);
5102   }
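
  // A minimal configuration sketch for MOB-capable HFiles, assuming "conf" is the site
  // configuration (illustrative only; the column family must also have MOB enabled in its
  // descriptor):
  //
  //   conf.setInt(HFile.FORMAT_VERSION_KEY, HFile.MIN_FORMAT_VERSION_WITH_TAGS);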
5103
5104   @Override
5105   public Store getStore(final byte[] column) {
5106     return this.stores.get(column);
5107   }
5108
5109   /**
5110    * Return the HStore instance for the given cell's family. Does not do any copy: as the
5111    * number of stores is limited, we iterate over the map.
5112    */
5113   private Store getStore(Cell cell) {
5114     for (Map.Entry<byte[], Store> famStore : stores.entrySet()) {
5115       if (Bytes.equals(
5116           cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength(),
5117           famStore.getKey(), 0, famStore.getKey().length)) {
5118         return famStore.getValue();
5119       }
5120     }
5121
5122     return null;
5123   }
5124
5125   @Override
5126   public List<Store> getStores() {
5127     List<Store> list = new ArrayList<Store>(stores.size());
5128     list.addAll(stores.values());
5129     return list;
5130   }
5131
5132   @Override
5133   public List<String> getStoreFileList(final byte [][] columns)
5134     throws IllegalArgumentException {
5135     List<String> storeFileNames = new ArrayList<String>();
5136     synchronized(closeLock) {
5137       for(byte[] column : columns) {
5138         Store store = this.stores.get(column);
5139         if (store == null) {
5140           throw new IllegalArgumentException("No column family : " +
5141               new String(column) + " available");
5142         }
5143         Collection<StoreFile> storeFiles = store.getStorefiles();
5144         if (storeFiles == null) continue;
5145         for (StoreFile storeFile: storeFiles) {
5146           storeFileNames.add(storeFile.getPath().toString());
5147         }
5148
5149         logRegionFiles();
5150       }
5151     }
5152     return storeFileNames;
5153   }
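
  // Illustrative only, assuming "region" is in scope: list the store file paths for a single,
  // hypothetical column family named "cf":
  //
  //   List<String> files = region.getStoreFileList(new byte[][] { Bytes.toBytes("cf") });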
5154
5155   //////////////////////////////////////////////////////////////////////////////
5156   // Support code
5157   //////////////////////////////////////////////////////////////////////////////
5158
5159   /** Make sure this is a valid row for the HRegion */
5160   void checkRow(final byte [] row, String op) throws IOException {
5161     if (!rowIsInRange(getRegionInfo(), row)) {
5162       throw new WrongRegionException("Requested row out of range for " +
5163           op + " on HRegion " + this + ", startKey='" +
5164           Bytes.toStringBinary(getRegionInfo().getStartKey()) + "', getEndKey()='" +
5165           Bytes.toStringBinary(getRegionInfo().getEndKey()) + "', row='" +
5166           Bytes.toStringBinary(row) + "'");
5167     }
5168   }
5169
5170
5171   /**
5172    * Get an exclusive (write) lock on a given row.
5173    * @param row Which row to lock.
5174    * @return A locked RowLock. The lock is exclusive and already acquired.
5175    * @throws IOException
5176    */
5177   public RowLock getRowLock(byte[] row) throws IOException {
5178     return getRowLock(row, false);
5179   }
5180
5181   /**
5182    *
5183    * Get a row lock for the specified row. All locks are reentrant.
5184    *
5185    * Before calling this function make sure that a region operation has already been
5186    * started (the calling thread has already acquired the region-close-guard lock).
5187    * @param row The row that actions will be performed against
5188    * @param readLock is the lock a read or a write lock. True indicates that a non-exclusive
5189    *                 (read) lock is requested
5190    */
5191   @Override
5192   public RowLock getRowLock(byte[] row, boolean readLock) throws IOException {
5193     checkRow(row, "row lock");
5194     return getRowLockInternal(row, readLock);
5195   }
5196
5197   protected RowLock getRowLockInternal(byte[] row, boolean readLock) throws IOException {
5198     // create an object to use as a key in the row lock map
5199     HashedBytes rowKey = new HashedBytes(row);
5200
5201     RowLockContext rowLockContext = null;
5202     RowLockImpl result = null;
5203     TraceScope traceScope = null;
5204
5205     // If we're tracing start a span to show how long this took.
5206     if (Trace.isTracing()) {
5207       traceScope = Trace.startSpan("HRegion.getRowLock");
5208       traceScope.getSpan().addTimelineAnnotation("Getting a " + (readLock?"readLock":"writeLock"));
5209     }
5210
5211     try {
5212       // Keep trying until we have a lock or error out.
5213       // TODO: do we need to add a time component here?
5214       while (result == null) {
5215
5216         // Try adding a RowLockContext to the lockedRows.
5217         // If we can add it then there are no other transactions currently running.
5218         rowLockContext = new RowLockContext(rowKey);
5219         RowLockContext existingContext = lockedRows.putIfAbsent(rowKey, rowLockContext);
5220
5221         // if there was a running transaction then there's already a context.
5222         if (existingContext != null) {
5223           rowLockContext = existingContext;
5224         }
5225
5226         // Now try and get the lock.
5227         // This can fail (return null) if the context was concurrently cleaned up and marked
5228         // unusable, in which case we loop around and install a fresh context.
5229         if (readLock) {
5230           result = rowLockContext.newReadLock();
5231         } else {
5232           result = rowLockContext.newWriteLock();
5233         }
5234       }
5235       if (!result.getLock().tryLock(this.rowLockWaitDuration, TimeUnit.MILLISECONDS)) {
5236         if (traceScope != null) {
5237           traceScope.getSpan().addTimelineAnnotation("Failed to get row lock");
5238         }
5239         result = null;
5240         // Clean up the counts just in case this was the thing keeping the context alive.
5241         rowLockContext.cleanUp();
5242         throw new IOException("Timed out waiting for lock for row: " + rowKey);
5243       }
5244       rowLockContext.setThreadName(Thread.currentThread().getName());
5245       return result;
5246     } catch (InterruptedException ie) {
5247       LOG.warn("Thread interrupted waiting for lock on row: " + rowKey);
5248       InterruptedIOException iie = new InterruptedIOException();
5249       iie.initCause(ie);
5250       if (traceScope != null) {
5251         traceScope.getSpan().addTimelineAnnotation("Interrupted exception getting row lock");
5252       }
5253       Thread.currentThread().interrupt();
5254       throw iie;
5255     } finally {
5256       if (traceScope != null) {
5257         traceScope.close();
5258       }
5259     }
5260   }
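
  // Illustrative usage sketch, assuming a Region reference "region" and a byte[] "row" are in
  // scope. A region operation must already be open while a row lock is held, and the lock must
  // always be released:
  //
  //   region.startRegionOperation();
  //   try {
  //     RowLock lock = region.getRowLock(row, false); // false => exclusive write lock
  //     try {
  //       // ... mutate the row while holding the lock ...
  //     } finally {
  //       lock.release();
  //     }
  //   } finally {
  //     region.closeRegionOperation();
  //   }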
5261
5262   @Override
5263   public void releaseRowLocks(List<RowLock> rowLocks) {
5264     if (rowLocks != null) {
5265       for (int i = 0; i < rowLocks.size(); i++) {
5266         rowLocks.get(i).release();
5267       }
5268       rowLocks.clear();
5269     }
5270   }
5271
5272   public ConcurrentHashMap<HashedBytes, RowLockContext> getLockedRows() {
5273     return lockedRows;
5274   }
5275
5276   @VisibleForTesting
5277   class RowLockContext {
5278     private final HashedBytes row;
5279     final ReadWriteLock readWriteLock = new ReentrantReadWriteLock(true);
5280     final AtomicBoolean usable = new AtomicBoolean(true);
5281     final AtomicInteger count = new AtomicInteger(0);
5282     final Object lock = new Object();
5283     private String threadName;
5284
5285     RowLockContext(HashedBytes row) {
5286       this.row = row;
5287     }
5288
5289     RowLockImpl newWriteLock() {
5290       Lock l = readWriteLock.writeLock();
5291       return getRowLock(l);
5292     }
5293     RowLockImpl newReadLock() {
5294       Lock l = readWriteLock.readLock();
5295       return getRowLock(l);
5296     }
5297
5298     private RowLockImpl getRowLock(Lock l) {
5299       count.incrementAndGet();
5300       synchronized (lock) {
5301         if (usable.get()) {
5302           return new RowLockImpl(this, l);
5303         } else {
5304           return null;
5305         }
5306       }
5307     }
5308
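    // Reference-count lifecycle: getRowLock(Lock) increments "count" before checking "usable";
    // cleanUp() decrements it and, once it drops to zero, marks this context unusable and
    // removes it from lockedRows so that the next lock attempt on the row installs a fresh
    // context instead of reviving this one.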
5309     void cleanUp() {
5310       long c = count.decrementAndGet();
5311       if (c <= 0) {
5312         synchronized (lock) {
5313           if (count.get() <= 0){
5314             usable.set(false);
5315             RowLockContext removed = lockedRows.remove(row);
5316             assert removed == this: "we should never remove a different context";
5317           }
5318         }
5319       }
5320     }
5321
5322     public void setThreadName(String threadName) {
5323       this.threadName = threadName;
5324     }
5325
5326     @Override
5327     public String toString() {
5328       return "RowLockContext{" +
5329           "row=" + row +
5330           ", readWriteLock=" + readWriteLock +
5331           ", count=" + count +
5332           ", threadName=" + threadName +
5333           '}';
5334     }
5335   }
5336
5337   /**
5338    * Class used to represent a lock on a row.
5339    */
5340   public static class RowLockImpl implements RowLock {
5341     private final RowLockContext context;
5342     private final Lock lock;
5343
5344     public RowLockImpl(RowLockContext context, Lock lock) {
5345       this.context = context;
5346       this.lock = lock;
5347     }
5348
5349     public Lock getLock() {
5350       return lock;
5351     }
5352
5353     @VisibleForTesting
5354     public RowLockContext getContext() {
5355       return context;
5356     }
5357
5358     @Override
5359     public void release() {
5360       lock.unlock();
5361       context.cleanUp();
5362     }
5363
5364     @Override
5365     public String toString() {
5366       return "RowLockImpl{" +
5367           "context=" + context +
5368           ", lock=" + lock +
5369           '}';
5370     }
5371   }
5372
5373   /**
5374    * Determines whether multiple column families are present.
5375    * Precondition: familyPaths is not null.
5376    *
5377    * @param familyPaths List of (column family, hfilePath)
5378    */
5379   private static boolean hasMultipleColumnFamilies(Collection<Pair<byte[], String>> familyPaths) {
5380     boolean multipleFamilies = false;
5381     byte[] family = null;
5382     for (Pair<byte[], String> pair : familyPaths) {
5383       byte[] fam = pair.getFirst();
5384       if (family == null) {
5385         family = fam;
5386       } else if (!Bytes.equals(family, fam)) {
5387         multipleFamilies = true;
5388         break;
5389       }
5390     }
5391     return multipleFamilies;
5392   }
5393
5394   @Override
5395   public boolean bulkLoadHFiles(Collection<Pair<byte[], String>> familyPaths, boolean assignSeqId,
5396       BulkLoadListener bulkLoadListener) throws IOException {
5397     long seqId = -1;
5398     Map<byte[], List<Path>> storeFiles = new TreeMap<byte[], List<Path>>(Bytes.BYTES_COMPARATOR);
5399     Map<String, Long> storeFilesSizes = new HashMap<String, Long>();
5400     Preconditions.checkNotNull(familyPaths);
5401     // we need writeLock for multi-family bulk load
5402     startBulkRegionOperation(hasMultipleColumnFamilies(familyPaths));
5403     boolean isSuccessful = false;
5404     try {
5405       this.writeRequestsCount.increment();
5406
5407       // A split may have happened between when the split keys were gathered and when the
5408       // HRegion's write lock was taken. We need to validate that each HFile still fits within
5409       // the region before attempting to bulk load any of them.
5410       List<IOException> ioes = new ArrayList<IOException>();
5411       List<Pair<byte[], String>> failures = new ArrayList<Pair<byte[], String>>();
5412       for (Pair<byte[], String> p : familyPaths) {
5413         byte[] familyName = p.getFirst();
5414         String path = p.getSecond();
5415
5416         Store store = getStore(familyName);
5417         if (store == null) {
5418           IOException ioe = new org.apache.hadoop.hbase.DoNotRetryIOException(
5419               "No such column family " + Bytes.toStringBinary(familyName));
5420           ioes.add(ioe);
5421         } else {
5422           try {
5423             store.assertBulkLoadHFileOk(new Path(path));
5424           } catch (WrongRegionException wre) {
5425             // recoverable (file doesn't fit in region)
5426             failures.add(p);
5427           } catch (IOException ioe) {
5428             // unrecoverable (hdfs problem)
5429             ioes.add(ioe);
5430           }
5431         }
5432       }
5433
5434       // validation failed because of some sort of IO problem.
5435       if (ioes.size() != 0) {
5436         IOException e = MultipleIOException.createIOException(ioes);
5437         LOG.error("There were one or more IO errors when checking if the bulk load is ok.", e);
5438         throw e;
5439       }
5440
5441       // validation failed, bail out before doing anything permanent.
5442       if (failures.size() != 0) {
5443         StringBuilder list = new StringBuilder();
5444         for (Pair<byte[], String> p : failures) {
5445           list.append("\n").append(Bytes.toString(p.getFirst())).append(" : ")
5446               .append(p.getSecond());
5447         }
5448         // problem when validating
5449         LOG.warn("There was a recoverable bulk load failure likely due to a" +
5450             " split.  These (family, HFile) pairs were not loaded: " + list);
5451         return isSuccessful;
5452       }
5453
5454       // We need to assign a sequential ID that's in between two memstores in order to preserve
5455       // the guarantee that all the edits lower than the highest sequential ID from all the
5456       // HFiles are flushed on disk. See HBASE-10958.  The sequence id returned when we flush is
5457       // guaranteed to be one beyond the file made when we flushed (or if nothing to flush, it is
5458       // a sequence id that we can be sure is beyond the last hfile written).
5459       if (assignSeqId) {
5460         FlushResult fs = flushcache(true, false);
5461         if (fs.isFlushSucceeded()) {
5462           seqId = ((FlushResultImpl)fs).flushSequenceId;
5463         } else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) {
5464           seqId = ((FlushResultImpl)fs).flushSequenceId;
5465         } else {
5466           throw new IOException("Could not bulk load with an assigned sequential ID because the "+
5467             "flush didn't run. Reason for not flushing: " + ((FlushResultImpl)fs).failureReason);
5468         }
5469       }
5470
5471       for (Pair<byte[], String> p : familyPaths) {
5472         byte[] familyName = p.getFirst();
5473         String path = p.getSecond();
5474         Store store = getStore(familyName);
5475         try {
5476           String finalPath = path;
5477           if (bulkLoadListener != null) {
5478             finalPath = bulkLoadListener.prepareBulkLoad(familyName, path);
5479           }
5480           Path commitedStoreFile = store.bulkLoadHFile(finalPath, seqId);
5481
5482           // Note the size of the store file
5483           try {
5484             FileSystem fs = commitedStoreFile.getFileSystem(baseConf);
5485             storeFilesSizes.put(commitedStoreFile.getName(), fs.getFileStatus(commitedStoreFile)
5486                 .getLen());
5487           } catch (IOException e) {
5488             LOG.warn("Failed to find the size of hfile " + commitedStoreFile);
5489             storeFilesSizes.put(commitedStoreFile.getName(), 0L);
5490           }
5491
5492           if(storeFiles.containsKey(familyName)) {
5493             storeFiles.get(familyName).add(commitedStoreFile);
5494           } else {
5495             List<Path> storeFileNames = new ArrayList<Path>();
5496             storeFileNames.add(commitedStoreFile);
5497             storeFiles.put(familyName, storeFileNames);
5498           }
5499           if (bulkLoadListener != null) {
5500             bulkLoadListener.doneBulkLoad(familyName, path);
5501           }
5502         } catch (IOException ioe) {
5503           // A failure here can cause an atomicity violation that we currently
5504           // cannot recover from since it is likely a failed HDFS operation.
5505
5506           // TODO Need a better story for reverting partial failures due to HDFS.
5507           LOG.error("There was a partial failure due to IO when attempting to" +
5508               " load " + Bytes.toString(p.getFirst()) + " : " + p.getSecond(), ioe);
5509           if (bulkLoadListener != null) {
5510             try {
5511               bulkLoadListener.failedBulkLoad(familyName, path);
5512             } catch (Exception ex) {
5513               LOG.error("Error while calling failedBulkLoad for family " +
5514                   Bytes.toString(familyName) + " with path " + path, ex);
5515             }
5516           }
5517           throw ioe;
5518         }
5519       }
5520
5521       isSuccessful = true;
5522     } finally {
5523       if (wal != null && !storeFiles.isEmpty()) {
5524         // Write a bulk load event for hfiles that are loaded
5525         try {
5526           WALProtos.BulkLoadDescriptor loadDescriptor =
5527               ProtobufUtil.toBulkLoadDescriptor(this.getRegionInfo().getTable(),
5528                 ByteStringer.wrap(this.getRegionInfo().getEncodedNameAsBytes()), storeFiles,
5529                 storeFilesSizes, seqId);
5530           WALUtil.writeBulkLoadMarkerAndSync(this.wal, this.getReplicationScope(), getRegionInfo(),
5531               loadDescriptor, mvcc);
5532         } catch (IOException ioe) {
5533           if (this.rsServices != null) {
5534             // Have to abort the region server because some hfiles have been loaded but we
5535             // can't write the event into the WAL
5536             isSuccessful = false;
5537             this.rsServices.abort("Failed to write bulk load event into WAL.", ioe);
5538           }
5539         }
5540       }
5541
5542       closeBulkRegionOperation();
5543     }
5544     return isSuccessful;
5545   }
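
  // Illustrative usage sketch, assuming "region" is in scope; the family name and HFile path
  // below are hypothetical staging values:
  //
  //   List<Pair<byte[], String>> familyPaths = new ArrayList<Pair<byte[], String>>();
  //   familyPaths.add(new Pair<byte[], String>(Bytes.toBytes("cf"), "/staging/cf/hfile1"));
  //   // assignSeqId=true flushes first so the loaded files get a sequence id beyond the memstore
  //   boolean loaded = region.bulkLoadHFiles(familyPaths, true, null);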
5546
5547   @Override
5548   public boolean equals(Object o) {
5549     return o instanceof HRegion && Bytes.equals(getRegionInfo().getRegionName(),
5550                                                 ((HRegion) o).getRegionInfo().getRegionName());
5551   }
5552
5553   @Override
5554   public int hashCode() {
5555     return Bytes.hashCode(getRegionInfo().getRegionName());
5556   }
5557
5558   @Override
5559   public String toString() {
5560     return getRegionInfo().getRegionNameAsString();
5561   }
5562
5563   /**
5564    * RegionScannerImpl is used to combine scanners from multiple Stores (aka column families).
5565    */
5566   class RegionScannerImpl implements RegionScanner, org.apache.hadoop.hbase.ipc.RpcCallback {
5567     // Package local for testability
5568     KeyValueHeap storeHeap = null;
5569     /** Heap of key-values that are not essential for the provided filters and are thus read
5570      * on demand, if on-demand column family loading is enabled.*/
5571     KeyValueHeap joinedHeap = null;
5572     /**
5573      * If the joined heap data gathering is interrupted due to scan limits, this will
5574      * contain the row for which we are populating the values.*/
5575     protected Cell joinedContinuationRow = null;
5576     private boolean filterClosed = false;
5577
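    // 1 when this scanner serves a Get, making the stop-row comparison in isStopRow() inclusive
    // so the range behaves as [startRow,endRow]; 0 for a normal scan, which is [startRow,endRow).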
5578     protected final int isScan;
5579     protected final byte[] stopRow;
5580     protected final HRegion region;
5581     protected final CellComparator comparator;
5582
5583     private final long readPt;
5584     private final long maxResultSize;
5585     private final ScannerContext defaultScannerContext;
5586     private final FilterWrapper filter;
5587
5588     @Override
5589     public HRegionInfo getRegionInfo() {
5590       return region.getRegionInfo();
5591     }
5592
5593     RegionScannerImpl(Scan scan, List<KeyValueScanner> additionalScanners, HRegion region)
5594         throws IOException {
5595       this.region = region;
5596       this.maxResultSize = scan.getMaxResultSize();
5597       if (scan.hasFilter()) {
5598         this.filter = new FilterWrapper(scan.getFilter());
5599       } else {
5600         this.filter = null;
5601       }
5602       this.comparator = region.getCellCompartor();
5603       /**
5604        * By default, calls to next/nextRaw must enforce the batch limit. Thus, construct a default
5605        * scanner context that can be used to enforce the batch limit in the event that a
5606        * ScannerContext is not specified during an invocation of next/nextRaw
5607        */
5608       defaultScannerContext = ScannerContext.newBuilder()
5609           .setBatchLimit(scan.getBatch()).build();
5610
5611       if (Bytes.equals(scan.getStopRow(), HConstants.EMPTY_END_ROW) && !scan.isGetScan()) {
5612         this.stopRow = null;
5613       } else {
5614         this.stopRow = scan.getStopRow();
5615       }
5616       // If we are doing a get, we want to be [startRow,endRow]. Normally
5617       // it is [startRow,endRow) and if startRow=endRow we get nothing.
5618       this.isScan = scan.isGetScan() ? 1 : 0;
5619
5620       // synchronize on scannerReadPoints so that nobody calculates
5621       // getSmallestReadPoint before scannerReadPoints is updated.
5622       IsolationLevel isolationLevel = scan.getIsolationLevel();
5623       synchronized(scannerReadPoints) {
5624         this.readPt = getReadPoint(isolationLevel);
5625         scannerReadPoints.put(this, this.readPt);
5626       }
5627
5628       // Here we separate all scanners into two lists - scanners that provide data required
5629       // by the filter to operate (scanners list) and all others (joinedScanners list).
5630       List<KeyValueScanner> scanners = new ArrayList<KeyValueScanner>(scan.getFamilyMap().size());
5631       List<KeyValueScanner> joinedScanners
5632         = new ArrayList<KeyValueScanner>(scan.getFamilyMap().size());
5633       // Store all already instantiated scanners for exception handling
5634       List<KeyValueScanner> instantiatedScanners = new ArrayList<KeyValueScanner>();
5635       // handle additionalScanners
5636       if (additionalScanners != null && !additionalScanners.isEmpty()) {
5637         scanners.addAll(additionalScanners);
5638         instantiatedScanners.addAll(additionalScanners);
5639       }
5640
5641       try {
5642         for (Map.Entry<byte[], NavigableSet<byte[]>> entry : scan.getFamilyMap().entrySet()) {
5643           Store store = stores.get(entry.getKey());
5644           KeyValueScanner scanner;
5645           try {
5646             scanner = store.getScanner(scan, entry.getValue(), this.readPt);
5647           } catch (FileNotFoundException e) {
5648             throw handleFileNotFound(e);
5649           }
5650           instantiatedScanners.add(scanner);
5651           if (this.filter == null || !scan.doLoadColumnFamiliesOnDemand()
5652               || this.filter.isFamilyEssential(entry.getKey())) {
5653             scanners.add(scanner);
5654           } else {
5655             joinedScanners.add(scanner);
5656           }
5657         }
5658         initializeKVHeap(scanners, joinedScanners, region);
5659       } catch (Throwable t) {
5660         throw handleException(instantiatedScanners, t);
5661       }
5662     }
5663
5664     protected void initializeKVHeap(List<KeyValueScanner> scanners,
5665         List<KeyValueScanner> joinedScanners, HRegion region)
5666         throws IOException {
5667       this.storeHeap = new KeyValueHeap(scanners, comparator);
5668       if (!joinedScanners.isEmpty()) {
5669         this.joinedHeap = new KeyValueHeap(joinedScanners, comparator);
5670       }
5671     }
5672
5673     private IOException handleException(List<KeyValueScanner> instantiatedScanners,
5674         Throwable t) {
5675       // remove the scanner read point before throwing the exception
5676       scannerReadPoints.remove(this);
5677       if (storeHeap != null) {
5678         storeHeap.close();
5679         storeHeap = null;
5680         if (joinedHeap != null) {
5681           joinedHeap.close();
5682           joinedHeap = null;
5683         }
5684       } else {
5685         // close all already instantiated scanners before throwing the exception
5686         for (KeyValueScanner scanner : instantiatedScanners) {
5687           scanner.close();
5688         }
5689       }
5690       return t instanceof IOException ? (IOException) t : new IOException(t);
5691     }
5692
5693     @Override
5694     public long getMaxResultSize() {
5695       return maxResultSize;
5696     }
5697
5698     @Override
5699     public long getMvccReadPoint() {
5700       return this.readPt;
5701     }
5702
5703     @Override
5704     public int getBatch() {
5705       return this.defaultScannerContext.getBatchLimit();
5706     }
5707
5708     /**
5709      * Reset the filter, if any.
5710      *
5711      * @throws IOException in case a filter raises an I/O exception.
5712      */
5713     protected void resetFilters() throws IOException {
5714       if (filter != null) {
5715         filter.reset();
5716       }
5717     }
5718
5719     @Override
5720     public boolean next(List<Cell> outResults)
5721         throws IOException {
5722       // apply the batching limit by default
5723       return next(outResults, defaultScannerContext);
5724     }
5725
5726     @Override
5727     public synchronized boolean next(List<Cell> outResults, ScannerContext scannerContext)
5728     throws IOException {
5729       if (this.filterClosed) {
5730         throw new UnknownScannerException("Scanner was closed (timed out?) " +
5731             "after we renewed it. Could be caused by a very slow scanner " +
5732             "or a lengthy garbage collection");
5733       }
5734       startRegionOperation(Operation.SCAN);
5735       readRequestsCount.increment();
5736       try {
5737         return nextRaw(outResults, scannerContext);
5738       } finally {
5739         closeRegionOperation(Operation.SCAN);
5740       }
5741     }
5742
5743     @Override
5744     public boolean nextRaw(List<Cell> outResults) throws IOException {
5745       // Use the RegionScanner's context by default
5746       return nextRaw(outResults, defaultScannerContext);
5747     }
5748
5749     @Override
5750     public boolean nextRaw(List<Cell> outResults, ScannerContext scannerContext)
5751         throws IOException {
5752       if (storeHeap == null) {
5753         // scanner is closed
5754         throw new UnknownScannerException("Scanner was closed");
5755       }
5756       boolean moreValues = false;
5757       if (outResults.isEmpty()) {
5758         // Usually outResults is empty. This is true when next is called
5759         // to handle scan or get operation.
5760         moreValues = nextInternal(outResults, scannerContext);
5761       } else {
5762         List<Cell> tmpList = new ArrayList<Cell>();
5763         moreValues = nextInternal(tmpList, scannerContext);
5764         outResults.addAll(tmpList);
5765       }
5766
5767       // If the size limit was reached it means a partial Result is being returned. Returning
5768       // a partial Result means that we should not reset the filters; filters should only be
5769       // reset in between rows.
5772       if (!scannerContext.midRowResultFormed())
5773         resetFilters();
5774
5775       if (isFilterDoneInternal()) {
5776         moreValues = false;
5777       }
5778       return moreValues;
5779     }
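
    // Illustrative scan loop, assuming "region" and a Scan "scan" are in scope. next() takes the
    // region operation lock itself; callers of nextRaw() must hold it themselves via
    // startRegionOperation()/closeRegionOperation():
    //
    //   RegionScanner scanner = region.getScanner(scan);
    //   List<Cell> cells = new ArrayList<Cell>();
    //   boolean more;
    //   do {
    //     more = scanner.next(cells);
    //     // ... consume cells ...
    //     cells.clear();
    //   } while (more);
    //   scanner.close();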
5780
5781     /**
5782      * @return true if more cells exist after this batch, false if scanner is done
5783      */
5784     private boolean populateFromJoinedHeap(List<Cell> results, ScannerContext scannerContext)
5785             throws IOException {
5786       assert joinedContinuationRow != null;
5787       boolean moreValues = populateResult(results, this.joinedHeap, scannerContext,
5788           joinedContinuationRow);
5789
5790       if (!scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
5791         // We are done with this row, reset the continuation.
5792         joinedContinuationRow = null;
5793       }
5794       // As the data is obtained from two independent heaps, we need to
5795       // ensure that result list is sorted, because Result relies on that.
5796       sort(results, comparator);
5797       return moreValues;
5798     }
5799
5800     /**
5801      * Fetches records with currentRow into the results list, until the next row, the batchLimit
5802      * (if not -1), or the remainingResultSize (if not -1) is reached.
5803      * @param heap KeyValueHeap to fetch data from. It must be positioned on the correct row before the call.
5804      * @param scannerContext context carrying the batch, size and time limits
5805      * @param currentRowCell a cell from the row currently being populated
5806      * @return state of last call to {@link KeyValueHeap#next()}
5807      */
5808     private boolean populateResult(List<Cell> results, KeyValueHeap heap,
5809         ScannerContext scannerContext, Cell currentRowCell) throws IOException {
5810       Cell nextKv;
5811       boolean moreCellsInRow = false;
5812       boolean tmpKeepProgress = scannerContext.getKeepProgress();
5813       // Scanning between column families and thus the scope is between cells
5814       LimitScope limitScope = LimitScope.BETWEEN_CELLS;
5815       try {
5816         do {
5817           // We want to maintain any progress that is made towards the limits while scanning across
5818           // different column families. To do this, we toggle the keep progress flag on during calls
5819           // to the StoreScanner to ensure that any progress made thus far is not wiped away.
5820           scannerContext.setKeepProgress(true);
5821           heap.next(results, scannerContext);
5822           scannerContext.setKeepProgress(tmpKeepProgress);
5823
5824           nextKv = heap.peek();
5825           moreCellsInRow = moreCellsInRow(nextKv, currentRowCell);
5826           if (!moreCellsInRow) incrementCountOfRowsScannedMetric(scannerContext);
5827           if (moreCellsInRow && scannerContext.checkBatchLimit(limitScope)) {
5828             return scannerContext.setScannerState(NextState.BATCH_LIMIT_REACHED).hasMoreValues();
5829           } else if (scannerContext.checkSizeLimit(limitScope)) {
5830             ScannerContext.NextState state =
5831               moreCellsInRow? NextState.SIZE_LIMIT_REACHED_MID_ROW: NextState.SIZE_LIMIT_REACHED;
5832             return scannerContext.setScannerState(state).hasMoreValues();
5833           } else if (scannerContext.checkTimeLimit(limitScope)) {
5834             ScannerContext.NextState state =
5835               moreCellsInRow? NextState.TIME_LIMIT_REACHED_MID_ROW: NextState.TIME_LIMIT_REACHED;
5836             return scannerContext.setScannerState(state).hasMoreValues();
5837           }
5838         } while (moreCellsInRow);
5839       } catch (FileNotFoundException e) {
5840         throw handleFileNotFound(e);
5841       }
5842       return nextKv != null;
5843     }
5844
5845     /**
5846      * Based on the nextKv in the heap, and the current row, decide whether or not there are more
5847      * cells to be read in the heap. If the row of the nextKv in the heap matches the current row
5848      * then there are more cells to be read in the row.
5849      * @param nextKv
5850      * @param currentRowCell
5851      * @return true When there are more cells in the row to be read
5852      */
5853     private boolean moreCellsInRow(final Cell nextKv, Cell currentRowCell) {
5854       return nextKv != null && CellUtil.matchingRow(nextKv, currentRowCell);
5855     }
5856
5857     /*
5858      * @return True if a filter rules that the scanner is done.
5859      */
5860     @Override
5861     public synchronized boolean isFilterDone() throws IOException {
5862       return isFilterDoneInternal();
5863     }
5864
5865     private boolean isFilterDoneInternal() throws IOException {
5866       return this.filter != null && this.filter.filterAllRemaining();
5867     }
5868
5869     private boolean nextInternal(List<Cell> results, ScannerContext scannerContext)
5870         throws IOException {
5871       if (!results.isEmpty()) {
5872         throw new IllegalArgumentException("First parameter should be an empty list");
5873       }
5874       if (scannerContext == null) {
5875         throw new IllegalArgumentException("Scanner context cannot be null");
5876       }
5877       RpcCallContext rpcCall = RpcServer.getCurrentCall();
5878
5879       // Save the initial progress from the Scanner context in these local variables. The progress
5880       // may need to be reset a few times if rows are being filtered out so we save the initial
5881       // progress.
5882       int initialBatchProgress = scannerContext.getBatchProgress();
5883       long initialSizeProgress = scannerContext.getSizeProgress();
5884       long initialTimeProgress = scannerContext.getTimeProgress();
5885
5886       // The loop here is used only when, at some point during the next(), we determine that
5887       // due to effects of filters or otherwise we have an empty row in the result. Then we
5888       // loop and try again. Otherwise, we must get out on the first iteration via return:
5889       // "true" if there's more data to read, "false" if there isn't (storeHeap is at a stop row,
5890       // and joinedHeap has no more data to read for the last row, joinedContinuationRow, if set).
5891       while (true) {
5892         // Starting to scan a new row. Reset the scanner progress according to whether or not
5893         // progress should be kept.
5894         if (scannerContext.getKeepProgress()) {
5895           // Progress should be kept. Reset to initial values seen at start of method invocation.
5896           scannerContext.setProgress(initialBatchProgress, initialSizeProgress,
5897             initialTimeProgress);
5898         } else {
5899           scannerContext.clearProgress();
5900         }
5901
5902         if (rpcCall != null) {
5903           // If a user specifies a too-restrictive or too-slow scanner, the
5904           // client might time out and disconnect while the server side
5905           // is still processing the request. We should abort aggressively
5906           // in that case.
5907           long afterTime = rpcCall.disconnectSince();
5908           if (afterTime >= 0) {
5909             throw new CallerDisconnectedException(
5910                 "Aborting on region " + getRegionInfo().getRegionNameAsString() + ", call " +
5911                     this + " after " + afterTime + " ms, since " +
5912                     "caller disconnected");
5913           }
5914         }
5915
5916         // Let's see what we have in the storeHeap.
5917         Cell current = this.storeHeap.peek();
5918
5919         boolean stopRow = isStopRow(current);
5920         // When has filter row is true it means that the all the cells for a particular row must be
5921         // When hasFilterRow is true it means that all the cells for a particular row must be
5922         // read before a filtering decision can be made. This means that filters for which
5923         // hasFilterRow is true run the risk of encountering out of memory errors in the case
5924         // that they are applied to a table that has very large rows.
5925
5926         // If filter#hasFilterRow is true, partial results are not allowed since allowing them
5927         // would prevent the filters from being evaluated. Thus, if it is true, change the
5928         // scope of any limits that could potentially create partial results to
5929         // LimitScope.BETWEEN_ROWS so that those limits are not reached mid-row
5930         if (hasFilterRow) {
5931           if (LOG.isTraceEnabled()) {
5932             LOG.trace("filter#hasFilterRow is true which prevents partial results from being"
5933                 + " formed. Changing scope of limits that may create partials");
5934           }
5935           scannerContext.setSizeLimitScope(LimitScope.BETWEEN_ROWS);
5936           scannerContext.setTimeLimitScope(LimitScope.BETWEEN_ROWS);
5937         }
5938
5939         // Check if we were getting data from the joinedHeap and hit the limit.
5940         // If not, then it's main path - getting results from storeHeap.
5941         if (joinedContinuationRow == null) {
5942           // First, check if we are at a stop row. If so, there are no more results.
5943           if (stopRow) {
5944             if (hasFilterRow) {
5945               filter.filterRowCells(results);
5946             }
5947             return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
5948           }
5949
5950           // Check if rowkey filter wants to exclude this row. If so, loop to next.
5951           // Technically, if we hit limits before on this row, we don't need this call.
5952           if (filterRowKey(current)) {
5953             incrementCountOfRowsFilteredMetric(scannerContext);
5954             // Typically the count of rows scanned is incremented inside #populateResult. However,
5955             // here we are filtering a row based purely on its row key, preventing us from calling
5956             // #populateResult. Thus, perform the necessary increment here to rows scanned metric
5957             incrementCountOfRowsScannedMetric(scannerContext);
5958             boolean moreRows = nextRow(scannerContext, current);
5959             if (!moreRows) {
5960               return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
5961             }
5962             results.clear();
5963             continue;
5964           }
5965
5966           // Ok, we are good, let's try to get some results from the main heap.
5967           populateResult(results, this.storeHeap, scannerContext, current);
5968
5969           if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
5970             if (hasFilterRow) {
5971               throw new IncompatibleFilterException(
5972                   "Filter whose hasFilterRow() returns true is incompatible with scans that must"
5973                       + " stop mid-row because of a limit. ScannerContext:" + scannerContext);
5974             }
5975             return true;
5976           }
5977
5978           Cell nextKv = this.storeHeap.peek();
5979           stopRow = nextKv == null || isStopRow(nextKv);
5980           // save that the row was empty before filters applied to it.
5981           final boolean isEmptyRow = results.isEmpty();
5982
5983           // We have the part of the row necessary for filtering (all of it, usually).
5984           // First filter with the filterRow(List).
5985           FilterWrapper.FilterRowRetCode ret = FilterWrapper.FilterRowRetCode.NOT_CALLED;
5986           if (hasFilterRow) {
5987             ret = filter.filterRowCellsWithRet(results);
5988
5989             // We don't know how the results have changed after being filtered. Must set progress
5990             // according to contents of results now. However, a change in the results should not
5991             // affect the time progress. Thus preserve whatever time progress has been made
5992             long timeProgress = scannerContext.getTimeProgress();
5993             if (scannerContext.getKeepProgress()) {
5994               scannerContext.setProgress(initialBatchProgress, initialSizeProgress,
5995                 initialTimeProgress);
5996             } else {
5997               scannerContext.clearProgress();
5998             }
5999             scannerContext.setTimeProgress(timeProgress);
6000             scannerContext.incrementBatchProgress(results.size());
6001             for (Cell cell : results) {
6002               scannerContext.incrementSizeProgress(CellUtil.estimatedHeapSizeOf(cell));
6003             }
6004           }
6005
6006           if (isEmptyRow || ret == FilterWrapper.FilterRowRetCode.EXCLUDE || filterRow()) {
6007             incrementCountOfRowsFilteredMetric(scannerContext);
6008             results.clear();
6009             boolean moreRows = nextRow(scannerContext, current);
6010             if (!moreRows) {
6011               return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
6012             }
6013
6014             // This row was totally filtered out; if this is NOT the last row,
6015             // we should continue on. Otherwise, nothing else to do.
6016             if (!stopRow) continue;
6017             return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
6018           }
6019
6020           // Ok, we are done with storeHeap for this row.
6021           // Now we may need to fetch additional, non-essential data into row.
6022           // These values are not needed for filter to work, so we postpone their
6023           // fetch to (possibly) reduce amount of data loads from disk.
6024           if (this.joinedHeap != null) {
6025             boolean mayHaveData = joinedHeapMayHaveData(current);
6026             if (mayHaveData) {
6027               joinedContinuationRow = current;
6028               populateFromJoinedHeap(results, scannerContext);
6029
6030               if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
6031                 return true;
6032               }
6033             }
6034           }
6035         } else {
6036           // Populating from the joined heap was stopped by limits, populate some more.
6037           populateFromJoinedHeap(results, scannerContext);
6038           if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
6039             return true;
6040           }
6041         }
6042         // We may have just called populateFromJoinedHeap and hit the limits. If that is
6043         // the case, we need to call it again on the next next() invocation.
6044         if (joinedContinuationRow != null) {
6045           return scannerContext.setScannerState(NextState.MORE_VALUES).hasMoreValues();
6046         }
6047
6048         // Finally, we are done with both joinedHeap and storeHeap.
6049         // Double check to prevent empty rows from appearing in result. It could be
6050         // the case when SingleColumnValueExcludeFilter is used.
6051         if (results.isEmpty()) {
6052           incrementCountOfRowsFilteredMetric(scannerContext);
6053           boolean moreRows = nextRow(scannerContext, current);
6054           if (!moreRows) {
6055             return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
6056           }
6057           if (!stopRow) continue;
6058         }
6059
6060         if (stopRow) {
6061           return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
6062         } else {
6063           return scannerContext.setScannerState(NextState.MORE_VALUES).hasMoreValues();
6064         }
6065       }
6066     }
6067
6068     protected void incrementCountOfRowsFilteredMetric(ScannerContext scannerContext) {
6069       filteredReadRequestsCount.increment();
6070
6071       if (scannerContext == null || !scannerContext.isTrackingMetrics()) return;
6072
6073       scannerContext.getMetrics().countOfRowsFiltered.incrementAndGet();
6074     }
6075
6076     protected void incrementCountOfRowsScannedMetric(ScannerContext scannerContext) {
6077       if (scannerContext == null || !scannerContext.isTrackingMetrics()) return;
6078
6079       scannerContext.getMetrics().countOfRowsScanned.incrementAndGet();
6080     }
6081
6082     /**
6083      * @param currentRowCell
6084      * @return true when the joined heap may have data for the current row
6085      * @throws IOException
6086      */
6087     private boolean joinedHeapMayHaveData(Cell currentRowCell)
6088         throws IOException {
6089       Cell nextJoinedKv = joinedHeap.peek();
6090       boolean matchCurrentRow =
6091           nextJoinedKv != null && CellUtil.matchingRow(nextJoinedKv, currentRowCell);
6092       boolean matchAfterSeek = false;
6093
6094       // If the next value in the joined heap does not match the current row, try to seek to the
6095       // correct row
6096       if (!matchCurrentRow) {
6097         Cell firstOnCurrentRow = CellUtil.createFirstOnRow(currentRowCell);
6098         boolean seekSuccessful = this.joinedHeap.requestSeek(firstOnCurrentRow, true, true);
6099         matchAfterSeek =
6100             seekSuccessful && joinedHeap.peek() != null
6101                 && CellUtil.matchingRow(joinedHeap.peek(), currentRowCell);
6102       }
6103
6104       return matchCurrentRow || matchAfterSeek;
6105     }
6106
6107     /**
6108      * This function is to maintain backward compatibility for 0.94 filters. HBASE-6429 combines
6109      * both the filterRow() and filterRow({@code List<KeyValue> kvs}) functions. Code written
6110      * for 0.94 or older may not implement hasFilterRow as HBASE-6429 expects, because the 0.94
6111      * hasFilterRow() only returns true when filterRow({@code List<KeyValue> kvs}) is overridden, not filterRow().
6112      * Therefore, the filterRow() will be skipped.
6113      */
6114     private boolean filterRow() throws IOException {
6115       // when hasFilterRow returns true, filter.filterRow() will be called automatically inside
6116       // filterRowCells(List<Cell> kvs) so we skip that scenario here.
6117       return filter != null && (!filter.hasFilterRow())
6118           && filter.filterRow();
6119     }
6120
6121     private boolean filterRowKey(Cell current) throws IOException {
6122       return filter != null && filter.filterRowKey(current);
6123     }
6124
6125     protected boolean nextRow(ScannerContext scannerContext, Cell curRowCell) throws IOException {
6126       assert this.joinedContinuationRow == null: "Trying to go to next row during joinedHeap read.";
6127       Cell next;
6128       while ((next = this.storeHeap.peek()) != null &&
6129              CellUtil.matchingRow(next, curRowCell)) {
6130         this.storeHeap.next(MOCKED_LIST);
6131       }
6132       resetFilters();
6133
6134       // Calling the hook in CP which allows it to do a fast forward
6135       return this.region.getCoprocessorHost() == null
6136           || this.region.getCoprocessorHost()
6137               .postScannerFilterRow(this, curRowCell);
6138     }
6139
6140     protected boolean isStopRow(Cell currentRowCell) {
6141       return currentRowCell == null
6142           || (stopRow != null && comparator.compareRows(currentRowCell, stopRow, 0, stopRow
6143           .length) >= isScan);
6144     }
6145
6146     @Override
6147     public synchronized void close() {
6148       if (storeHeap != null) {
6149         storeHeap.close();
6150         storeHeap = null;
6151       }
6152       if (joinedHeap != null) {
6153         joinedHeap.close();
6154         joinedHeap = null;
6155       }
6156       // no need to synchronize here.
6157       scannerReadPoints.remove(this);
6158       this.filterClosed = true;
6159     }
6160
6161     KeyValueHeap getStoreHeapForTesting() {
6162       return storeHeap;
6163     }
6164
6165     @Override
6166     public synchronized boolean reseek(byte[] row) throws IOException {
6167       if (row == null) {
6168         throw new IllegalArgumentException("Row cannot be null.");
6169       }
6170       boolean result = false;
6171       startRegionOperation();
6172       KeyValue kv = KeyValueUtil.createFirstOnRow(row);
6173       try {
6174         // use request seek to make use of the lazy seek option. See HBASE-5520
6175         result = this.storeHeap.requestSeek(kv, true, true);
6176         if (this.joinedHeap != null) {
6177           result = this.joinedHeap.requestSeek(kv, true, true) || result;
6178         }
6179       } catch (FileNotFoundException e) {
6180         throw handleFileNotFound(e);
6181       } finally {
6182         closeRegionOperation();
6183       }
6184       return result;
6185     }
6186
6187     private IOException handleFileNotFound(FileNotFoundException fnfe) throws IOException {
6188       // Try to refresh the store files; if that fails, shut down the RS.
6189       // TODO: add support for abort() of a single region and trigger reassignment.
6190       try {
6191         region.refreshStoreFiles(true);
6192         return new IOException("unable to read store file");
6193       } catch (IOException e) {
6194         String msg = "a store file got lost: " + fnfe.getMessage();
6195         LOG.error("unable to refresh store files", e);
6196         abortRegionServer(msg);
6197         return new NotServingRegionException(
6198           getRegionInfo().getRegionNameAsString() + " is closing");
6199       }
6200     }
6201
6202     private void abortRegionServer(String msg) throws IOException {
6203       if (rsServices instanceof HRegionServer) {
6204         ((HRegionServer)rsServices).abort(msg);
6205       }
6206       throw new UnsupportedOperationException("not able to abort RS after: " + msg);
6207     }
6208
6209     @Override
6210     public void shipped() throws IOException {
6211       if (storeHeap != null) {
6212         storeHeap.shipped();
6213       }
6214       if (joinedHeap != null) {
6215         joinedHeap.shipped();
6216       }
6217     }
6218
6219     @Override
6220     public void run() throws IOException {
6221       // This is the RPC callback method executed. We do the close of the scanner in this
6222       // callback.
6223       this.close();
6224     }
6225   }
6226
6227   // Utility methods
6228   /**
6229    * A utility method to create new instances of HRegion based on the
6230    * {@link HConstants#REGION_IMPL} configuration property.
6231    * @param tableDir qualified path of directory where region should be located,
6232    * usually the table directory.
6233    * @param wal The WAL is the outbound log for any updates to the HRegion.
6234    * The wal file is a logfile from the previous execution that's
6235    * custom-computed for this HRegion. The HRegionServer computes and sorts the
6236    * appropriate wal info for this HRegion. If there is a previous file
6237    * (implying that the HRegion has been written-to before), then read it from
6238    * the supplied path.
6239    * @param fs is the filesystem.
6240    * @param conf is global configuration settings.
6241    * @param regionInfo HRegionInfo that describes the region
6243    * @param htd the table descriptor
6244    * @return the new instance
6245    */
6246   static HRegion newHRegion(Path tableDir, WAL wal, FileSystem fs,
6247       Configuration conf, HRegionInfo regionInfo, final HTableDescriptor htd,
6248       RegionServerServices rsServices) {
6249     try {
6250       @SuppressWarnings("unchecked")
6251       Class<? extends HRegion> regionClass =
6252           (Class<? extends HRegion>) conf.getClass(HConstants.REGION_IMPL, HRegion.class);
6253
6254       Constructor<? extends HRegion> c =
6255           regionClass.getConstructor(Path.class, WAL.class, FileSystem.class,
6256               Configuration.class, HRegionInfo.class, HTableDescriptor.class,
6257               RegionServerServices.class);
6258
6259       return c.newInstance(tableDir, wal, fs, conf, regionInfo, htd, rsServices);
6260     } catch (Throwable e) {
6261       // todo: what should I throw here?
6262       throw new IllegalStateException("Could not instantiate a region instance.", e);
6263     }
6264   }
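
  // A hedged configuration sketch: a custom HRegion subclass can be plugged in through the
  // HConstants.REGION_IMPL property. "MyRegion" below is hypothetical and must declare the same
  // constructor signature that is reflected on above:
  //
  //   conf.setClass(HConstants.REGION_IMPL, MyRegion.class, HRegion.class);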
6265
6266   /**
6267    * Convenience method creating new HRegions. Used by createTable.
6268    *
6269    * @param info Info for region to create.
6270    * @param rootDir Root directory for HBase instance
6271    * @param wal shared WAL
6272    * @param initialize - true to initialize the region
6273    * @return new HRegion
6274    * @throws IOException
6275    */
6276   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
6277         final Configuration conf, final HTableDescriptor hTableDescriptor,
6278         final WAL wal, final boolean initialize)
6279   throws IOException {
6280     LOG.info("creating HRegion " + info.getTable().getNameAsString()
6281         + " HTD == " + hTableDescriptor + " RootDir = " + rootDir +
6282         " Table name == " + info.getTable().getNameAsString());
6283     FileSystem fs = FileSystem.get(conf);
6284     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
6285     HRegionFileSystem.createRegionOnFileSystem(conf, fs, tableDir, info);
6286     HRegion region = HRegion.newHRegion(tableDir, wal, fs, conf, info, hTableDescriptor, null);
6287     if (initialize) region.initialize(null);
6288     return region;
6289   }
6290
6291   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
6292                                       final Configuration conf,
6293                                       final HTableDescriptor hTableDescriptor,
6294                                       final WAL wal)
6295     throws IOException {
6296     return createHRegion(info, rootDir, conf, hTableDescriptor, wal, true);
6297   }
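
  // A minimal creation sketch (e.g. from a test), assuming "conf", "rootDir", an HRegionInfo
  // "hri", an HTableDescriptor "htd" and a WAL "wal" are in scope:
  //
  //   HRegion region = HRegion.createHRegion(hri, rootDir, conf, htd, wal);
  //   try {
  //     // ... use the region ...
  //   } finally {
  //     region.close();
  //   }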
6298
6299
6300   /**
6301    * Open a Region.
6302    * @param info Info for region to be opened.
6303    * @param wal WAL for region to use. This method will call
6304    * WAL#setSequenceNumber(long) passing the result of the call to
6305    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6306    * up to date.  The HRegionServer does this every time it opens a new region.
6307    * @return new HRegion
6308    *
6309    * @throws IOException
6310    */
6311   public static HRegion openHRegion(final HRegionInfo info,
6312       final HTableDescriptor htd, final WAL wal,
6313       final Configuration conf)
6314   throws IOException {
6315     return openHRegion(info, htd, wal, conf, null, null);
6316   }
6317
6318   /**
6319    * Open a Region.
6320    * @param info Info for region to be opened
6321    * @param htd the table descriptor
6322    * @param wal WAL for region to use. This method will call
6323    * WAL#setSequenceNumber(long) passing the result of the call to
6324    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6325    * up to date.  The HRegionServer does this every time it opens a new region.
6326    * @param conf The Configuration object to use.
6327    * @param rsServices An interface we can request flushes against.
6328    * @param reporter An interface we can report progress against.
6329    * @return new HRegion
6330    *
6331    * @throws IOException
6332    */
6333   public static HRegion openHRegion(final HRegionInfo info,
6334     final HTableDescriptor htd, final WAL wal, final Configuration conf,
6335     final RegionServerServices rsServices,
6336     final CancelableProgressable reporter)
6337   throws IOException {
6338     return openHRegion(FSUtils.getRootDir(conf), info, htd, wal, conf, rsServices, reporter);
6339   }
6340
6341   /**
6342    * Open a Region.
6343    * @param rootDir Root directory for HBase instance
6344    * @param info Info for region to be opened.
6345    * @param htd the table descriptor
6346    * @param wal WAL for region to use. This method will call
6347    * WAL#setSequenceNumber(long) passing the result of the call to
6348    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6349    * up to date.  The HRegionServer does this every time it opens a new region.
6350    * @param conf The Configuration object to use.
6351    * @return new HRegion
6352    * @throws IOException
6353    */
6354   public static HRegion openHRegion(Path rootDir, final HRegionInfo info,
6355       final HTableDescriptor htd, final WAL wal, final Configuration conf)
6356   throws IOException {
6357     return openHRegion(rootDir, info, htd, wal, conf, null, null);
6358   }
6359
6360   /**
6361    * Open a Region.
6362    * @param rootDir Root directory for HBase instance
6363    * @param info Info for region to be opened.
6364    * @param htd the table descriptor
6365    * @param wal WAL for region to use. This method will call
6366    * WAL#setSequenceNumber(long) passing the result of the call to
6367    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6368    * up to date.  The HRegionServer does this every time it opens a new region.
6369    * @param conf The Configuration object to use.
6370    * @param rsServices An interface we can request flushes against.
6371    * @param reporter An interface we can report progress against.
6372    * @return new HRegion
6373    * @throws IOException
6374    */
6375   public static HRegion openHRegion(final Path rootDir, final HRegionInfo info,
6376       final HTableDescriptor htd, final WAL wal, final Configuration conf,
6377       final RegionServerServices rsServices,
6378       final CancelableProgressable reporter)
6379   throws IOException {
6380     FileSystem fs = null;
6381     if (rsServices != null) {
6382       fs = rsServices.getFileSystem();
6383     }
6384     if (fs == null) {
6385       fs = FileSystem.get(conf);
6386     }
6387     return openHRegion(conf, fs, rootDir, info, htd, wal, rsServices, reporter);
6388   }
6389
6390   /**
6391    * Open a Region.
6392    * @param conf The Configuration object to use.
6393    * @param fs Filesystem to use
6394    * @param rootDir Root directory for HBase instance
6395    * @param info Info for region to be opened.
6396    * @param htd the table descriptor
6397    * @param wal WAL for region to use. This method will call
6398    * WAL#setSequenceNumber(long) passing the result of the call to
6399    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6400    * up to date.  The HRegionServer does this every time it opens a new region.
6401    * @return new HRegion
6402    * @throws IOException
6403    */
6404   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
6405       final Path rootDir, final HRegionInfo info, final HTableDescriptor htd, final WAL wal)
6406       throws IOException {
6407     return openHRegion(conf, fs, rootDir, info, htd, wal, null, null);
6408   }
6409
6410   /**
6411    * Open a Region.
6412    * @param conf The Configuration object to use.
6413    * @param fs Filesystem to use
6414    * @param rootDir Root directory for HBase instance
6415    * @param info Info for region to be opened.
6416    * @param htd the table descriptor
6417    * @param wal WAL for region to use. This method will call
6418    * WAL#setSequenceNumber(long) passing the result of the call to
6419    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6420    * up to date.  The HRegionServer does this every time it opens a new region.
6421    * @param rsServices An interface we can request flushes against.
6422    * @param reporter An interface we can report progress against.
6423    * @return new HRegion
6424    * @throws IOException
6425    */
6426   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
6427       final Path rootDir, final HRegionInfo info, final HTableDescriptor htd, final WAL wal,
6428       final RegionServerServices rsServices, final CancelableProgressable reporter)
6429       throws IOException {
6430     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
6431     return openHRegion(conf, fs, rootDir, tableDir, info, htd, wal, rsServices, reporter);
6432   }
6433
6434   /**
6435    * Open a Region.
6436    * @param conf The Configuration object to use.
6437    * @param fs Filesystem to use
6438    * @param rootDir Root directory for HBase instance
6439    * @param info Info for region to be opened.
6440    * @param htd the table descriptor
6441    * @param wal WAL for region to use. This method will call
6442    * WAL#setSequenceNumber(long) passing the result of the call to
6443    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6444    * up to date.  The HRegionServer does this every time it opens a new region.
6445    * @param rsServices An interface we can request flushes against.
6446    * @param reporter An interface we can report progress against.
6447    * @return new HRegion
6448    * @throws IOException
6449    */
6450   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
6451       final Path rootDir, final Path tableDir, final HRegionInfo info, final HTableDescriptor htd,
6452       final WAL wal, final RegionServerServices rsServices,
6453       final CancelableProgressable reporter)
6454       throws IOException {
6455     if (info == null) throw new NullPointerException("Passed region info is null");
6456     if (LOG.isDebugEnabled()) {
6457       LOG.debug("Opening region: " + info);
6458     }
6459     HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, rsServices);
6460     return r.openHRegion(reporter);
6461   }
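
  // Illustrative sketch: all of the openHRegion() overloads above funnel into the variant just
  // shown. A call from code that already holds the region's HRegionInfo and table descriptor
  // (outside a region server, rsServices and the progress reporter may simply be null):
  //
  //   HRegion r = HRegion.openHRegion(info, htd, wal, conf);
  //   try {
  //     // ... serve reads and writes ...
  //   } finally {
  //     r.close();
  //   }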
6462
6463   @VisibleForTesting
6464   public NavigableMap<byte[], Integer> getReplicationScope() {
6465     return this.replicationScope;
6466   }
6467
6468   /**
6469    * Useful when reopening a closed region (normally for unit tests)
6470    * @param other original object
6471    * @param reporter An interface we can report progress against.
6472    * @return new HRegion
6473    * @throws IOException
6474    */
6475   public static HRegion openHRegion(final HRegion other, final CancelableProgressable reporter)
6476       throws IOException {
6477     HRegionFileSystem regionFs = other.getRegionFileSystem();
6478     HRegion r = newHRegion(regionFs.getTableDir(), other.getWAL(), regionFs.getFileSystem(),
6479         other.baseConf, other.getRegionInfo(), other.getTableDesc(), null);
6480     return r.openHRegion(reporter);
6481   }
6482
6483   public static Region openHRegion(final Region other, final CancelableProgressable reporter)
6484         throws IOException {
6485     return openHRegion((HRegion)other, reporter);
6486   }
6487
6488   /**
6489    * Open HRegion.
6490    * Calls initialize and sets sequenceId.
6491    * @return Returns <code>this</code>
6492    * @throws IOException
6493    */
6494   protected HRegion openHRegion(final CancelableProgressable reporter)
6495   throws IOException {
6496     // Refuse to open the region if we are missing local compression support
6497     checkCompressionCodecs();
6498     // Refuse to open the region if encryption configuration is incorrect or
6499     // codec support is missing
6500     checkEncryption();
6501     // Refuse to open the region if a required class cannot be loaded
6502     checkClassLoading();
6503     this.openSeqNum = initialize(reporter);
6504     this.mvcc.advanceTo(openSeqNum);
6505     if (wal != null && getRegionServerServices() != null && !writestate.readOnly
6506         && !recovering) {
6507       // Only write the region open event marker to WAL if (1) we are not read-only
6508       // (2) dist log replay is off or we are not recovering. In case region is
6509       // recovering, the open event will be written at setRecovering(false)
6510       writeRegionOpenMarker(wal, openSeqNum);
6511     }
6512     return this;
6513   }
6514
6515   public static void warmupHRegion(final HRegionInfo info,
6516       final HTableDescriptor htd, final WAL wal, final Configuration conf,
6517       final RegionServerServices rsServices,
6518       final CancelableProgressable reporter)
6519       throws IOException {
6520
6521     if (info == null) throw new NullPointerException("Passed region info is null");
6522
6523     if (LOG.isDebugEnabled()) {
6524       LOG.debug("HRegion.Warming up region: " + info);
6525     }
6526
6527     Path rootDir = FSUtils.getRootDir(conf);
6528     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
6529
6530     FileSystem fs = null;
6531     if (rsServices != null) {
6532       fs = rsServices.getFileSystem();
6533     }
6534     if (fs == null) {
6535       fs = FileSystem.get(conf);
6536     }
6537
6538     HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, null);
6539     r.initializeWarmup(reporter);
6540   }
6541
6542
6543   private void checkCompressionCodecs() throws IOException {
6544     for (HColumnDescriptor fam: this.htableDescriptor.getColumnFamilies()) {
6545       CompressionTest.testCompression(fam.getCompressionType());
6546       CompressionTest.testCompression(fam.getCompactionCompressionType());
6547     }
6548   }
6549
6550   private void checkEncryption() throws IOException {
6551     for (HColumnDescriptor fam: this.htableDescriptor.getColumnFamilies()) {
6552       EncryptionTest.testEncryption(conf, fam.getEncryptionType(), fam.getEncryptionKey());
6553     }
6554   }
6555
6556   private void checkClassLoading() throws IOException {
6557     RegionSplitPolicy.getSplitPolicyClass(this.htableDescriptor, conf);
6558     RegionCoprocessorHost.testTableCoprocessorAttrs(conf, this.htableDescriptor);
6559   }
6560
6561   /**
6562    * Create a daughter region given a temp directory with the region data.
6563    * @param hri Spec. for daughter region to open.
6564    * @throws IOException
6565    */
6566   HRegion createDaughterRegionFromSplits(final HRegionInfo hri) throws IOException {
6567     // Move the files from the temporary .splits to the final /table/region directory
6568     fs.commitDaughterRegion(hri);
6569
6570     // Create the daughter HRegion instance
6571     HRegion r = HRegion.newHRegion(this.fs.getTableDir(), this.getWAL(), fs.getFileSystem(),
6572         this.getBaseConf(), hri, this.getTableDesc(), rsServices);
6573     r.readRequestsCount.set(this.getReadRequestsCount() / 2);
6574     r.filteredReadRequestsCount.set(this.getFilteredReadRequestsCount() / 2);
6575     r.writeRequestsCount.set(this.getWriteRequestsCount() / 2);
6576     return r;
6577   }
6578
6579   /**
6580    * Create a merged region given a temp directory with the region data.
6581    * @param region_b another merging region
6582    * @return merged HRegion
6583    * @throws IOException
6584    */
6585   HRegion createMergedRegionFromMerges(final HRegionInfo mergedRegionInfo,
6586       final HRegion region_b) throws IOException {
6587     HRegion r = HRegion.newHRegion(this.fs.getTableDir(), this.getWAL(),
6588         fs.getFileSystem(), this.getBaseConf(), mergedRegionInfo,
6589         this.getTableDesc(), this.rsServices);
6590     r.readRequestsCount.set(this.getReadRequestsCount()
6591         + region_b.getReadRequestsCount());
6592     r.filteredReadRequestsCount.set(this.getFilteredReadRequestsCount()
6593       + region_b.getFilteredReadRequestsCount());
6594     r.writeRequestsCount.set(this.getWriteRequestsCount()
6595         + region_b.getWriteRequestsCount());
6596
6597     this.fs.commitMergedRegion(mergedRegionInfo);
6598     return r;
6599   }
6600
6601   /**
6602    * Inserts a new region's meta information into the passed
6603    * <code>meta</code> region. Used by the HMaster bootstrap code when adding a
6604    * new table to the hbase:meta table.
6605    *
6606    * @param meta hbase:meta HRegion to be updated
6607    * @param r HRegion to add to <code>meta</code>
6608    *
6609    * @throws IOException
6610    */
6611   // TODO remove since only test and merge use this
6612   public static void addRegionToMETA(final HRegion meta, final HRegion r) throws IOException {
6613     meta.checkResources();
6614     // The row key is the region name
6615     byte[] row = r.getRegionInfo().getRegionName();
6616     final long now = EnvironmentEdgeManager.currentTime();
6617     final List<Cell> cells = new ArrayList<Cell>(2);
6618     cells.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
6619       HConstants.REGIONINFO_QUALIFIER, now,
6620       r.getRegionInfo().toByteArray()));
6621     // Also record the meta table's version in this row.
6622     cells.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
6623       HConstants.META_VERSION_QUALIFIER, now,
6624       Bytes.toBytes(HConstants.META_VERSION)));
6625     meta.put(row, HConstants.CATALOG_FAMILY, cells);
6626   }
6627
6628   /**
6629    * Computes the Path of the HRegion
6630    *
6631    * @param tabledir qualified path for table
6632    * @param name ENCODED region name
6633    * @return Path of HRegion directory
6634    * @deprecated For tests only; to be removed.
6635    */
6636   @Deprecated
6637   public static Path getRegionDir(final Path tabledir, final String name) {
6638     return new Path(tabledir, name);
6639   }
6640
6641   /**
6642    * Computes the Path of the HRegion
6643    *
6644    * @param rootdir qualified path of HBase root directory
6645    * @param info HRegionInfo for the region
6646    * @return qualified path of region directory
6647    * @deprecated For tests only; to be removed.
6648    */
6649   @Deprecated
6650   @VisibleForTesting
6651   public static Path getRegionDir(final Path rootdir, final HRegionInfo info) {
6652     return new Path(
6653       FSUtils.getTableDir(rootdir, info.getTable()), info.getEncodedName());
6654   }
6655
6656   /**
6657    * Determines if the specified row is within the row range of the
6658    * given HRegionInfo
6659    *
6660    * @param info HRegionInfo that specifies the row range
6661    * @param row row to be checked
6662    * @return true if the row is within the range specified by the HRegionInfo
6663    */
6664   public static boolean rowIsInRange(HRegionInfo info, final byte [] row) {
6665     return ((info.getStartKey().length == 0) ||
6666         (Bytes.compareTo(info.getStartKey(), row) <= 0)) &&
6667         ((info.getEndKey().length == 0) ||
6668             (Bytes.compareTo(info.getEndKey(), row) > 0));
6669   }
6670
6671   public static boolean rowIsInRange(HRegionInfo info, final byte [] row, final int offset,
6672       final short length) {
6673     return ((info.getStartKey().length == 0) ||
6674         (Bytes.compareTo(info.getStartKey(), 0, info.getStartKey().length,
6675           row, offset, length) <= 0)) &&
6676         ((info.getEndKey().length == 0) ||
6677           (Bytes.compareTo(info.getEndKey(), 0, info.getEndKey().length, row, offset, length) > 0));
6678   }
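
  // Illustrative sketch: rowIsInRange() is the usual start-key-inclusive / end-key-exclusive
  // containment check. Assuming an HRegionInfo `hri` whose start key is "b" and end key is "d":
  //
  //   HRegion.rowIsInRange(hri, Bytes.toBytes("b"));  // true  -- start key is inclusive
  //   HRegion.rowIsInRange(hri, Bytes.toBytes("c"));  // true
  //   HRegion.rowIsInRange(hri, Bytes.toBytes("d"));  // false -- end key is exclusive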
6679
6680   /**
6681    * Merge two HRegions.  The regions must be adjacent and must not overlap.
6682    *
6683    * @return new merged HRegion
6684    * @throws IOException
6685    */
6686   public static HRegion mergeAdjacent(final HRegion srcA, final HRegion srcB)
6687   throws IOException {
6688     HRegion a = srcA;
6689     HRegion b = srcB;
6690
6691     // Make sure that srcA comes first; important for key-ordering during
6692     // write of the merged file.
6693     if (srcA.getRegionInfo().getStartKey() == null) {
6694       if (srcB.getRegionInfo().getStartKey() == null) {
6695         throw new IOException("Cannot merge two regions with null start key");
6696       }
6697       // A's start key is null but B's isn't. Assume A comes before B
6698     } else if ((srcB.getRegionInfo().getStartKey() == null) ||
6699       (Bytes.compareTo(srcA.getRegionInfo().getStartKey(),
6700         srcB.getRegionInfo().getStartKey()) > 0)) {
6701       a = srcB;
6702       b = srcA;
6703     }
6704
6705     if (!(Bytes.compareTo(a.getRegionInfo().getEndKey(),
6706         b.getRegionInfo().getStartKey()) == 0)) {
6707       throw new IOException("Cannot merge non-adjacent regions");
6708     }
6709     return merge(a, b);
6710   }
6711
6712   /**
6713    * Merge two regions whether they are adjacent or not.
6714    *
6715    * @param a region a
6716    * @param b region b
6717    * @return new merged region
6718    * @throws IOException
6719    */
6720   public static HRegion merge(final HRegion a, final HRegion b) throws IOException {
6721     if (!a.getRegionInfo().getTable().equals(b.getRegionInfo().getTable())) {
6722       throw new IOException("Regions do not belong to the same table");
6723     }
6724
6725     FileSystem fs = a.getRegionFileSystem().getFileSystem();
6726     // Make sure each region's cache is empty
6727     a.flush(true);
6728     b.flush(true);
6729
6730     // Compact each region so we only have one store file per family
6731     a.compact(true);
6732     if (LOG.isDebugEnabled()) {
6733       LOG.debug("Files for region: " + a);
6734       a.getRegionFileSystem().logFileSystemState(LOG);
6735     }
6736     b.compact(true);
6737     if (LOG.isDebugEnabled()) {
6738       LOG.debug("Files for region: " + b);
6739       b.getRegionFileSystem().logFileSystemState(LOG);
6740     }
6741
6742     RegionMergeTransactionImpl rmt = new RegionMergeTransactionImpl(a, b, true);
6743     if (!rmt.prepare(null)) {
6744       throw new IOException("Unable to merge regions " + a + " and " + b);
6745     }
6746     HRegionInfo mergedRegionInfo = rmt.getMergedRegionInfo();
6747     LOG.info("starting merge of regions: " + a + " and " + b
6748         + " into new region " + mergedRegionInfo.getRegionNameAsString()
6749         + " with start key <"
6750         + Bytes.toStringBinary(mergedRegionInfo.getStartKey())
6751         + "> and end key <"
6752         + Bytes.toStringBinary(mergedRegionInfo.getEndKey()) + ">");
6753     HRegion dstRegion;
6754     try {
6755       dstRegion = (HRegion)rmt.execute(null, null);
6756     } catch (IOException ioe) {
6757       rmt.rollback(null, null);
6758       throw new IOException("Failed merging region " + a + " and " + b
6759           + ", and successfully rolled back");
6760     }
6761     dstRegion.compact(true);
6762
6763     if (LOG.isDebugEnabled()) {
6764       LOG.debug("Files for new region");
6765       dstRegion.getRegionFileSystem().logFileSystemState(LOG);
6766     }
6767
6768     // clear the compacted files if any
6769     for (Store s : dstRegion.getStores()) {
6770       s.closeAndArchiveCompactedFiles();
6771     }
6772     if (dstRegion.getRegionFileSystem().hasReferences(dstRegion.getTableDesc())) {
6773       throw new IOException("Merged region " + dstRegion
6774           + " still has references after the compaction, is compaction canceled?");
6775     }
6776
6777     // Archiving the 'A' region
6778     HFileArchiver.archiveRegion(a.getBaseConf(), fs, a.getRegionInfo());
6779     // Archiving the 'B' region
6780     HFileArchiver.archiveRegion(b.getBaseConf(), fs, b.getRegionInfo());
6781
6782     LOG.info("merge completed. New region is " + dstRegion);
6783     return dstRegion;
6784   }
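
  // Illustrative sketch of the merge helpers above, e.g. from offline merge tooling or tests
  // where both HRegion instances are open in the current process:
  //
  //   HRegion merged = HRegion.mergeAdjacent(regionA, regionB); // requires adjacent key ranges
  //   HRegion anyMerged = HRegion.merge(regionA, regionB);      // ignores adjacency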
6785
6786   @Override
6787   public Result get(final Get get) throws IOException {
6788     prepareGet(get);
6789     List<Cell> results = get(get, true);
6790     boolean stale = this.getRegionInfo().getReplicaId() != 0;
6791     return Result.create(results, get.isCheckExistenceOnly() ? !results.isEmpty() : null, stale);
6792   }
6793
6794   void prepareGet(final Get get) throws IOException, NoSuchColumnFamilyException {
6795     checkRow(get.getRow(), "Get");
6796     // Verify families are all valid
6797     if (get.hasFamilies()) {
6798       for (byte [] family: get.familySet()) {
6799         checkFamily(family);
6800       }
6801     } else { // Adding all families to scanner
6802       for (byte[] family: this.htableDescriptor.getFamiliesKeys()) {
6803         get.addFamily(family);
6804       }
6805     }
6806   }
6807
6808   @Override
6809   public List<Cell> get(Get get, boolean withCoprocessor) throws IOException {
6810
6811     List<Cell> results = new ArrayList<Cell>();
6812
6813     // pre-get CP hook
6814     if (withCoprocessor && (coprocessorHost != null)) {
6815       if (coprocessorHost.preGet(get, results)) {
6816         return results;
6817       }
6818     }
6819     long before =  EnvironmentEdgeManager.currentTime();
6820     Scan scan = new Scan(get);
6821
6822     RegionScanner scanner = null;
6823     try {
6824       scanner = getScanner(scan);
6825       scanner.next(results);
6826     } finally {
6827       if (scanner != null)
6828         scanner.close();
6829     }
6830
6831     // post-get CP hook
6832     if (withCoprocessor && (coprocessorHost != null)) {
6833       coprocessorHost.postGet(get, results);
6834     }
6835
6836     metricsUpdateForGet(results, before);
6837
6838     return results;
6839   }
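
  // Illustrative sketch: get() above is the server-side path behind client Gets and is
  // implemented as a single-row Scan. Assumes an open HRegion `region` and a column family
  // byte[] `fam`:
  //
  //   Get g = new Get(Bytes.toBytes("row-1"));
  //   g.addColumn(fam, Bytes.toBytes("q"));
  //   Result result = region.get(g);            // runs the pre/postGet coprocessor hooks
  //   List<Cell> cells = region.get(g, false);  // same read, skipping coprocessors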
6840
6841   void metricsUpdateForGet(List<Cell> results, long before) {
6842     if (this.metricsRegion != null) {
6843       long totalSize = 0L;
6844       for (Cell cell : results) {
6845         // This should give an estimate of the size of the cell in the result. Why do we need
6846         // to know how the codec serializes it??
6847         totalSize += CellUtil.estimatedSerializedSizeOf(cell);
6848       }
6849       this.metricsRegion.updateGetSize(totalSize);
6850       this.metricsRegion.updateGet(EnvironmentEdgeManager.currentTime() - before);
6851     }
6852   }
6853
6854   @Override
6855   public void mutateRow(RowMutations rm) throws IOException {
6856     // Don't need nonces here - RowMutations only supports puts and deletes
6857     mutateRowsWithLocks(rm.getMutations(), Collections.singleton(rm.getRow()));
6858   }
6859
6860   /**
6861    * Perform atomic mutations within the region w/o nonces.
6862    * See {@link #mutateRowsWithLocks(Collection, Collection, long, long)}
6863    */
6864   public void mutateRowsWithLocks(Collection<Mutation> mutations,
6865       Collection<byte[]> rowsToLock) throws IOException {
6866     mutateRowsWithLocks(mutations, rowsToLock, HConstants.NO_NONCE, HConstants.NO_NONCE);
6867   }
6868
6869   /**
6870    * Perform atomic mutations within the region.
6871    * @param mutations The list of mutations to perform.
6872    * <code>mutations</code> can contain operations for multiple rows.
6873    * Caller has to ensure that all rows are contained in this region.
6874    * @param rowsToLock Rows to lock. If multiple rows are locked, care should be
6875    *   taken that <code>rowsToLock</code> is sorted in order to avoid deadlocks.
6876    * @param nonceGroup Optional nonce group of the operation (client Id)
6877    * @param nonce Optional nonce of the operation (unique random id to ensure
6878    *   "more idempotence")
6879    * @throws IOException
6880    */
6881   @Override
6882   public void mutateRowsWithLocks(Collection<Mutation> mutations,
6883       Collection<byte[]> rowsToLock, long nonceGroup, long nonce) throws IOException {
6884     MultiRowMutationProcessor proc = new MultiRowMutationProcessor(mutations, rowsToLock);
6885     processRowsWithLocks(proc, -1, nonceGroup, nonce);
6886   }
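
  // Illustrative sketch of an atomic multi-row mutation through the overloads above. All rows
  // must belong to this region, and rowsToLock should be sorted to avoid deadlocks (see the
  // Javadoc above). `region` and the column family byte[] `fam` are assumed to exist:
  //
  //   Put p1 = new Put(Bytes.toBytes("row-a"));
  //   p1.addColumn(fam, Bytes.toBytes("q"), Bytes.toBytes("v1"));
  //   Put p2 = new Put(Bytes.toBytes("row-b"));
  //   p2.addColumn(fam, Bytes.toBytes("q"), Bytes.toBytes("v2"));
  //   region.mutateRowsWithLocks(Arrays.<Mutation>asList(p1, p2),
  //       Arrays.asList(p1.getRow(), p2.getRow()));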
6887
6888   /**
6889    * @return statistics about the current load of the region
6890    */
6891   public ClientProtos.RegionLoadStats getLoadStatistics() {
6892     if (!regionStatsEnabled) {
6893       return null;
6894     }
6895     ClientProtos.RegionLoadStats.Builder stats = ClientProtos.RegionLoadStats.newBuilder();
6896     stats.setMemstoreLoad((int) (Math.min(100, (this.memstoreSize.get() * 100) / this
6897         .memstoreFlushSize)));
6898     stats.setHeapOccupancy((int)(rsServices.getHeapMemoryManager().getHeapOccupancyPercent()*100));
6899     stats.setCompactionPressure((int)(rsServices.getCompactionPressure()*100) > 100 ? 100 :
6900                 (int)(rsServices.getCompactionPressure()*100));
6901     return stats.build();
6902   }
6903
6904   @Override
6905   public void processRowsWithLocks(RowProcessor<?,?> processor) throws IOException {
6906     processRowsWithLocks(processor, rowProcessorTimeout, HConstants.NO_NONCE,
6907       HConstants.NO_NONCE);
6908   }
6909
6910   @Override
6911   public void processRowsWithLocks(RowProcessor<?,?> processor, long nonceGroup, long nonce)
6912       throws IOException {
6913     processRowsWithLocks(processor, rowProcessorTimeout, nonceGroup, nonce);
6914   }
6915
6916   @Override
6917   public void processRowsWithLocks(RowProcessor<?,?> processor, long timeout,
6918       long nonceGroup, long nonce) throws IOException {
6919     for (byte[] row : processor.getRowsToLock()) {
6920       checkRow(row, "processRowsWithLocks");
6921     }
6922     if (!processor.readOnly()) {
6923       checkReadOnly();
6924     }
6925     checkResources();
6926     startRegionOperation();
6927     WALEdit walEdit = new WALEdit();
6928
6929     // STEP 1. Run pre-process hook
6930     preProcess(processor, walEdit);
6931     // Short circuit the read only case
6932     if (processor.readOnly()) {
6933       try {
6934         long now = EnvironmentEdgeManager.currentTime();
6935         doProcessRowWithTimeout(processor, now, this, null, null, timeout);
6936         processor.postProcess(this, walEdit, true);
6937       } finally {
6938         closeRegionOperation();
6939       }
6940       return;
6941     }
6942
6943     boolean locked;
6944     List<RowLock> acquiredRowLocks;
6945     long addedSize = 0;
6946     List<Mutation> mutations = new ArrayList<Mutation>();
6947     Collection<byte[]> rowsToLock = processor.getRowsToLock();
6948     // This is assigned by mvcc either explicitly in the below or in the guts of the WAL append
6949     // when it assigns the edit a sequence id (a.k.a. the mvcc write number).
6950     WriteEntry writeEntry = null;
6951     try {
6952       // STEP 2. Acquire the row lock(s)
6953       acquiredRowLocks = new ArrayList<RowLock>(rowsToLock.size());
6954       for (byte[] row : rowsToLock) {
6955         // Attempt to lock all involved rows, throw if any lock times out
6956         // use a writer lock for mixed reads and writes
6957         acquiredRowLocks.add(getRowLockInternal(row, false));
6958       }
6959       // STEP 3. Region lock
6960       lock(this.updatesLock.readLock(), acquiredRowLocks.size() == 0 ? 1 : acquiredRowLocks.size());
6961       locked = true;
6962       boolean success = false;
6963       long now = EnvironmentEdgeManager.currentTime();
6964       try {
6965         // STEP 4. Let the processor scan the rows, generate mutations and add waledits
6966         doProcessRowWithTimeout(processor, now, this, mutations, walEdit, timeout);
6967         if (!mutations.isEmpty()) {
6968           // STEP 5. Call the preBatchMutate hook
6969           processor.preBatchMutate(this, walEdit);
6970
6971           // STEP 6. Append and sync if walEdit has data to write out.
6972           if (!walEdit.isEmpty()) {
6973             writeEntry = doWALAppend(walEdit, getEffectiveDurability(processor.useDurability()),
6974                 processor.getClusterIds(), now, nonceGroup, nonce);
6975           } else {
6976             // We are here if WAL is being skipped.
6977             writeEntry = this.mvcc.begin();
6978           }
6979
6980           // STEP 7. Apply to memstore
6981           long sequenceId = writeEntry.getWriteNumber();
6982           for (Mutation m : mutations) {
6983             // Handle any tag based cell features.
6984             // TODO: Do we need to call rewriteCellTags down in applyToMemstore()? Why not before
6985             // so tags go into WAL?
6986             rewriteCellTags(m.getFamilyCellMap(), m);
6987             for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) {
6988               Cell cell = cellScanner.current();
6989               if (walEdit.isEmpty()) {
6990                 // If walEdit is empty, we put nothing in WAL. WAL stamps Cells with sequence id.
6991                 // If no WAL, need to stamp it here.
6992                 CellUtil.setSequenceId(cell, sequenceId);
6993               }
6994               Store store = getStore(cell);
6995               addedSize += applyToMemstore(store, cell, sequenceId);
6996             }
6997           }
6998           // STEP 8. Complete mvcc.
6999           mvcc.completeAndWait(writeEntry);
7000           writeEntry = null;
7001
7002           // STEP 9. Release region lock
7003           if (locked) {
7004             this.updatesLock.readLock().unlock();
7005             locked = false;
7006           }
7007
7008           // STEP 10. Release row lock(s)
7009           releaseRowLocks(acquiredRowLocks);
7010
7011           // STEP 11. call postBatchMutate hook
7012           processor.postBatchMutate(this);
7013         }
7014         success = true;
7015       } finally {
7016         // Call complete rather than completeAndWait because we probably had an error if writeEntry != null
7017         if (writeEntry != null) mvcc.complete(writeEntry);
7018         if (locked) {
7019           this.updatesLock.readLock().unlock();
7020         }
7021         // release locks if some were acquired but another timed out
7022         releaseRowLocks(acquiredRowLocks);
7023       }
7024
7025       // 12. Run post-process hook
7026       processor.postProcess(this, walEdit, success);
7027     } finally {
7028       closeRegionOperation();
7029       if (!mutations.isEmpty()) {
7030         long newSize = this.addAndGetGlobalMemstoreSize(addedSize);
7031         requestFlushIfNeeded(newSize);
7032       }
7033     }
7034   }
7035
7036   private void preProcess(final RowProcessor<?,?> processor, final WALEdit walEdit)
7037   throws IOException {
7038     try {
7039       processor.preProcess(this, walEdit);
7040     } catch (IOException e) {
7041       closeRegionOperation();
7042       throw e;
7043     }
7044   }
7045
7046   private void doProcessRowWithTimeout(final RowProcessor<?,?> processor,
7047                                        final long now,
7048                                        final HRegion region,
7049                                        final List<Mutation> mutations,
7050                                        final WALEdit walEdit,
7051                                        final long timeout) throws IOException {
7052     // Short circuit the no time bound case.
7053     if (timeout < 0) {
7054       try {
7055         processor.process(now, region, mutations, walEdit);
7056       } catch (IOException e) {
7057         LOG.warn("RowProcessor:" + processor.getClass().getName() +
7058             " throws Exception on row(s):" +
7059             Bytes.toStringBinary(
7060               processor.getRowsToLock().iterator().next()) + "...", e);
7061         throw e;
7062       }
7063       return;
7064     }
7065
7066     // Case with time bound
7067     FutureTask<Void> task =
7068       new FutureTask<Void>(new Callable<Void>() {
7069         @Override
7070         public Void call() throws IOException {
7071           try {
7072             processor.process(now, region, mutations, walEdit);
7073             return null;
7074           } catch (IOException e) {
7075             LOG.warn("RowProcessor:" + processor.getClass().getName() +
7076                 " throws Exception on row(s):" +
7077                 Bytes.toStringBinary(
7078                     processor.getRowsToLock().iterator().next()) + "...", e);
7079             throw e;
7080           }
7081         }
7082       });
7083     rowProcessorExecutor.execute(task);
7084     try {
7085       task.get(timeout, TimeUnit.MILLISECONDS);
7086     } catch (TimeoutException te) {
7087       LOG.error("RowProcessor timeout:" + timeout + " ms on row(s):" +
7088           Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) +
7089           "...");
7090       throw new IOException(te);
7091     } catch (Exception e) {
7092       throw new IOException(e);
7093     }
7094   }
7095
7096   public Result append(Append append) throws IOException {
7097     return append(append, HConstants.NO_NONCE, HConstants.NO_NONCE);
7098   }
7099
7100   @Override
7101   public Result append(Append mutation, long nonceGroup, long nonce) throws IOException {
7102     return doDelta(Operation.APPEND, mutation, nonceGroup, nonce, mutation.isReturnResults());
7103   }
7104
7105   public Result increment(Increment increment) throws IOException {
7106     return increment(increment, HConstants.NO_NONCE, HConstants.NO_NONCE);
7107   }
7108
7109   @Override
7110   public Result increment(Increment mutation, long nonceGroup, long nonce)
7111   throws IOException {
7112     return doDelta(Operation.INCREMENT, mutation, nonceGroup, nonce, mutation.isReturnResults());
7113   }
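
  // Illustrative sketch: increment() and append() both route through doDelta() below. Assumes an
  // open HRegion `region` and a column family byte[] `fam`:
  //
  //   Increment inc = new Increment(Bytes.toBytes("counter-row"));
  //   inc.addColumn(fam, Bytes.toBytes("hits"), 1L);
  //   Result r = region.increment(inc);  // returns the post-increment cells unless the client
  //                                      // asked not to have results returned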
7114
7115   /**
7116    * Add "deltas" to Cells. Deltas are increments or appends. Switch on <code>op</code>.
7117    *
7118    * <p>If increment, add deltas to current values or if an append, then
7119    * append the deltas to the current Cell values.
7120    *
7121    * <p>Append and Increment code paths are mostly the same. They differ in just a few places.
7122    * This method does the code path for increment and append and then in key spots, switches
7123    * on the passed in <code>op</code> to do increment or append specific paths.
7124    */
7125   private Result doDelta(Operation op, Mutation mutation, long nonceGroup, long nonce,
7126       boolean returnResults)
7127   throws IOException {
7128     checkReadOnly();
7129     checkResources();
7130     checkRow(mutation.getRow(), op.toString());
7131     checkFamilies(mutation.getFamilyCellMap().keySet());
7132     this.writeRequestsCount.increment();
7133     WriteEntry writeEntry = null;
7134     startRegionOperation(op);
7135     long accumulatedResultSize = 0;
7136     List<Cell> results = returnResults? new ArrayList<Cell>(mutation.size()): null;
7137     RowLock rowLock = getRowLockInternal(mutation.getRow(), false);
7138     try {
7139       lock(this.updatesLock.readLock());
7140       try {
7141         Result cpResult = doCoprocessorPreCall(op, mutation);
7142         if (cpResult != null) {
7143           return returnResults? cpResult: null;
7144         }
7145         Durability effectiveDurability = getEffectiveDurability(mutation.getDurability());
7146         Map<Store, List<Cell>> forMemStore =
7147             new HashMap<Store, List<Cell>>(mutation.getFamilyCellMap().size());
7148         // Reckon Cells to apply to WAL --  in returned walEdit -- and what to add to memstore and
7149         // what to return back to the client (in 'forMemStore' and 'results' respectively).
7150         WALEdit walEdit = reckonDeltas(op, mutation, effectiveDurability, forMemStore, results);
7151         // Actually write to WAL now if a walEdit to apply.
7152         if (walEdit != null && !walEdit.isEmpty()) {
7153           writeEntry = doWALAppend(walEdit, durability, nonceGroup, nonce);
7154         } else {
7155           // If walEdit is empty, it means we skipped the WAL; update counters and start an mvcc
7156           // transaction.
7157           recordMutationWithoutWal(mutation.getFamilyCellMap());
7158           writeEntry = mvcc.begin();
7159         }
7160         // Now write to MemStore. Do it a column family at a time.
7161         long sequenceId = writeEntry.getWriteNumber();
7162         for (Map.Entry<Store, List<Cell>> e: forMemStore.entrySet()) {
7163           accumulatedResultSize +=
7164               applyToMemstore(e.getKey(), e.getValue(), true, false, sequenceId);
7165         }
7166         mvcc.completeAndWait(writeEntry);
7167         writeEntry = null;
7168       } finally {
7169         this.updatesLock.readLock().unlock();
7170       }
7171       // If results is null, then client asked that we not return the calculated results.
7172       return results != null && returnResults? Result.create(results): null;
7173     } finally {
7174       // Call complete always, even on success. doDelta is doing a Get READ_UNCOMMITTED when it goes
7175       // to get the current value under an exclusive lock, so there is no need to wait before
7176       // returning to the client. Means the only way to read-your-own-increment or append is to
7177       // come in with a 0 increment.
7178       if (writeEntry != null) mvcc.complete(writeEntry);
7179       rowLock.release();
7180       // Request a cache flush if over the limit.  Do it outside update lock.
7181       if (isFlushSize(this.addAndGetGlobalMemstoreSize(accumulatedResultSize))) requestFlush();
7182       closeRegionOperation(op);
7183       if (this.metricsRegion != null) {
7184         switch (op) {
7185           case INCREMENT:
7186             this.metricsRegion.updateIncrement();
7187             break;
7188           case APPEND:
7189             this.metricsRegion.updateAppend();
7190             break;
7191           default:
7192             break;
7193         }
7194       }
7195     }
7196   }
7197
7198   private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, long nonceGroup,
7199       long nonce)
7200   throws IOException {
7201     return doWALAppend(walEdit, durability, WALKey.EMPTY_UUIDS, System.currentTimeMillis(),
7202       nonceGroup, nonce);
7203   }
7204
7205   /**
7206    * @return writeEntry associated with this append
7207    */
7208   private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, List<UUID> clusterIds,
7209       long now, long nonceGroup, long nonce)
7210   throws IOException {
7211     WriteEntry writeEntry = null;
7212     // Using default cluster id, as this can only happen in the originating cluster.
7213     // A slave cluster receives the final value (not the delta) as a Put. We use HLogKey
7214     // here instead of WALKey directly to support legacy coprocessors.
7215     WALKey walKey = new WALKey(this.getRegionInfo().getEncodedNameAsBytes(),
7216       this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now, clusterIds,
7217       nonceGroup, nonce, mvcc, this.getReplicationScope());
7218     try {
7219       long txid =
7220         this.wal.append(this.getRegionInfo(), walKey, walEdit, true);
7221       // Call sync on our edit.
7222       if (txid != 0) sync(txid, durability);
7223       writeEntry = walKey.getWriteEntry();
7224     } catch (IOException ioe) {
7225       if (walKey != null) mvcc.complete(walKey.getWriteEntry());
7226       throw ioe;
7227     }
7228     return writeEntry;
7229   }
7230
7231   /**
7232    * Do coprocessor pre-increment or pre-append call.
7233    * @return Result returned out of the coprocessor, which means bypass all further processing and
7234    *  return the proffered Result instead, or null which means proceed.
7235    */
7236   private Result doCoprocessorPreCall(final Operation op, final Mutation mutation)
7237   throws IOException {
7238     Result result = null;
7239     if (this.coprocessorHost != null) {
7240       switch(op) {
7241         case INCREMENT:
7242           result = this.coprocessorHost.preIncrementAfterRowLock((Increment)mutation);
7243           break;
7244         case APPEND:
7245           result = this.coprocessorHost.preAppendAfterRowLock((Append)mutation);
7246           break;
7247         default: throw new UnsupportedOperationException(op.toString());
7248       }
7249     }
7250     return result;
7251   }
7252
7253   /**
7254    * Reckon the Cells to apply to WAL, memstore, and to return to the Client; these sets are not
7255    * always the same, depending on whether we write to the WAL and whether the amount to increment
7256    * is zero (in that case we write back nothing, just return the latest Cell value to the client).
7257    *
7258    * @param results Fill in here what goes back to the Client if it is non-null (if null, client
7259    *  doesn't want results).
7260    * @param forMemStore Fill in here what to apply to the MemStore (by Store).
7261    * @return A WALEdit to apply to WAL or null if we are to skip the WAL.
7262    */
7263   private WALEdit reckonDeltas(final Operation op, final Mutation mutation,
7264       final Durability effectiveDurability, final Map<Store, List<Cell>> forMemStore,
7265       final List<Cell> results)
7266   throws IOException {
7267     WALEdit walEdit = null;
7268     long now = EnvironmentEdgeManager.currentTime();
7269     final boolean writeToWAL = effectiveDurability != Durability.SKIP_WAL;
7270     // Process a Store/family at a time.
7271     for (Map.Entry<byte [], List<Cell>> entry: mutation.getFamilyCellMap().entrySet()) {
7272       final byte [] columnFamilyName = entry.getKey();
7273       List<Cell> deltas = entry.getValue();
7274       Store store = this.stores.get(columnFamilyName);
7275       // Reckon for the Store what to apply to WAL and MemStore.
7276       List<Cell> toApply =
7277         reckonDeltasByStore(store, op, mutation, effectiveDurability, now, deltas, results);
7278       if (!toApply.isEmpty()) {
7279         forMemStore.put(store, toApply);
7280         if (writeToWAL) {
7281           if (walEdit == null) {
7282             walEdit = new WALEdit();
7283           }
7284           walEdit.getCells().addAll(toApply);
7285         }
7286       }
7287     }
7288     return walEdit;
7289   }
7290
7291   /**
7292    * Reckon the Cells to apply to WAL, memstore, and to return to the Client in passed
7293    * column family/Store.
7294    *
7295    * Does Get of current value and then adds passed in deltas for this Store returning the result.
7296    *
7297    * @param op Whether Increment or Append
7298    * @param mutation The encompassing Mutation object
7299    * @param deltas Changes to apply to this Store; either increment amount or data to append
7300    * @param results In here we accumulate all the Cells we are to return to the client; this List
7301    *  can be larger than what we return in the case where a delta is zero; i.e. we don't write
7302    *  out new values, just return the current value. If null, client doesn't want results returned.
7303    * @return Resulting Cells after <code>deltas</code> have been applied to current
7304    *  values. Side effect is our filling out of the <code>results</code> List.
7305    */
7306   private List<Cell> reckonDeltasByStore(final Store store, final Operation op,
7307       final Mutation mutation, final Durability effectiveDurability, final long now,
7308       final List<Cell> deltas, final List<Cell> results)
7309   throws IOException {
7310     byte [] columnFamily = store.getFamily().getName();
7311     List<Cell> toApply = new ArrayList<Cell>(deltas.size());
7312     // Get previous values for all columns in this family.
7313     List<Cell> currentValues = get(mutation, store, deltas,
7314         null/*Default IsolationLevel*/,
7315         op == Operation.INCREMENT? ((Increment)mutation).getTimeRange(): null);
7316     // Iterate the input columns and update existing values if they were found, otherwise
7317     // add new column initialized to the delta amount
7318     int currentValuesIndex = 0;
7319     for (int i = 0; i < deltas.size(); i++) {
7320       Cell delta = deltas.get(i);
7321       Cell currentValue = null;
7322       if (currentValuesIndex < currentValues.size() &&
7323           CellUtil.matchingQualifier(currentValues.get(currentValuesIndex), delta)) {
7324         currentValue = currentValues.get(currentValuesIndex);
7325         if (i < (deltas.size() - 1) && !CellUtil.matchingQualifier(delta, deltas.get(i + 1))) {
7326           currentValuesIndex++;
7327         }
7328       }
7329       // Switch on whether this is an increment or an append, building the new Cell to apply.
7330       Cell newCell = null;
7331       MutationType mutationType = null;
7332       boolean apply = true;
7333       switch (op) {
7334         case INCREMENT:
7335           mutationType = MutationType.INCREMENT;
7336           // If delta amount to apply is 0, don't write WAL or MemStore.
7337           long deltaAmount = getLongValue(delta);
7338           apply = deltaAmount != 0;
7339           newCell = reckonIncrement(delta, deltaAmount, currentValue, columnFamily, now,
7340             (Increment)mutation);
7341           break;
7342         case APPEND:
7343           mutationType = MutationType.APPEND;
7344           // Always apply Append. TODO: Does empty delta value mean reset Cell? It seems to.
7345           newCell = reckonAppend(delta, currentValue, now, (Append)mutation);
7346           break;
7347         default: throw new UnsupportedOperationException(op.toString());
7348       }
7349
7350       // Give coprocessors a chance to update the new cell
7351       if (coprocessorHost != null) {
7352         newCell =
7353             coprocessorHost.postMutationBeforeWAL(mutationType, mutation, currentValue, newCell);
7354       }
7355       // If apply, we need to update memstore/WAL with new value; add it toApply.
7356       if (apply) {
7357         toApply.add(newCell);
7358       }
7359       // Add to results to get returned to the Client. If null, client does not want results.
7360       if (results != null) {
7361         results.add(newCell);
7362       }
7363     }
7364     return toApply;
7365   }
7366
7367   /**
7368    * Calculate new Increment Cell.
7369    * @return New Increment Cell with delta applied to currentValue if currentValue is not null;
7370    *  otherwise, a new Cell with the delta set as its value.
7371    */
7372   private Cell reckonIncrement(final Cell delta, final long deltaAmount, final Cell currentValue,
7373       byte [] columnFamily, final long now, Mutation mutation)
7374   throws IOException {
7375     // Forward any tags found on the delta.
7376     List<Tag> tags = TagUtil.carryForwardTags(delta);
7377     long newValue = deltaAmount;
7378     long ts = now;
7379     if (currentValue != null) {
7380       tags = TagUtil.carryForwardTags(tags, currentValue);
7381       ts = Math.max(now, currentValue.getTimestamp());
7382       newValue += getLongValue(currentValue);
7383     }
7384     // Now make up the new Cell. TODO: FIX. This is intimate knowledge of how KeyValues are made...
7385     // doesn't work well with offheaping or if we are doing a different Cell type.
7386     byte [] incrementAmountInBytes = Bytes.toBytes(newValue);
7387     tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL());
7388     byte [] row = mutation.getRow();
7389     return new KeyValue(row, 0, row.length,
7390       columnFamily, 0, columnFamily.length,
7391       delta.getQualifierArray(), delta.getQualifierOffset(), delta.getQualifierLength(),
7392       ts, KeyValue.Type.Put,
7393       incrementAmountInBytes, 0, incrementAmountInBytes.length,
7394       tags);
7395   }
7396
7397   private Cell reckonAppend(final Cell delta, final Cell currentValue, final long now,
7398       Append mutation)
7399   throws IOException {
7400     // Forward any tags found on the delta.
7401     List<Tag> tags = TagUtil.carryForwardTags(delta);
7402     long ts = now;
7403     Cell newCell = null;
7404     byte [] row = mutation.getRow();
7405     if (currentValue != null) {
7406       tags = TagUtil.carryForwardTags(tags, currentValue);
7407       ts = Math.max(now, currentValue.getTimestamp());
7408       tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL());
7409       byte[] tagBytes = TagUtil.fromList(tags);
7410       // Allocate an empty cell and copy in all parts.
7411       // TODO: This is intimate knowledge of how a KeyValue is made. Undo!!! Prevents our doing
7412       // other Cell types. Copying on-heap too if an off-heap Cell.
7413       newCell = new KeyValue(row.length, delta.getFamilyLength(),
7414         delta.getQualifierLength(), ts, KeyValue.Type.Put,
7415         delta.getValueLength() + currentValue.getValueLength(),
7416         tagBytes == null? 0: tagBytes.length);
7417       // Copy in row, family, and qualifier
7418       System.arraycopy(row, 0, newCell.getRowArray(), newCell.getRowOffset(), row.length);
7419       System.arraycopy(delta.getFamilyArray(), delta.getFamilyOffset(),
7420           newCell.getFamilyArray(), newCell.getFamilyOffset(), delta.getFamilyLength());
7421       System.arraycopy(delta.getQualifierArray(), delta.getQualifierOffset(),
7422           newCell.getQualifierArray(), newCell.getQualifierOffset(), delta.getQualifierLength());
7423       // Copy in the value
7424       CellUtil.copyValueTo(currentValue, newCell.getValueArray(), newCell.getValueOffset());
7425       System.arraycopy(delta.getValueArray(), delta.getValueOffset(),
7426           newCell.getValueArray(), newCell.getValueOffset() + currentValue.getValueLength(),
7427           delta.getValueLength());
7428       // Copy in tag data
7429       if (tagBytes != null) {
7430         System.arraycopy(tagBytes, 0,
7431             newCell.getTagsArray(), newCell.getTagsOffset(), tagBytes.length);
7432       }
7433     } else {
7434       // Append's KeyValue.Type==Put and ts==HConstants.LATEST_TIMESTAMP
7435       CellUtil.updateLatestStamp(delta, now);
7436       newCell = delta;
7437       tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL());
7438       if (tags != null) {
7439         newCell = new TagRewriteCell(delta, TagUtil.fromList(tags));
7440       }
7441     }
7442     return newCell;
7443   }
7444
7445   /**
7446    * @return Get the long out of the passed in Cell
7447    */
7448   private static long getLongValue(final Cell cell) throws DoNotRetryIOException {
7449     int len = cell.getValueLength();
7450     if (len != Bytes.SIZEOF_LONG) {
7451       // throw DoNotRetryIOException instead of IllegalArgumentException
7452       throw new DoNotRetryIOException("Field is not a long, it's " + len + " bytes wide");
7453     }
7454     return Bytes.toLong(cell.getValueArray(), cell.getValueOffset(), len);
7455   }
7456
7457   /**
7458    * Do a specific Get on the passed column family and column qualifiers.
7459    * @param mutation Mutation we are doing this Get for.
7460    * @param store Which column family on the row (TODO: do all Gets in one go)
7461    * @param coordinates Cells from <code>mutation</code> used as coordinates applied to Get.
7462    * @return Return list of Cells found.
7463    */
7464   private List<Cell> get(final Mutation mutation, final Store store,
7465           final List<Cell> coordinates, final IsolationLevel isolation, final TimeRange tr)
7466   throws IOException {
7467     // Sort the cells so that they match the order that they appear in the Get results. Otherwise,
7468     // we won't be able to find the existing values if the cells are not specified in order by the
7469     // client since cells are in an array list.
7470     // TODO: I don't get why we are sorting. St.Ack 20150107
7471     sort(coordinates, store.getComparator());
7472     Get get = new Get(mutation.getRow());
7473     if (isolation != null) {
7474       get.setIsolationLevel(isolation);
7475     }
7476     for (Cell cell: coordinates) {
7477       get.addColumn(store.getFamily().getName(), CellUtil.cloneQualifier(cell));
7478     }
7479     // Increments carry time range. If an Increment instance, put it on the Get.
7480     if (tr != null) {
7481       get.setTimeRange(tr.getMin(), tr.getMax());
7482     }
7483     return get(get, false);
7484   }
7485
7486   /**
7487    * @return Sorted list of <code>cells</code> using <code>comparator</code>
7488    */
7489   private static List<Cell> sort(List<Cell> cells, final Comparator<Cell> comparator) {
7490     Collections.sort(cells, comparator);
7491     return cells;
7492   }
7493
7494   //
7495   // New HBASE-880 Helpers
7496   //
7497
7498   void checkFamily(final byte [] family)
7499   throws NoSuchColumnFamilyException {
7500     if (!this.htableDescriptor.hasFamily(family)) {
7501       throw new NoSuchColumnFamilyException("Column family " +
7502           Bytes.toString(family) + " does not exist in region " + this
7503           + " in table " + this.htableDescriptor);
7504     }
7505   }
7506
7507   public static final long FIXED_OVERHEAD = ClassSize.align(
7508       ClassSize.OBJECT +
7509       ClassSize.ARRAY +
7510       47 * ClassSize.REFERENCE + 2 * Bytes.SIZEOF_INT +
7511       (14 * Bytes.SIZEOF_LONG) +
7512       5 * Bytes.SIZEOF_BOOLEAN);
7513
7514   // woefully out of date - currently missing:
7515   // 1 x HashMap - coprocessorServiceHandlers
7516   // 6 x Counter - numMutationsWithoutWAL, dataInMemoryWithoutWAL,
7517   //   checkAndMutateChecksPassed, checkAndMutateChecksFailed, readRequestsCount,
7518   //   writeRequestsCount
7519   // 1 x HRegion$WriteState - writestate
7520   // 1 x RegionCoprocessorHost - coprocessorHost
7521   // 1 x RegionSplitPolicy - splitPolicy
7522   // 1 x MetricsRegion - metricsRegion
7523   // 1 x MetricsRegionWrapperImpl - metricsRegionWrapper
7524   public static final long DEEP_OVERHEAD = FIXED_OVERHEAD +
7525       ClassSize.OBJECT + // closeLock
7526       (2 * ClassSize.ATOMIC_BOOLEAN) + // closed, closing
7527       (3 * ClassSize.ATOMIC_LONG) + // memStoreSize, numPutsWithoutWAL, dataInMemoryWithoutWAL
7528       (2 * ClassSize.CONCURRENT_HASHMAP) +  // lockedRows, scannerReadPoints
7529       WriteState.HEAP_SIZE + // writestate
7530       ClassSize.CONCURRENT_SKIPLISTMAP + ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + // stores
7531       (2 * ClassSize.REENTRANT_LOCK) + // lock, updatesLock
7532       MultiVersionConcurrencyControl.FIXED_SIZE // mvcc
7533       + 2 * ClassSize.TREEMAP // maxSeqIdInStores, replicationScopes
7534       + 2 * ClassSize.ATOMIC_INTEGER // majorInProgress, minorInProgress
7535       + ClassSize.STORE_SERVICES // store services
7536       ;
7537
7538   @Override
7539   public long heapSize() {
7540     long heapSize = DEEP_OVERHEAD;
7541     for (Store store : this.stores.values()) {
7542       heapSize += store.heapSize();
7543     }
7544     // this does not take into account row locks, recent flushes, mvcc entries, and more
7545     return heapSize;
7546   }
7547
7548   @Override
7549   public boolean registerService(Service instance) {
7550     /*
7551      * No stacking of instances is allowed for a single service name
7552      */
7553     Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType();
7554     String serviceName = CoprocessorRpcUtils.getServiceName(serviceDesc);
7555     if (coprocessorServiceHandlers.containsKey(serviceName)) {
7556       LOG.error("Coprocessor service " + serviceName +
7557               " already registered, rejecting request from " + instance
7558       );
7559       return false;
7560     }
7561
7562     coprocessorServiceHandlers.put(serviceName, instance);
7563     if (LOG.isDebugEnabled()) {
7564       LOG.debug("Registered coprocessor service: region=" +
7565           Bytes.toStringBinary(getRegionInfo().getRegionName()) +
7566           " service=" + serviceName);
7567     }
7568     return true;
7569   }
7570
7571   @Override
7572   public Message execService(RpcController controller, CoprocessorServiceCall call)
7573       throws IOException {
7574     String serviceName = call.getServiceName();
7575     String methodName = call.getMethodName();
7576     if (!coprocessorServiceHandlers.containsKey(serviceName)) {
7577       throw new UnknownProtocolException(null,
7578           "No registered coprocessor service found for name "+serviceName+
7579           " in region "+Bytes.toStringBinary(getRegionInfo().getRegionName()));
7580     }
7581
7582     Service service = coprocessorServiceHandlers.get(serviceName);
7583     Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType();
7584     Descriptors.MethodDescriptor methodDesc = serviceDesc.findMethodByName(methodName);
7585     if (methodDesc == null) {
7586       throw new UnknownProtocolException(service.getClass(),
7587           "Unknown method "+methodName+" called on service "+serviceName+
7588               " in region "+Bytes.toStringBinary(getRegionInfo().getRegionName()));
7589     }
7590
7591     Message.Builder builder = service.getRequestPrototype(methodDesc).newBuilderForType();
7592     ProtobufUtil.mergeFrom(builder, call.getRequest());
7593     Message request = builder.build();
7594
7595     if (coprocessorHost != null) {
7596       request = coprocessorHost.preEndpointInvocation(service, methodName, request);
7597     }
7598
7599     final Message.Builder responseBuilder =
7600         service.getResponsePrototype(methodDesc).newBuilderForType();
7601     service.callMethod(methodDesc, controller, request, new RpcCallback<Message>() {
7602       @Override
7603       public void run(Message message) {
7604         if (message != null) {
7605           responseBuilder.mergeFrom(message);
7606         }
7607       }
7608     });
7609
7610     if (coprocessorHost != null) {
7611       coprocessorHost.postEndpointInvocation(service, methodName, request, responseBuilder);
7612     }
7613
7614     IOException exception = ResponseConverter.getControllerException(controller);
7615     if (exception != null) {
7616       throw exception;
7617     }
7618
7619     return responseBuilder.build();
7620   }
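
  // --- Editor's hedged illustration (not part of the original HRegion source) ---
  // execService() dispatches a packaged CoprocessorServiceCall to the Service registered
  // under the call's service name, running the coprocessor pre/post endpoint hooks around
  // the invocation. The sketch below shows how such a call could be assembled; the method
  // name and the "ExampleService"/"exampleMethod" identifiers are hypothetical.
  Message exampleInvokeEndpoint(RpcController controller, Message request) throws IOException {
    CoprocessorServiceCall call = CoprocessorServiceCall.newBuilder()
        .setRow(ByteString.copyFrom(getRegionInfo().getStartKey()))  // any row in this region
        .setServiceName("ExampleService")   // must match a service passed to registerService()
        .setMethodName("exampleMethod")     // must name a method of that service's descriptor
        .setRequest(request.toByteString()) // serialized request message
        .build();
    return execService(controller, call);
  }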
7621
7622   boolean shouldForceSplit() {
7623     return this.splitRequest;
7624   }
7625
7626   byte[] getExplicitSplitPoint() {
7627     return this.explicitSplitPoint;
7628   }
7629
7630   void forceSplit(byte[] sp) {
7631     // This HRegion will go away after the forced split is successful,
7632     // but if a forced split fails, we need to clear the forced-split request.
7633     this.splitRequest = true;
7634     if (sp != null) {
7635       this.explicitSplitPoint = sp;
7636     }
7637   }
7638
7639   void clearSplit() {
7640     this.splitRequest = false;
7641     this.explicitSplitPoint = null;
7642   }
7643
7644   /**
7645    * Give the region a chance to prepare before it is split.
7646    */
7647   protected void prepareToSplit() {
7648     // nothing
7649   }
7650
7651   /**
7652    * Return the split point. A null return value indicates that the region is not splittable.
7653    * If the split point isn't explicitly specified, this method goes over the stores
7654    * to find the best split point. Currently the criterion for the best split point
7655    * is based on the size of the store.
7656    */
7657   public byte[] checkSplit() {
7658     // Can't split META
7659     if (this.getRegionInfo().isMetaTable() ||
7660         TableName.NAMESPACE_TABLE_NAME.equals(this.getRegionInfo().getTable())) {
7661       if (shouldForceSplit()) {
7662         LOG.warn("Cannot split meta region in HBase 0.20 and above");
7663       }
7664       return null;
7665     }
7666
7667     // Can't split region which is in recovering state
7668     if (this.isRecovering()) {
7669       LOG.info("Cannot split region " + this.getRegionInfo().getEncodedName() + " in recovery.");
7670       return null;
7671     }
7672
7673     if (!splitPolicy.shouldSplit()) {
7674       return null;
7675     }
7676
7677     byte[] ret = splitPolicy.getSplitPoint();
7678
7679     if (ret != null) {
7680       try {
7681         checkRow(ret, "calculated split");
7682       } catch (IOException e) {
7683         LOG.error("Ignoring invalid split", e);
7684         return null;
7685       }
7686     }
7687     return ret;
7688   }
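
  // --- Editor's hedged illustration (not part of the original HRegion source) ---
  // The force-split flow, as a caller might drive it: forceSplit() records the request
  // (with an optional explicit split point), checkSplit() asks the split policy for the
  // actual split row and validates it, and clearSplit() withdraws the request when the
  // split cannot proceed. The method name is hypothetical.
  byte[] exampleForcedSplit(byte[] requestedSplitPoint) {
    forceSplit(requestedSplitPoint);     // null means "let the split policy pick the row"
    byte[] splitPoint = checkSplit();    // null means the region cannot be split right now
    if (splitPoint == null) {
      clearSplit();                      // withdraw the forced-split request
    }
    return splitPoint;
  }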
7689
7690   /**
7691    * @return The priority that this region should have in the compaction queue
7692    */
7693   public int getCompactPriority() {
7694     <