1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver;
20  
21  import java.io.EOFException;
22  import java.io.FileNotFoundException;
23  import java.io.IOException;
24  import java.io.InterruptedIOException;
25  import java.io.UnsupportedEncodingException;
26  import java.lang.reflect.Constructor;
27  import java.text.ParseException;
28  import java.util.AbstractList;
29  import java.util.ArrayList;
30  import java.util.Arrays;
31  import java.util.Collection;
32  import java.util.Collections;
33  import java.util.HashMap;
34  import java.util.HashSet;
35  import java.util.Iterator;
36  import java.util.List;
37  import java.util.Map;
38  import java.util.Map.Entry;
39  import java.util.NavigableMap;
40  import java.util.NavigableSet;
41  import java.util.RandomAccess;
42  import java.util.Set;
43  import java.util.TreeMap;
44  import java.util.concurrent.Callable;
45  import java.util.concurrent.CompletionService;
46  import java.util.concurrent.ConcurrentHashMap;
47  import java.util.concurrent.ConcurrentMap;
48  import java.util.concurrent.ConcurrentSkipListMap;
49  import java.util.concurrent.CountDownLatch;
50  import java.util.concurrent.ExecutionException;
51  import java.util.concurrent.ExecutorCompletionService;
52  import java.util.concurrent.ExecutorService;
53  import java.util.concurrent.Executors;
54  import java.util.concurrent.Future;
55  import java.util.concurrent.FutureTask;
56  import java.util.concurrent.ThreadFactory;
57  import java.util.concurrent.ThreadPoolExecutor;
58  import java.util.concurrent.TimeUnit;
59  import java.util.concurrent.TimeoutException;
60  import java.util.concurrent.atomic.AtomicBoolean;
61  import java.util.concurrent.atomic.AtomicInteger;
62  import java.util.concurrent.atomic.AtomicLong;
63  import java.util.concurrent.locks.Lock;
64  import java.util.concurrent.locks.ReentrantReadWriteLock;
65  
66  import org.apache.commons.logging.Log;
67  import org.apache.commons.logging.LogFactory;
68  import org.apache.hadoop.conf.Configuration;
69  import org.apache.hadoop.fs.FileStatus;
70  import org.apache.hadoop.fs.FileSystem;
71  import org.apache.hadoop.fs.Path;
72  import org.apache.hadoop.hbase.Cell;
73  import org.apache.hadoop.hbase.CellScanner;
74  import org.apache.hadoop.hbase.CellUtil;
75  import org.apache.hadoop.hbase.CompoundConfiguration;
76  import org.apache.hadoop.hbase.DoNotRetryIOException;
77  import org.apache.hadoop.hbase.DroppedSnapshotException;
78  import org.apache.hadoop.hbase.HBaseConfiguration;
79  import org.apache.hadoop.hbase.HColumnDescriptor;
80  import org.apache.hadoop.hbase.HConstants;
81  import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
82  import org.apache.hadoop.hbase.HDFSBlocksDistribution;
83  import org.apache.hadoop.hbase.HRegionInfo;
84  import org.apache.hadoop.hbase.HTableDescriptor;
85  import org.apache.hadoop.hbase.KeyValue;
86  import org.apache.hadoop.hbase.KeyValue.KVComparator;
87  import org.apache.hadoop.hbase.KeyValueUtil;
88  import org.apache.hadoop.hbase.NamespaceDescriptor;
89  import org.apache.hadoop.hbase.NotServingRegionException;
90  import org.apache.hadoop.hbase.RegionTooBusyException;
91  import org.apache.hadoop.hbase.TableName;
92  import org.apache.hadoop.hbase.Tag;
93  import org.apache.hadoop.hbase.TagType;
94  import org.apache.hadoop.hbase.UnknownScannerException;
95  import org.apache.hadoop.hbase.backup.HFileArchiver;
96  import org.apache.hadoop.hbase.classification.InterfaceAudience;
97  import org.apache.hadoop.hbase.client.Append;
98  import org.apache.hadoop.hbase.client.Delete;
99  import org.apache.hadoop.hbase.client.Durability;
100 import org.apache.hadoop.hbase.client.Get;
101 import org.apache.hadoop.hbase.client.Increment;
102 import org.apache.hadoop.hbase.client.IsolationLevel;
103 import org.apache.hadoop.hbase.client.Mutation;
104 import org.apache.hadoop.hbase.client.Put;
105 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
106 import org.apache.hadoop.hbase.client.Result;
107 import org.apache.hadoop.hbase.client.RowMutations;
108 import org.apache.hadoop.hbase.client.Scan;
109 import org.apache.hadoop.hbase.conf.ConfigurationManager;
110 import org.apache.hadoop.hbase.conf.PropagatingConfigurationObserver;
111 import org.apache.hadoop.hbase.coprocessor.RegionObserver;
112 import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
113 import org.apache.hadoop.hbase.exceptions.FailedSanityCheckException;
114 import org.apache.hadoop.hbase.exceptions.RegionInRecoveryException;
115 import org.apache.hadoop.hbase.exceptions.UnknownProtocolException;
116 import org.apache.hadoop.hbase.filter.ByteArrayComparable;
117 import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
118 import org.apache.hadoop.hbase.filter.FilterWrapper;
119 import org.apache.hadoop.hbase.filter.IncompatibleFilterException;
120 import org.apache.hadoop.hbase.io.HeapSize;
121 import org.apache.hadoop.hbase.io.TimeRange;
122 import org.apache.hadoop.hbase.io.hfile.BlockCache;
123 import org.apache.hadoop.hbase.io.hfile.CacheConfig;
124 import org.apache.hadoop.hbase.io.hfile.HFile;
125 import org.apache.hadoop.hbase.ipc.CallerDisconnectedException;
126 import org.apache.hadoop.hbase.ipc.RpcCallContext;
127 import org.apache.hadoop.hbase.ipc.RpcServer;
128 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
129 import org.apache.hadoop.hbase.monitoring.TaskMonitor;
130 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
131 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.GetRegionInfoResponse.CompactionState;
132 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
133 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall;
134 import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.RegionLoad;
135 import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.StoreSequenceId;
136 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
137 import org.apache.hadoop.hbase.protobuf.generated.WALProtos;
138 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.CompactionDescriptor;
139 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FlushDescriptor;
140 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FlushDescriptor.FlushAction;
141 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FlushDescriptor.StoreFlushDescriptor;
142 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.RegionEventDescriptor;
143 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.RegionEventDescriptor.EventType;
144 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.StoreDescriptor;
145 import org.apache.hadoop.hbase.regionserver.MultiVersionConsistencyControl.WriteEntry;
146 import org.apache.hadoop.hbase.regionserver.ScannerContext.LimitScope;
147 import org.apache.hadoop.hbase.regionserver.ScannerContext.NextState;
148 import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext;
149 import org.apache.hadoop.hbase.regionserver.compactions.CompactionThroughputController;
150 import org.apache.hadoop.hbase.regionserver.compactions.CompactionThroughputControllerFactory;
151 import org.apache.hadoop.hbase.regionserver.compactions.NoLimitCompactionThroughputController;
152 import org.apache.hadoop.hbase.regionserver.wal.HLogKey;
153 import org.apache.hadoop.hbase.regionserver.wal.ReplayHLogKey;
154 import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
155 import org.apache.hadoop.hbase.regionserver.wal.WALUtil;
156 import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
157 import org.apache.hadoop.hbase.snapshot.SnapshotManifest;
158 import org.apache.hadoop.hbase.util.ByteStringer;
159 import org.apache.hadoop.hbase.util.Bytes;
160 import org.apache.hadoop.hbase.util.CancelableProgressable;
161 import org.apache.hadoop.hbase.util.ClassSize;
162 import org.apache.hadoop.hbase.util.CompressionTest;
163 import org.apache.hadoop.hbase.util.Counter;
164 import org.apache.hadoop.hbase.util.EncryptionTest;
165 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
166 import org.apache.hadoop.hbase.util.FSTableDescriptors;
167 import org.apache.hadoop.hbase.util.FSUtils;
168 import org.apache.hadoop.hbase.util.HashedBytes;
169 import org.apache.hadoop.hbase.util.Pair;
170 import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil;
171 import org.apache.hadoop.hbase.util.Threads;
172 import org.apache.hadoop.hbase.wal.WAL;
173 import org.apache.hadoop.hbase.wal.WALFactory;
174 import org.apache.hadoop.hbase.wal.WALKey;
175 import org.apache.hadoop.hbase.wal.WALSplitter;
176 import org.apache.hadoop.hbase.wal.WALSplitter.MutationReplay;
177 import org.apache.hadoop.io.MultipleIOException;
178 import org.apache.hadoop.util.StringUtils;
179 import org.apache.htrace.Trace;
180 import org.apache.htrace.TraceScope;
181 
182 import com.google.common.annotations.VisibleForTesting;
183 import com.google.common.base.Optional;
184 import com.google.common.base.Preconditions;
185 import com.google.common.collect.Lists;
186 import com.google.common.collect.Maps;
187 import com.google.common.io.Closeables;
188 import com.google.protobuf.ByteString;
189 import com.google.protobuf.Descriptors;
190 import com.google.protobuf.Message;
191 import com.google.protobuf.RpcCallback;
192 import com.google.protobuf.RpcController;
193 import com.google.protobuf.Service;
194 import com.google.protobuf.TextFormat;
195 
196 @InterfaceAudience.Private
197 public class HRegion implements HeapSize, PropagatingConfigurationObserver, Region {
198   public static final Log LOG = LogFactory.getLog(HRegion.class);
199 
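      // Config key controlling whether column families are loaded on demand for scan
      // requests to this region (see isLoadingCfsOnDemandDefault); requests can override it.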
200   public static final String LOAD_CFS_ON_DEMAND_CONFIG_KEY =
201       "hbase.hregion.scan.loadColumnFamiliesOnDemand";
202 
203   /**
204    * This is the global default value for durability. All tables/mutations not
205    * defining a durability or using USE_DEFAULT will default to this value.
206    */
207   private static final Durability DEFAULT_DURABILITY = Durability.SYNC_WAL;
208 
209   final AtomicBoolean closed = new AtomicBoolean(false);
210   /* Closing can take some time; use the closing flag if there is stuff we don't
211    * want to do while in closing state; e.g. offering this region up to the
212    * master as a region to close if the carrying regionserver is overloaded.
213    * Once set, it is never cleared.
214    */
215   final AtomicBoolean closing = new AtomicBoolean(false);
216 
217   /**
218    * The max sequence id of flushed data on this region. Used when doing rough calculations on
219    * whether it is time to flush or not.
220    */
221   private volatile long maxFlushedSeqId = HConstants.NO_SEQNUM;
222 
223   /**
224    * Record the sequence id of last flush operation.
225    */
226   private volatile long lastFlushOpSeqId = HConstants.NO_SEQNUM;
227   /**
228    * Region scoped edit sequence Id. Edits to this region are GUARANTEED to appear in the WAL
229    * file in this sequence id's order; i.e. edit #2 will be in the WAL after edit #1.
230    * Its default value is -1L. This default is used as a marker to indicate
231    * that the region hasn't opened yet. Once it is opened, it is set to the derived
232    * {@link #openSeqNum}, the largest sequence id of all hfiles opened under this Region.
233    *
234    * <p>Control of this sequence is handed off to the WAL implementation.  It is responsible
235    * for tagging edits with the correct sequence id since it is responsible for getting the
236    * edits into the WAL files. It controls updating the sequence id value.  DO NOT UPDATE IT
237    * OUTSIDE OF THE WAL.  The value you get will not be what you think it is.
238    */
239   private final AtomicLong sequenceId = new AtomicLong(-1L);
240 
241   /**
242    * The sequence id of the last replayed open region event from the primary region. This is used
243    * to skip entries before this due to the possibility of replay edits coming out of order from
244    * replication.
245    */
246   protected volatile long lastReplayedOpenRegionSeqId = -1L;
247   protected volatile long lastReplayedCompactionSeqId = -1L;
248 
249   //////////////////////////////////////////////////////////////////////////////
250   // Members
251   //////////////////////////////////////////////////////////////////////////////
252 
253   // map from a locked row to the context for that lock including:
254   // - CountDownLatch for threads waiting on that row
255   // - the thread that owns the lock (allow reentrancy)
256   // - reference count of (reentrant) locks held by the thread
257   // - the row itself
258   private final ConcurrentHashMap<HashedBytes, RowLockContext> lockedRows =
259       new ConcurrentHashMap<HashedBytes, RowLockContext>();
260 
261   protected final Map<byte[], Store> stores = new ConcurrentSkipListMap<byte[], Store>(
262       Bytes.BYTES_RAWCOMPARATOR);
263 
264   // TODO: account for each registered handler in HeapSize computation
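      // Coprocessor Endpoint services registered on this region, keyed by service name.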
265   private Map<String, Service> coprocessorServiceHandlers = Maps.newHashMap();
266 
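      // Running total (in bytes) of the sizes of this region's memstores; updated mainly
      // through addAndGetGlobalMemstoreSize().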
267   public final AtomicLong memstoreSize = new AtomicLong(0);
268 
269   // Debug possible data loss due to WAL off
270   final Counter numMutationsWithoutWAL = new Counter();
271   final Counter dataInMemoryWithoutWAL = new Counter();
272 
273   // Debug why CAS operations are taking a while.
274   final Counter checkAndMutateChecksPassed = new Counter();
275   final Counter checkAndMutateChecksFailed = new Counter();
276 
277   // Number of requests
278   final Counter readRequestsCount = new Counter();
279   final Counter writeRequestsCount = new Counter();
280 
281   // Number of requests blocked by memstore size.
282   private final Counter blockedRequestsCount = new Counter();
283 
284   // Compaction counters
285   final AtomicLong compactionsFinished = new AtomicLong(0L);
286   final AtomicLong compactionNumFilesCompacted = new AtomicLong(0L);
287   final AtomicLong compactionNumBytesCompacted = new AtomicLong(0L);
288 
289   private final WAL wal;
290   private final HRegionFileSystem fs;
291   protected final Configuration conf;
292   private final Configuration baseConf;
293   private final int rowLockWaitDuration;
294   static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000;
295 
296   // The internal wait duration to acquire a lock before read/update
297   // from the region. It is not per row. The purpose of this wait time
298   // is to avoid waiting a long time while the region is busy, so that
299   // we can release the IPC handler soon enough to improve the
300   // availability of the region server. It can be adjusted by
301   // tuning configuration "hbase.busy.wait.duration".
302   final long busyWaitDuration;
303   static final long DEFAULT_BUSY_WAIT_DURATION = HConstants.DEFAULT_HBASE_RPC_TIMEOUT;
304 
305   // If updating multiple rows in one call, wait longer,
306   // i.e. waiting for busyWaitDuration * # of rows. However,
307   // we can limit the max multiplier.
308   final int maxBusyWaitMultiplier;
309 
310   // Max busy wait duration. There is no point to wait longer than the RPC
311   // purge timeout, when a RPC call will be terminated by the RPC engine.
312   final long maxBusyWaitDuration;
313 
314   // negative number indicates infinite timeout
315   static final long DEFAULT_ROW_PROCESSOR_TIMEOUT = 60 * 1000L;
316   final ExecutorService rowProcessorExecutor = Executors.newCachedThreadPool();
317 
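      // Map of outstanding RegionScanners to the mvcc read point each one is using;
      // consulted by getSmallestReadPoint() when computing the smallest read point.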
318   private final ConcurrentHashMap<RegionScanner, Long> scannerReadPoints;
319 
320   /**
321    * The sequence ID that was encountered when this region was opened.
322    */
323   private long openSeqNum = HConstants.NO_SEQNUM;
324 
325   /**
326    * The default setting for whether to enable on-demand CF loading for
327    * scan requests to this region. Requests can override it.
328    */
329   private boolean isLoadingCfsOnDemandDefault = false;
330 
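      // Number of major/minor compactions currently running on this region.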
331   private final AtomicInteger majorInProgress = new AtomicInteger(0);
332   private final AtomicInteger minorInProgress = new AtomicInteger(0);
333 
334   //
335   // Context: During replay we want to ensure that we do not lose any data. So, we
336   // have to be conservative in how we replay wals. For each store, we calculate
337   // the maxSeqId up to which the store was flushed. And, skip the edits which
338   // are equal to or lower than maxSeqId for each store.
339   // The following map is populated when opening the region
340   Map<byte[], Long> maxSeqIdInStores = new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
341 
342   /** Saved state from replaying prepare flush cache */
343   private PrepareFlushResult prepareFlushResult = null;
344 
345   /**
346    * Config setting for whether to allow writes when a region is in recovering or not.
347    */
348   private boolean disallowWritesInRecovering = false;
349 
350   // when a region is in recovering state, it can only accept writes, not reads
351   private volatile boolean recovering = false;
352 
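      // ConfigurationManager this region registers with as a PropagatingConfigurationObserver
      // (for online configuration changes); absent until registration.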
353   private volatile Optional<ConfigurationManager> configurationManager;
354 
355   /**
356    * @return The smallest mvcc readPoint across all the scanners in this
357    * region. Writes older than this readPoint are included in every
358    * read operation.
359    */
360   public long getSmallestReadPoint() {
361     long minimumReadPoint;
362     // We need to ensure that while we are calculating the smallestReadPoint
363     // no new RegionScanners can grab a readPoint that we are unaware of.
364     // We achieve this by synchronizing on the scannerReadPoints object.
365     synchronized(scannerReadPoints) {
366       minimumReadPoint = mvcc.memstoreReadPoint();
367 
368       for (Long readPoint: this.scannerReadPoints.values()) {
369         if (readPoint < minimumReadPoint) {
370           minimumReadPoint = readPoint;
371         }
372       }
373     }
374     return minimumReadPoint;
375   }
376 
377   /*
378    * Data structure of write state flags used coordinating flushes,
379    * compactions and closes.
380    */
381   static class WriteState {
382     // Set while a memstore flush is happening.
383     volatile boolean flushing = false;
384     // Set when a flush has been requested.
385     volatile boolean flushRequested = false;
386     // Number of compactions running.
387     volatile int compacting = 0;
388     // Cleared in close; once cleared, the region can no longer compact or flush.
389     volatile boolean writesEnabled = true;
390     // Set if region is read-only
391     volatile boolean readOnly = false;
392     // whether the reads are enabled. This is different than readOnly, because readOnly is
393     // static in the lifetime of the region, while readsEnabled is dynamic
394     volatile boolean readsEnabled = true;
395 
396     /**
397      * Set flags that make this region read-only.
398      *
399      * @param onOff flip value for region r/o setting
400      */
401     synchronized void setReadOnly(final boolean onOff) {
402       this.writesEnabled = !onOff;
403       this.readOnly = onOff;
404     }
405 
406     boolean isReadOnly() {
407       return this.readOnly;
408     }
409 
410     boolean isFlushRequested() {
411       return this.flushRequested;
412     }
413 
414     void setReadsEnabled(boolean readsEnabled) {
415       this.readsEnabled = readsEnabled;
416     }
417 
418     static final long HEAP_SIZE = ClassSize.align(
419         ClassSize.OBJECT + 5 * Bytes.SIZEOF_BOOLEAN);
420   }
421 
422   /**
423    * Objects from this class are created when flushing to describe all the different states
424    * the flush can end up in. The Result enum describes those states. The sequence id should only
425    * be specified if the flush was successful, and the failure message should only be specified
426    * if it didn't flush.
427    */
428   public static class FlushResultImpl implements FlushResult {
429     final Result result;
430     final String failureReason;
431     final long flushSequenceId;
432     final boolean wroteFlushWalMarker;
433 
434     /**
435      * Convenience constructor to use when the flush is successful; the failure message is set
436      * to null.
437      * @param result Expecting FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
438      * @param flushSequenceId Generated sequence id that comes right after the edits in the
439      *                        memstores.
440      */
441     FlushResultImpl(Result result, long flushSequenceId) {
442       this(result, flushSequenceId, null, false);
443       assert result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
444           .FLUSHED_COMPACTION_NEEDED;
445     }
446 
447     /**
448      * Convenience constructor to use when we cannot flush.
449      * @param result Expecting CANNOT_FLUSH_MEMSTORE_EMPTY or CANNOT_FLUSH.
450      * @param failureReason Reason why we couldn't flush.
451      */
452     FlushResultImpl(Result result, String failureReason, boolean wroteFlushMarker) {
453       this(result, -1, failureReason, wroteFlushMarker);
454       assert result == Result.CANNOT_FLUSH_MEMSTORE_EMPTY || result == Result.CANNOT_FLUSH;
455     }
456 
457     /**
458      * Constructor with all the parameters.
459      * @param result Any of the Result values.
460      * @param flushSequenceId Generated sequence id if the memstores were flushed else -1.
461      * @param failureReason Reason why we couldn't flush, or null.
462      */
463     FlushResultImpl(Result result, long flushSequenceId, String failureReason,
464       boolean wroteFlushMarker) {
465       this.result = result;
466       this.flushSequenceId = flushSequenceId;
467       this.failureReason = failureReason;
468       this.wroteFlushWalMarker = wroteFlushMarker;
469     }
470 
471     /**
472      * Convenience method, the equivalent of checking if result is
473      * FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
474      * @return true if the memstores were flushed, else false.
475      */
476     public boolean isFlushSucceeded() {
477       return result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
478           .FLUSHED_COMPACTION_NEEDED;
479     }
480 
481     /**
482      * Convenience method, the equivalent of checking if result is FLUSHED_COMPACTION_NEEDED.
483      * @return True if the flush requested a compaction, else false (doesn't even mean it flushed).
484      */
485     public boolean isCompactionNeeded() {
486       return result == Result.FLUSHED_COMPACTION_NEEDED;
487     }
488 
489     @Override
490     public String toString() {
491       return new StringBuilder()
492         .append("flush result:").append(result).append(", ")
493         .append("failureReason:").append(failureReason).append(", ")
494         .append("flush seq id:").append(flushSequenceId).toString();
495     }
496 
497     @Override
498     public Result getResult() {
499       return result;
500     }
501   }
502 
503   /** A result object from prepare flush cache stage */
504   @VisibleForTesting
505   static class PrepareFlushResult {
506     final FlushResult result; // indicating a failure result from prepare
507     final TreeMap<byte[], StoreFlushContext> storeFlushCtxs;
508     final TreeMap<byte[], List<Path>> committedFiles;
509     final long startTime;
510     final long flushOpSeqId;
511     final long flushedSeqId;
512     final long totalFlushableSize;
513 
514     /** Constructs an early exit case */
515     PrepareFlushResult(FlushResult result, long flushSeqId) {
516       this(result, null, null, Math.max(0, flushSeqId), 0, 0, 0);
517     }
518 
519     /** Constructs a successful prepare flush result */
520     PrepareFlushResult(
521       TreeMap<byte[], StoreFlushContext> storeFlushCtxs,
522       TreeMap<byte[], List<Path>> committedFiles, long startTime, long flushSeqId,
523       long flushedSeqId, long totalFlushableSize) {
524       this(null, storeFlushCtxs, committedFiles, startTime,
525         flushSeqId, flushedSeqId, totalFlushableSize);
526     }
527 
528     private PrepareFlushResult(
529       FlushResult result,
530       TreeMap<byte[], StoreFlushContext> storeFlushCtxs,
531       TreeMap<byte[], List<Path>> committedFiles, long startTime, long flushSeqId,
532       long flushedSeqId, long totalFlushableSize) {
533       this.result = result;
534       this.storeFlushCtxs = storeFlushCtxs;
535       this.committedFiles = committedFiles;
536       this.startTime = startTime;
537       this.flushOpSeqId = flushSeqId;
538       this.flushedSeqId = flushedSeqId;
539       this.totalFlushableSize = totalFlushableSize;
540     }
541 
542     public FlushResult getResult() {
543       return this.result;
544     }
545   }
546 
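      // Flags used to coordinate flushes, compactions and closes for this region
      // (see the WriteState class above).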
547   final WriteState writestate = new WriteState();
548 
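      // Flush threshold, timestamp slop and row-processor timeout; derived from the table
      // descriptor and configuration in the constructor and in setHTableSpecificConf().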
549   long memstoreFlushSize;
550   final long timestampSlop;
551   final long rowProcessorTimeout;
552 
553   // Last flush time for each Store. Useful when we are flushing for each column family
554   private final ConcurrentMap<Store, Long> lastStoreFlushTimeMap =
555       new ConcurrentHashMap<Store, Long>();
556 
557   final RegionServerServices rsServices;
558   private RegionServerAccounting rsAccounting;
559   private long flushCheckInterval;
560   // flushPerChanges is to prevent too many changes in memstore
561   private long flushPerChanges;
562   private long blockingMemStoreSize;
563   final long threadWakeFrequency;
564   // Used to guard closes
565   final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
566 
567   // Stop updates lock
568   private final ReentrantReadWriteLock updatesLock = new ReentrantReadWriteLock();
569   private boolean splitRequest;
570   private byte[] explicitSplitPoint = null;
571 
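      // Multi-version concurrency control: provides the read points used by scanners and the
      // write entries used when applying updates to this region.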
572   private final MultiVersionConsistencyControl mvcc =
573       new MultiVersionConsistencyControl();
574 
575   // Coprocessor host
576   private RegionCoprocessorHost coprocessorHost;
577 
578   private HTableDescriptor htableDescriptor = null;
579   private RegionSplitPolicy splitPolicy;
580   private FlushPolicy flushPolicy;
581 
582   private final MetricsRegion metricsRegion;
583   private final MetricsRegionWrapperImpl metricsRegionWrapper;
584   private final Durability durability;
585   private final boolean regionStatsEnabled;
586 
587   /**
588    * HRegion constructor. This constructor should only be used for testing and
589    * extensions.  Instances of HRegion should be instantiated with the
590    * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
591    *
592    * @param tableDir qualified path of directory where region should be located,
593    * usually the table directory.
594    * @param wal The WAL is the outbound log for any updates to the HRegion.
595    * The wal file is a logfile from the previous execution that's
596    * custom-computed for this HRegion. The HRegionServer computes and sorts the
597    * appropriate wal info for this HRegion. If there is a previous wal file
598    * (implying that the HRegion has been written-to before), then read it from
599    * the supplied path.
600    * @param fs is the filesystem.
601    * @param confParam is global configuration settings.
602    * @param regionInfo HRegionInfo that describes the region
604    * @param htd the table descriptor
605    * @param rsServices reference to {@link RegionServerServices} or null
606    */
607   @Deprecated
608   public HRegion(final Path tableDir, final WAL wal, final FileSystem fs,
609       final Configuration confParam, final HRegionInfo regionInfo,
610       final HTableDescriptor htd, final RegionServerServices rsServices) {
611     this(new HRegionFileSystem(confParam, fs, tableDir, regionInfo),
612       wal, confParam, htd, rsServices);
613   }
614 
615   /**
616    * HRegion constructor. This constructor should only be used for testing and
617    * extensions.  Instances of HRegion should be instantiated with the
618    * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
619    *
620    * @param fs is the filesystem.
621    * @param wal The WAL is the outbound log for any updates to the HRegion.
622    * The wal file is a logfile from the previous execution that's
623    * custom-computed for this HRegion. The HRegionServer computes and sorts the
624    * appropriate wal info for this HRegion. If there is a previous wal file
625    * (implying that the HRegion has been written-to before), then read it from
626    * the supplied path.
627    * @param confParam is global configuration settings.
628    * @param htd the table descriptor
629    * @param rsServices reference to {@link RegionServerServices} or null
630    */
631   public HRegion(final HRegionFileSystem fs, final WAL wal, final Configuration confParam,
632       final HTableDescriptor htd, final RegionServerServices rsServices) {
633     if (htd == null) {
634       throw new IllegalArgumentException("Need table descriptor");
635     }
636 
637     if (confParam instanceof CompoundConfiguration) {
638       throw new IllegalArgumentException("Need original base configuration");
639     }
640 
641     this.wal = wal;
642     this.fs = fs;
643 
644     // 'conf' renamed to 'confParam' b/c we use this.conf in the constructor
645     this.baseConf = confParam;
646     this.conf = new CompoundConfiguration()
647       .add(confParam)
648       .addStringMap(htd.getConfiguration())
649       .addBytesMap(htd.getValues());
650     this.flushCheckInterval = conf.getInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL,
651         DEFAULT_CACHE_FLUSH_INTERVAL);
652     this.flushPerChanges = conf.getLong(MEMSTORE_FLUSH_PER_CHANGES, DEFAULT_FLUSH_PER_CHANGES);
653     if (this.flushPerChanges > MAX_FLUSH_PER_CHANGES) {
654       throw new IllegalArgumentException(MEMSTORE_FLUSH_PER_CHANGES + " can not exceed "
655           + MAX_FLUSH_PER_CHANGES);
656     }
657     this.rowLockWaitDuration = conf.getInt("hbase.rowlock.wait.duration",
658                     DEFAULT_ROWLOCK_WAIT_DURATION);
659 
660     this.isLoadingCfsOnDemandDefault = conf.getBoolean(LOAD_CFS_ON_DEMAND_CONFIG_KEY, true);
661     this.htableDescriptor = htd;
662     this.rsServices = rsServices;
663     this.threadWakeFrequency = conf.getLong(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000);
664     setHTableSpecificConf();
665     this.scannerReadPoints = new ConcurrentHashMap<RegionScanner, Long>();
666 
667     this.busyWaitDuration = conf.getLong(
668       "hbase.busy.wait.duration", DEFAULT_BUSY_WAIT_DURATION);
669     this.maxBusyWaitMultiplier = conf.getInt("hbase.busy.wait.multiplier.max", 2);
670     if (busyWaitDuration * maxBusyWaitMultiplier <= 0L) {
671       throw new IllegalArgumentException("Invalid hbase.busy.wait.duration ("
672         + busyWaitDuration + ") or hbase.busy.wait.multiplier.max ("
673         + maxBusyWaitMultiplier + "). Their product should be positive");
674     }
675     this.maxBusyWaitDuration = conf.getLong("hbase.ipc.client.call.purge.timeout",
676       2 * HConstants.DEFAULT_HBASE_RPC_TIMEOUT);
677 
678     /*
679      * timestamp.slop provides a server-side constraint on the timestamp. This
680      * assumes that you base your TS around currentTimeMillis(). In this case,
681      * throw an error to the user if the user-specified TS is newer than now +
682      * slop. A value of LATEST_TIMESTAMP disables this check.
683      */
684     this.timestampSlop = conf.getLong(
685         "hbase.hregion.keyvalue.timestamp.slop.millisecs",
686         HConstants.LATEST_TIMESTAMP);
687 
688     /**
689      * Timeout for the process time in processRowsWithLocks().
690      * Use -1 to switch off time bound.
691      */
692     this.rowProcessorTimeout = conf.getLong(
693         "hbase.hregion.row.processor.timeout", DEFAULT_ROW_PROCESSOR_TIMEOUT);
694     this.durability = htd.getDurability() == Durability.USE_DEFAULT
695         ? DEFAULT_DURABILITY
696         : htd.getDurability();
697     if (rsServices != null) {
698       this.rsAccounting = this.rsServices.getRegionServerAccounting();
699       // don't initialize coprocessors if not running within a regionserver
700       // TODO: revisit if coprocessors should load in other cases
701       this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf);
702       this.metricsRegionWrapper = new MetricsRegionWrapperImpl(this);
703       this.metricsRegion = new MetricsRegion(this.metricsRegionWrapper);
704 
705       Map<String, Region> recoveringRegions = rsServices.getRecoveringRegions();
706       String encodedName = getRegionInfo().getEncodedName();
707       if (recoveringRegions != null && recoveringRegions.containsKey(encodedName)) {
708         this.recovering = true;
709         recoveringRegions.put(encodedName, this);
710       }
711     } else {
712       this.metricsRegionWrapper = null;
713       this.metricsRegion = null;
714     }
715     if (LOG.isDebugEnabled()) {
716       // Write out region name as string and its encoded name.
717       LOG.debug("Instantiated " + this);
718     }
719 
720     // by default, we allow writes against a region when it's in recovering
721     this.disallowWritesInRecovering =
722         conf.getBoolean(HConstants.DISALLOW_WRITES_IN_RECOVERING,
723           HConstants.DEFAULT_DISALLOW_WRITES_IN_RECOVERING_CONFIG);
724     configurationManager = Optional.absent();
725 
726     // disable stats tracking for system tables, but check the config for everything else
727     this.regionStatsEnabled = htd.getTableName().getNamespaceAsString().equals(
728         NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR) ?
729           false :
730           conf.getBoolean(HConstants.ENABLE_CLIENT_BACKPRESSURE,
731               HConstants.DEFAULT_ENABLE_CLIENT_BACKPRESSURE);
732   }
733 
734   void setHTableSpecificConf() {
735     if (this.htableDescriptor == null) return;
736     long flushSize = this.htableDescriptor.getMemStoreFlushSize();
737 
738     if (flushSize <= 0) {
739       flushSize = conf.getLong(HConstants.HREGION_MEMSTORE_FLUSH_SIZE,
740         HTableDescriptor.DEFAULT_MEMSTORE_FLUSH_SIZE);
741     }
742     this.memstoreFlushSize = flushSize;
743     this.blockingMemStoreSize = this.memstoreFlushSize *
744         conf.getLong("hbase.hregion.memstore.block.multiplier", 2);
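        // For example, with the default 128 MB flush size and the default multiplier of 2,
        // updates to this region start blocking once its memstores reach roughly 256 MB.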
745   }
746 
747   /**
748    * Initialize this region.
749    * Used only by tests and SplitTransaction to reopen the region.
750    * You should use createHRegion() or openHRegion() instead.
751    * @return What the next sequence (edit) id should be.
752    * @throws IOException e
753    * @deprecated use HRegion.createHRegion() or HRegion.openHRegion()
754    */
755   @Deprecated
756   public long initialize() throws IOException {
757     return initialize(null);
758   }
759 
760   /**
761    * Initialize this region.
762    *
763    * @param reporter Tickle every so often if initialize is taking a while.
764    * @return What the next sequence (edit) id should be.
765    * @throws IOException e
766    */
767   private long initialize(final CancelableProgressable reporter) throws IOException {
768     MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this);
769     long nextSeqId = -1;
770     try {
771       nextSeqId = initializeRegionInternals(reporter, status);
772       return nextSeqId;
773     } finally {
774       // nextSeqId will be -1 if the initialization fails;
775       // otherwise it will be at least 0.
776       if (nextSeqId == -1) {
777         status.abort("Exception during region " + getRegionInfo().getRegionNameAsString() +
778           " initialization.");
779       }
780     }
781   }
782 
783   private long initializeRegionInternals(final CancelableProgressable reporter,
784       final MonitoredTask status) throws IOException {
785     if (coprocessorHost != null) {
786       status.setStatus("Running coprocessor pre-open hook");
787       coprocessorHost.preOpen();
788     }
789 
790     // Write HRI to a file in case we need to recover hbase:meta
791     status.setStatus("Writing region info on filesystem");
792     fs.checkRegionInfoOnFilesystem();
793 
794     // Initialize all the HStores
795     status.setStatus("Initializing all the Stores");
796     long maxSeqId = initializeRegionStores(reporter, status, false);
797     this.lastReplayedOpenRegionSeqId = maxSeqId;
798 
799     this.writestate.setReadOnly(ServerRegionReplicaUtil.isReadOnly(this));
800     this.writestate.flushRequested = false;
801     this.writestate.compacting = 0;
802 
803     if (this.writestate.writesEnabled) {
804       // Remove temporary data left over from old regions
805       status.setStatus("Cleaning up temporary data from old regions");
806       fs.cleanupTempDir();
807     }
808 
809     if (this.writestate.writesEnabled) {
810       status.setStatus("Cleaning up detritus from prior splits");
811       // Get rid of any splits or merges that were lost in-progress.  Clean out
812       // these directories here on open.  We may be opening a region that was
813       // being split but we crashed in the middle of it all.
814       fs.cleanupAnySplitDetritus();
815       fs.cleanupMergesDir();
816     }
817 
818     // Initialize split policy
819     this.splitPolicy = RegionSplitPolicy.create(this, conf);
820 
821     // Initialize flush policy
822     this.flushPolicy = FlushPolicyFactory.create(this, conf);
823 
824     long lastFlushTime = EnvironmentEdgeManager.currentTime();
825     for (Store store: stores.values()) {
826       this.lastStoreFlushTimeMap.put(store, lastFlushTime);
827     }
828 
829     // Use maximum of log sequenceid or that which was found in stores
830     // (particularly if no recovered edits, seqid will be -1).
831     long nextSeqid = maxSeqId;
832 
833     // In distributedLogReplay mode, we don't know the last change sequence number because region
834     // is opened before recovery completes. So we add a safety bumper to avoid new
835     // sequence numbers overlapping sequence numbers that were already used.
836     if (this.writestate.writesEnabled) {
837       nextSeqid = WALSplitter.writeRegionSequenceIdFile(this.fs.getFileSystem(), this.fs
838           .getRegionDir(), nextSeqid, (this.recovering ? (this.flushPerChanges + 10000000) : 1));
839     } else {
840       nextSeqid++;
841     }
842 
843     LOG.info("Onlined " + this.getRegionInfo().getShortNameToLog() +
844       "; next sequenceid=" + nextSeqid);
845 
846     // A region can be reopened if it failed a split; reset flags
847     this.closing.set(false);
848     this.closed.set(false);
849 
850     if (coprocessorHost != null) {
851       status.setStatus("Running coprocessor post-open hooks");
852       coprocessorHost.postOpen();
853     }
854 
855     status.markComplete("Region opened successfully");
856     return nextSeqid;
857   }
858 
859   private long initializeRegionStores(final CancelableProgressable reporter, MonitoredTask status,
860       boolean warmupOnly)
861       throws IOException {
862 
863     // Load in all the HStores.
864 
865     long maxSeqId = -1;
866     // initialized to -1 so that we pick up MemstoreTS from column families
867     long maxMemstoreTS = -1;
868 
869     if (!htableDescriptor.getFamilies().isEmpty()) {
870       // initialize the thread pool for opening stores in parallel.
871       ThreadPoolExecutor storeOpenerThreadPool =
872         getStoreOpenAndCloseThreadPool("StoreOpener-" + this.getRegionInfo().getShortNameToLog());
873       CompletionService<HStore> completionService =
874         new ExecutorCompletionService<HStore>(storeOpenerThreadPool);
875 
876       // initialize each store in parallel
877       for (final HColumnDescriptor family : htableDescriptor.getFamilies()) {
878         status.setStatus("Instantiating store for column family " + family);
879         completionService.submit(new Callable<HStore>() {
880           @Override
881           public HStore call() throws IOException {
882             return instantiateHStore(family);
883           }
884         });
885       }
886       boolean allStoresOpened = false;
887       try {
888         for (int i = 0; i < htableDescriptor.getFamilies().size(); i++) {
889           Future<HStore> future = completionService.take();
890           HStore store = future.get();
891           this.stores.put(store.getFamily().getName(), store);
892 
893           long storeMaxSequenceId = store.getMaxSequenceId();
894           maxSeqIdInStores.put(store.getColumnFamilyName().getBytes(),
895               storeMaxSequenceId);
896           if (maxSeqId == -1 || storeMaxSequenceId > maxSeqId) {
897             maxSeqId = storeMaxSequenceId;
898           }
899           long maxStoreMemstoreTS = store.getMaxMemstoreTS();
900           if (maxStoreMemstoreTS > maxMemstoreTS) {
901             maxMemstoreTS = maxStoreMemstoreTS;
902           }
903         }
904         allStoresOpened = true;
905       } catch (InterruptedException e) {
906         throw (InterruptedIOException)new InterruptedIOException().initCause(e);
907       } catch (ExecutionException e) {
908         throw new IOException(e.getCause());
909       } finally {
910         storeOpenerThreadPool.shutdownNow();
911         if (!allStoresOpened) {
912           // something went wrong, close all opened stores
913           LOG.error("Could not initialize all stores for the region=" + this);
914           for (Store store : this.stores.values()) {
915             try {
916               store.close();
917             } catch (IOException e) {
918               LOG.warn(e.getMessage());
919             }
920           }
921         }
922       }
923     }
924     if (ServerRegionReplicaUtil.shouldReplayRecoveredEdits(this) && !warmupOnly) {
925       // Recover any edits if available.
926       maxSeqId = Math.max(maxSeqId, replayRecoveredEditsIfAny(
927           this.fs.getRegionDir(), maxSeqIdInStores, reporter, status));
928     }
929     maxSeqId = Math.max(maxSeqId, maxMemstoreTS + 1);
930     mvcc.initialize(maxSeqId);
931     return maxSeqId;
932   }
933 
934   private void initializeWarmup(final CancelableProgressable reporter) throws IOException {
935     MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this);
936 
937     // Initialize all the HStores
938     status.setStatus("Warming up all the Stores");
939     initializeRegionStores(reporter, status, true);
940   }
941 
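      // Writes a REGION_OPEN event marker, listing the region's current store files, to the WAL
      // so that consumers of the WAL (for example, secondary region replicas) can learn the
      // open sequence id and the current set of store files.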
942   private void writeRegionOpenMarker(WAL wal, long openSeqId) throws IOException {
943     Map<byte[], List<Path>> storeFiles = new TreeMap<byte[], List<Path>>(Bytes.BYTES_COMPARATOR);
944     for (Store store: getStores()) {
945       ArrayList<Path> storeFileNames = new ArrayList<Path>();
946       for (StoreFile storeFile: store.getStorefiles()) {
947         storeFileNames.add(storeFile.getPath());
948       }
949       storeFiles.put(store.getFamily().getName(), storeFileNames);
950     }
951 
952     RegionEventDescriptor regionOpenDesc = ProtobufUtil.toRegionEventDescriptor(
953       RegionEventDescriptor.EventType.REGION_OPEN, getRegionInfo(), openSeqId,
954       getRegionServerServices().getServerName(), storeFiles);
955     WALUtil.writeRegionEventMarker(wal, getTableDesc(), getRegionInfo(), regionOpenDesc,
956       getSequenceId());
957   }
958 
959   private void writeRegionCloseMarker(WAL wal) throws IOException {
960     Map<byte[], List<Path>> storeFiles = new TreeMap<byte[], List<Path>>(Bytes.BYTES_COMPARATOR);
961     for (Store store: getStores()) {
962       ArrayList<Path> storeFileNames = new ArrayList<Path>();
963       for (StoreFile storeFile: store.getStorefiles()) {
964         storeFileNames.add(storeFile.getPath());
965       }
966       storeFiles.put(store.getFamily().getName(), storeFileNames);
967     }
968 
969     RegionEventDescriptor regionEventDesc = ProtobufUtil.toRegionEventDescriptor(
970       RegionEventDescriptor.EventType.REGION_CLOSE, getRegionInfo(), getSequenceId().get(),
971       getRegionServerServices().getServerName(), storeFiles);
972     WALUtil.writeRegionEventMarker(wal, getTableDesc(), getRegionInfo(), regionEventDesc,
973       getSequenceId());
974 
975     // Store SeqId in HDFS when a region closes
976     // We check that the region folder exists because many tests delete the table folder
977     // while the table is still online.
978     if (this.fs.getFileSystem().exists(this.fs.getRegionDir())) {
979       WALSplitter.writeRegionSequenceIdFile(this.fs.getFileSystem(), this.fs.getRegionDir(),
980         getSequenceId().get(), 0);
981     }
982   }
983 
984   /**
985    * @return True if this region has references.
986    */
987   public boolean hasReferences() {
988     for (Store store : this.stores.values()) {
989       if (store.hasReferences()) return true;
990     }
991     return false;
992   }
993 
994   @Override
995   public HDFSBlocksDistribution getHDFSBlocksDistribution() {
996     HDFSBlocksDistribution hdfsBlocksDistribution =
997       new HDFSBlocksDistribution();
998     synchronized (this.stores) {
999       for (Store store : this.stores.values()) {
1000         for (StoreFile sf : store.getStorefiles()) {
1001           HDFSBlocksDistribution storeFileBlocksDistribution =
1002             sf.getHDFSBlockDistribution();
1003           hdfsBlocksDistribution.add(storeFileBlocksDistribution);
1004         }
1005       }
1006     }
1007     return hdfsBlocksDistribution;
1008   }
1009 
1010   /**
1011    * This is a helper function to compute HDFS block distribution on demand
1012    * @param conf configuration
1013    * @param tableDescriptor HTableDescriptor of the table
1014    * @param regionInfo HRegionInfo describing the region
1015    * @return The HDFS blocks distribution for the given region.
1016    * @throws IOException
1017    */
1018   public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf,
1019       final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo) throws IOException {
1020     Path tablePath = FSUtils.getTableDir(FSUtils.getRootDir(conf), tableDescriptor.getTableName());
1021     return computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo, tablePath);
1022   }
1023 
1024   /**
1025    * This is a helper function to compute HDFS block distribution on demand
1026    * @param conf configuration
1027    * @param tableDescriptor HTableDescriptor of the table
1028    * @param regionInfo HRegionInfo describing the region
1029    * @param tablePath the table directory
1030    * @return The HDFS blocks distribution for the given region.
1031    * @throws IOException
1032    */
1033   public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf,
1034       final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo,  Path tablePath)
1035       throws IOException {
1036     HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution();
1037     FileSystem fs = tablePath.getFileSystem(conf);
1038 
1039     HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tablePath, regionInfo);
1040     for (HColumnDescriptor family: tableDescriptor.getFamilies()) {
1041       Collection<StoreFileInfo> storeFiles = regionFs.getStoreFiles(family.getNameAsString());
1042       if (storeFiles == null) continue;
1043 
1044       for (StoreFileInfo storeFileInfo : storeFiles) {
1045         hdfsBlocksDistribution.add(storeFileInfo.computeHDFSBlocksDistribution(fs));
1046       }
1047     }
1048     return hdfsBlocksDistribution;
1049   }
1050 
1051   /**
1052    * Increase the size of the memstore in this region and the size of the global memstore.
1053    * @param memStoreSize the delta (in bytes) to add to the memstore size
1054    * @return the size of the memstore in this region
1055    */
1056   public long addAndGetGlobalMemstoreSize(long memStoreSize) {
1057     if (this.rsAccounting != null) {
1058       rsAccounting.addAndGetGlobalMemstoreSize(memStoreSize);
1059     }
1060     return this.memstoreSize.addAndGet(memStoreSize);
1061   }
1062 
1063   @Override
1064   public HRegionInfo getRegionInfo() {
1065     return this.fs.getRegionInfo();
1066   }
1067 
1068   /**
1069    * @return Instance of {@link RegionServerServices} used by this HRegion.
1070    * Can be null.
1071    */
1072   RegionServerServices getRegionServerServices() {
1073     return this.rsServices;
1074   }
1075 
1076   @Override
1077   public long getReadRequestsCount() {
1078     return readRequestsCount.get();
1079   }
1080 
1081   @Override
1082   public void updateReadRequestsCount(long i) {
1083     readRequestsCount.add(i);
1084   }
1085 
1086   @Override
1087   public long getWriteRequestsCount() {
1088     return writeRequestsCount.get();
1089   }
1090 
1091   @Override
1092   public void updateWriteRequestsCount(long i) {
1093     writeRequestsCount.add(i);
1094   }
1095 
1096   @Override
1097   public long getMemstoreSize() {
1098     return memstoreSize.get();
1099   }
1100 
1101   @Override
1102   public long getNumMutationsWithoutWAL() {
1103     return numMutationsWithoutWAL.get();
1104   }
1105   
1106   @Override
1107   public long getDataInMemoryWithoutWAL() {
1108     return dataInMemoryWithoutWAL.get();
1109   }
1110 
1111   @Override
1112   public long getBlockedRequestsCount() {
1113     return blockedRequestsCount.get();
1114   }
1115 
1116   @Override
1117   public long getCheckAndMutateChecksPassed() {
1118     return checkAndMutateChecksPassed.get();
1119   }
1120 
1121   @Override
1122   public long getCheckAndMutateChecksFailed() {
1123     return checkAndMutateChecksFailed.get();
1124   }
1125 
1126   @Override
1127   public MetricsRegion getMetrics() {
1128     return metricsRegion;
1129   }
1130 
1131   @Override
1132   public boolean isClosed() {
1133     return this.closed.get();
1134   }
1135 
1136   @Override
1137   public boolean isClosing() {
1138     return this.closing.get();
1139   }
1140 
1141   @Override
1142   public boolean isReadOnly() {
1143     return this.writestate.isReadOnly();
1144   }
1145 
1146   /**
1147    * Set the recovering state of the current region.
1148    */
1149   public void setRecovering(boolean newState) {
1150     boolean wasRecovering = this.recovering;
1151     // before we flip the recovering switch (enabling reads) we should write the region open
1152     // event to WAL if needed
1153     if (wal != null && getRegionServerServices() != null && !writestate.readOnly
1154         && wasRecovering && !newState) {
1155 
1156       // force a flush only if region replication is set up for this region. Otherwise no need.
1157       boolean forceFlush = getTableDesc().getRegionReplication() > 1;
1158 
1159       // force a flush first
1160       MonitoredTask status = TaskMonitor.get().createStatus(
1161         "Flushing region " + this + " because recovery is finished");
1162       try {
1163         if (forceFlush) {
1164           internalFlushcache(status);
1165         }
1166 
1167         status.setStatus("Writing region open event marker to WAL because recovery is finished");
1168         try {
1169           long seqId = openSeqNum;
1170           // obtain a new seqId because we possibly have writes and flushes on top of openSeqNum
1171           if (wal != null) {
1172             seqId = getNextSequenceId(wal);
1173           }
1174           writeRegionOpenMarker(wal, seqId);
1175         } catch (IOException e) {
1176           // We cannot rethrow this exception since we are being called from the zk thread. The
1177           // region has already opened. In this case we log the error, but continue
1178           LOG.warn(getRegionInfo().getEncodedName() + " : was not able to write region opening "
1179               + "event to WAL, continuing", e);
1180         }
1181       } catch (IOException ioe) {
1182         // Distributed log replay semantics does not necessarily require a flush, since the replayed
1183         // data is already written again in the WAL. So a failed flush should be fine.
1184         LOG.warn(getRegionInfo().getEncodedName() + " : was not able to flush "
1185             + "event to WAL, continuing", ioe);
1186       } finally {
1187         status.cleanup();
1188       }
1189     }
1190 
1191     this.recovering = newState;
1192     if (wasRecovering && !recovering) {
1193       // Call only when wal replay is over.
1194       coprocessorHost.postLogReplay();
1195     }
1196   }
1197 
1198   @Override
1199   public boolean isRecovering() {
1200     return this.recovering;
1201   }
1202 
1203   @Override
1204   public boolean isAvailable() {
1205     return !isClosed() && !isClosing();
1206   }
1207 
1208   /** @return true if region is splittable */
1209   public boolean isSplittable() {
1210     return isAvailable() && !hasReferences();
1211   }
1212 
1213   /**
1214    * @return true if region is mergeable
1215    */
1216   public boolean isMergeable() {
1217     if (!isAvailable()) {
1218       LOG.debug("Region " + getRegionInfo().getRegionNameAsString()
1219           + " is not mergeable because it is closing or closed");
1220       return false;
1221     }
1222     if (hasReferences()) {
1223       LOG.debug("Region " + getRegionInfo().getRegionNameAsString()
1224           + " is not mergeable because it has references");
1225       return false;
1226     }
1227 
1228     return true;
1229   }
1230 
1231   public boolean areWritesEnabled() {
1232     synchronized(this.writestate) {
1233       return this.writestate.writesEnabled;
1234     }
1235   }
1236 
1237   public MultiVersionConsistencyControl getMVCC() {
1238     return mvcc;
1239   }
1240 
1241   @Override
1242   public long getMaxFlushedSeqId() {
1243     return maxFlushedSeqId;
1244   }
1245 
1246   @Override
1247   public long getReadpoint(IsolationLevel isolationLevel) {
1248     if (isolationLevel == IsolationLevel.READ_UNCOMMITTED) {
1249       // This scan can read even uncommitted transactions
1250       return Long.MAX_VALUE;
1251     }
1252     return mvcc.memstoreReadPoint();
1253   }
1254 
1255   @Override
1256   public boolean isLoadingCfsOnDemandDefault() {
1257     return this.isLoadingCfsOnDemandDefault;
1258   }
1259 
1260   /**
1261    * Close down this HRegion.  Flush the cache, shut down each HStore, don't
1262    * service any more calls.
1263    *
1264    * <p>This method could take some time to execute, so don't call it from a
1265    * time-sensitive thread.
1266    *
1267    * @return Map of all the storage files that the HRegion's component
1268    * HStores make use of, keyed by column family name. Returns an empty map
1269    * if already closed, and null if it is judged that the region should not close.
1270    *
1271    * @throws IOException e
1272    */
1273   public Map<byte[], List<StoreFile>> close() throws IOException {
1274     return close(false);
1275   }
1276 
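       // Guards close(): only one thread at a time may run doClose() (see close(boolean)).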
1277   private final Object closeLock = new Object();
1278 
1279   /** Conf key for the periodic flush interval */
1280   public static final String MEMSTORE_PERIODIC_FLUSH_INTERVAL =
1281       "hbase.regionserver.optionalcacheflushinterval";
1282   /** Default interval for the memstore flush */
1283   public static final int DEFAULT_CACHE_FLUSH_INTERVAL = 3600000; // 1 hour
1284   public static final int META_CACHE_FLUSH_INTERVAL = 300000; // 5 minutes
1285 
1286   /** Conf key to force a flush if there are already enough changes for one region in memstore */
1287   public static final String MEMSTORE_FLUSH_PER_CHANGES =
1288       "hbase.regionserver.flush.per.changes";
1289   public static final long DEFAULT_FLUSH_PER_CHANGES = 30000000; // 30 million
1290   /**
1291    * The following MAX_FLUSH_PER_CHANGES is large enough because each KeyValue has 20+ bytes
1292    * overhead. Therefore, even 1G empty KVs occupy at least 20GB memstore size for a single region
1293    */
1294   public static final long MAX_FLUSH_PER_CHANGES = 1000000000; // 1G
1295 
1296   /**
1297    * Close down this HRegion.  Flush the cache unless the abort parameter is true,
1298    * shut down each HStore, and don't service any more calls.
1299    *
1300    * This method could take some time to execute, so don't call it from a
1301    * time-sensitive thread.
1302    *
1303    * @param abort true if server is aborting (only during testing)
1304    * @return Vector of all the storage files that the HRegion's component
1305    * HStores make use of.  It's a list of HStoreFile objects.  Can be null if
1306    * we are not to close at this time or we are already closed.
1307    *
1308    * @throws IOException e
1309    */
1310   public Map<byte[], List<StoreFile>> close(final boolean abort) throws IOException {
1311     // Only allow one thread to close at a time. Serialize them so dual
1312     // threads attempting to close will run up against each other.
1313     MonitoredTask status = TaskMonitor.get().createStatus(
1314         "Closing region " + this +
1315         (abort ? " due to abort" : ""));
1316 
1317     status.setStatus("Waiting for close lock");
1318     try {
1319       synchronized (closeLock) {
1320         return doClose(abort, status);
1321       }
1322     } finally {
1323       status.cleanup();
1324     }
1325   }
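       // A minimal usage sketch (assuming the caller owns an open HRegion named "region"):
       //   Map<byte[], List<StoreFile>> closedFiles = region.close();
       //   if (closedFiles == null) {
       //     // region was already closed, or it was judged that it should not close
       //   }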
1326 
1327   private Map<byte[], List<StoreFile>> doClose(final boolean abort, MonitoredTask status)
1328       throws IOException {
1329     if (isClosed()) {
1330       LOG.warn("Region " + this + " already closed");
1331       return null;
1332     }
1333 
1334     if (coprocessorHost != null) {
1335       status.setStatus("Running coprocessor pre-close hooks");
1336       this.coprocessorHost.preClose(abort);
1337     }
1338 
1339     status.setStatus("Disabling compacts and flushes for region");
1340     boolean canFlush = true;
1341     synchronized (writestate) {
1342       // Disable compacting and flushing by background threads for this
1343       // region.
1344       canFlush = !writestate.readOnly;
1345       writestate.writesEnabled = false;
1346       LOG.debug("Closing " + this + ": disabling compactions & flushes");
1347       waitForFlushesAndCompactions();
1348     }
1349     // If we were not just flushing, is it worth doing a preflush...one
1350     // that will clear out the bulk of the memstore before we put up
1351     // the close flag?
1352     if (!abort && worthPreFlushing() && canFlush) {
1353       status.setStatus("Pre-flushing region before close");
1354       LOG.info("Running close preflush of " + getRegionInfo().getRegionNameAsString());
1355       try {
1356         internalFlushcache(status);
1357       } catch (IOException ioe) {
1358         // Failed to flush the region. Keep going.
1359         status.setStatus("Failed pre-flush " + this + "; " + ioe.getMessage());
1360       }
1361     }
1362 
1363     this.closing.set(true);
1364     status.setStatus("Disabling writes for close");
1365     // block waiting for the lock for closing
1366     lock.writeLock().lock();
1367     try {
1368       if (this.isClosed()) {
1369         status.abort("Already got closed by another process");
1370         // SplitTransaction handles the null
1371         return null;
1372       }
1373       LOG.debug("Updates disabled for region " + this);
1374       // Don't flush the cache if we are aborting
1375       if (!abort && canFlush) {
1376         int flushCount = 0;
1377         while (this.memstoreSize.get() > 0) {
1378           try {
1379             if (flushCount++ > 0) {
1380               int actualFlushes = flushCount - 1;
1381               if (actualFlushes > 5) {
1382                 // If we tried 5 times and are unable to clear memory, abort
1383                 // so we do not lose data
1384                 throw new DroppedSnapshotException("Failed clearing memory after " +
1385                   actualFlushes + " attempts on region: " +
1386                     Bytes.toStringBinary(getRegionInfo().getRegionName()));
1387               }
1388               LOG.info("Running extra flush, " + actualFlushes +
1389                 " (carrying snapshot?) " + this);
1390             }
1391             internalFlushcache(status);
1392           } catch (IOException ioe) {
1393             status.setStatus("Failed flush " + this + ", putting online again");
1394             synchronized (writestate) {
1395               writestate.writesEnabled = true;
1396             }
1397             // Have to throw to upper layers.  I can't abort server from here.
1398             throw ioe;
1399           }
1400         }
1401       }
1402 
1403       Map<byte[], List<StoreFile>> result =
1404         new TreeMap<byte[], List<StoreFile>>(Bytes.BYTES_COMPARATOR);
1405       if (!stores.isEmpty()) {
1406         // initialize the thread pool for closing stores in parallel.
1407         ThreadPoolExecutor storeCloserThreadPool =
1408           getStoreOpenAndCloseThreadPool("StoreCloserThread-" +
1409             getRegionInfo().getRegionNameAsString());
1410         CompletionService<Pair<byte[], Collection<StoreFile>>> completionService =
1411           new ExecutorCompletionService<Pair<byte[], Collection<StoreFile>>>(storeCloserThreadPool);
1412 
1413         // close each store in parallel
1414         for (final Store store : stores.values()) {
1415           assert abort || store.getFlushableSize() == 0 || writestate.readOnly;
1416           completionService
1417               .submit(new Callable<Pair<byte[], Collection<StoreFile>>>() {
1418                 @Override
1419                 public Pair<byte[], Collection<StoreFile>> call() throws IOException {
1420                   return new Pair<byte[], Collection<StoreFile>>(
1421                     store.getFamily().getName(), store.close());
1422                 }
1423               });
1424         }
1425         try {
1426           for (int i = 0; i < stores.size(); i++) {
1427             Future<Pair<byte[], Collection<StoreFile>>> future = completionService.take();
1428             Pair<byte[], Collection<StoreFile>> storeFiles = future.get();
1429             List<StoreFile> familyFiles = result.get(storeFiles.getFirst());
1430             if (familyFiles == null) {
1431               familyFiles = new ArrayList<StoreFile>();
1432               result.put(storeFiles.getFirst(), familyFiles);
1433             }
1434             familyFiles.addAll(storeFiles.getSecond());
1435           }
1436         } catch (InterruptedException e) {
1437           throw (InterruptedIOException)new InterruptedIOException().initCause(e);
1438         } catch (ExecutionException e) {
1439           throw new IOException(e.getCause());
1440         } finally {
1441           storeCloserThreadPool.shutdownNow();
1442         }
1443       }
1444 
1445       status.setStatus("Writing region close event to WAL");
1446       if (!abort && wal != null && getRegionServerServices() != null && !writestate.readOnly) {
1447         writeRegionCloseMarker(wal);
1448       }
1449 
1450       this.closed.set(true);
1451       if (!canFlush) {
1452         addAndGetGlobalMemstoreSize(-memstoreSize.get());
1453       } else if (memstoreSize.get() != 0) {
1454         LOG.error("Memstore size is " + memstoreSize.get());
1455       }
1456       if (coprocessorHost != null) {
1457         status.setStatus("Running coprocessor post-close hooks");
1458         this.coprocessorHost.postClose(abort);
1459       }
1460       if (this.metricsRegion != null) {
1461         this.metricsRegion.close();
1462       }
1463       if (this.metricsRegionWrapper != null) {
1464         Closeables.closeQuietly(this.metricsRegionWrapper);
1465       }
1466       status.markComplete("Closed");
1467       LOG.info("Closed " + this);
1468       return result;
1469     } finally {
1470       lock.writeLock().unlock();
1471     }
1472   }
1473 
1474   @Override
1475   public void waitForFlushesAndCompactions() {
1476     synchronized (writestate) {
1477       if (this.writestate.readOnly) {
1478         // we should not wait for replayed flushes if we are read only (for example, in case
1479         // the region is a secondary replica).
1480         return;
1481       }
1482       boolean interrupted = false;
1483       try {
1484         while (writestate.compacting > 0 || writestate.flushing) {
1485           LOG.debug("waiting for " + writestate.compacting + " compactions"
1486             + (writestate.flushing ? " & cache flush" : "") + " to complete for region " + this);
1487           try {
1488             writestate.wait();
1489           } catch (InterruptedException iex) {
1490             // essentially ignore and propagate the interrupt back up
1491             LOG.warn("Interrupted while waiting");
1492             interrupted = true;
1493           }
1494         }
1495       } finally {
1496         if (interrupted) {
1497           Thread.currentThread().interrupt();
1498         }
1499       }
1500     }
1501   }
1502 
1503   protected ThreadPoolExecutor getStoreOpenAndCloseThreadPool(
1504       final String threadNamePrefix) {
1505     int numStores = Math.max(1, this.htableDescriptor.getFamilies().size());
1506     int maxThreads = Math.min(numStores,
1507         conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1508             HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX));
1509     return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1510   }
1511 
1512   protected ThreadPoolExecutor getStoreFileOpenAndCloseThreadPool(
1513       final String threadNamePrefix) {
1514     int numStores = Math.max(1, this.htableDescriptor.getFamilies().size());
1515     int maxThreads = Math.max(1,
1516         conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1517             HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX)
1518             / numStores);
1519     return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1520   }
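       // Worked example of the sizing above (hypothetical numbers): with 4 column families and
       // HSTORE_OPEN_AND_CLOSE_THREADS_MAX configured to 8, the store pool gets
       // min(4, 8) = 4 threads, while the per-store file pool gets max(1, 8 / 4) = 2 threads.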
1521 
1522   static ThreadPoolExecutor getOpenAndCloseThreadPool(int maxThreads,
1523       final String threadNamePrefix) {
1524     return Threads.getBoundedCachedThreadPool(maxThreads, 30L, TimeUnit.SECONDS,
1525       new ThreadFactory() {
1526         private int count = 1;
1527 
1528         @Override
1529         public Thread newThread(Runnable r) {
1530           return new Thread(r, threadNamePrefix + "-" + count++);
1531         }
1532       });
1533   }
1534 
1535   /**
1536    * @return True if it's worth doing a flush before we put up the close flag.
1537    */
1538   private boolean worthPreFlushing() {
1539     return this.memstoreSize.get() >
1540       this.conf.getLong("hbase.hregion.preclose.flush.size", 1024 * 1024 * 5);
1541   }
1542 
1543   //////////////////////////////////////////////////////////////////////////////
1544   // HRegion accessors
1545   //////////////////////////////////////////////////////////////////////////////
1546 
1547   @Override
1548   public HTableDescriptor getTableDesc() {
1549     return this.htableDescriptor;
1550   }
1551 
1552   /** @return WAL in use for this region */
1553   public WAL getWAL() {
1554     return this.wal;
1555   }
1556 
1557   /**
1558    * @return split policy for this region.
1559    */
1560   public RegionSplitPolicy getSplitPolicy() {
1561     return this.splitPolicy;
1562   }
1563 
1564   /**
1565    * A split takes the config from the parent region and passes it to the daughter
1566    * region's constructor. If 'conf' were passed, you would end up using the HTD
1567    * of the parent region in addition to the new daughter's HTD. Pass 'baseConf'
1568    * to the daughter regions to avoid this tricky dedupe problem.
1569    * @return Configuration object
1570    */
1571   Configuration getBaseConf() {
1572     return this.baseConf;
1573   }
1574 
1575   /** @return {@link FileSystem} being used by this region */
1576   public FileSystem getFilesystem() {
1577     return fs.getFileSystem();
1578   }
1579 
1580   /** @return the {@link HRegionFileSystem} used by this region */
1581   public HRegionFileSystem getRegionFileSystem() {
1582     return this.fs;
1583   }
1584 
1585   @Override
1586   public long getEarliestFlushTimeForAllStores() {
1587     return lastStoreFlushTimeMap.isEmpty() ? Long.MAX_VALUE : Collections.min(lastStoreFlushTimeMap
1588         .values());
1589   }
1590 
1591   @Override
1592   public long getOldestHfileTs(boolean majorCompactionOnly) throws IOException {
1593     long result = Long.MAX_VALUE;
1594     for (Store store : getStores()) {
1595       for (StoreFile file : store.getStorefiles()) {
1596         HFile.Reader reader = file.getReader().getHFileReader();
1597         if (majorCompactionOnly) {
1598           byte[] val = reader.loadFileInfo().get(StoreFile.MAJOR_COMPACTION_KEY);
1599           if (val == null || !Bytes.toBoolean(val)) {
1600             continue;
1601           }
1602         }
1603         result = Math.min(result, reader.getFileContext().getFileCreateTime());
1604       }
1605     }
1606     return result == Long.MAX_VALUE ? 0 : result;
1607   }
1608 
1609   RegionLoad.Builder setCompleteSequenceId(RegionLoad.Builder regionLoadBldr) {
1610     long lastFlushOpSeqIdLocal = this.lastFlushOpSeqId;
1611     byte[] encodedRegionName = this.getRegionInfo().getEncodedNameAsBytes();
1612     regionLoadBldr.clearStoreCompleteSequenceId();
1613     for (byte[] familyName : this.stores.keySet()) {
1614       long oldestUnflushedSeqId = this.wal.getEarliestMemstoreSeqNum(encodedRegionName, familyName);
1615       // no oldestUnflushedSeqId means no data has been written to the store after the last
1616       // flush, so we use lastFlushOpSeqId as the complete sequence id for the store.
1617       regionLoadBldr.addStoreCompleteSequenceId(StoreSequenceId
1618           .newBuilder()
1619           .setFamilyName(ByteString.copyFrom(familyName))
1620           .setSequenceId(
1621             oldestUnflushedSeqId < 0 ? lastFlushOpSeqIdLocal : oldestUnflushedSeqId - 1).build());
1622     }
1623     return regionLoadBldr.setCompleteSequenceId(this.maxFlushedSeqId);
1624   }
1625 
1626   //////////////////////////////////////////////////////////////////////////////
1627   // HRegion maintenance.
1628   //
1629   // These methods are meant to be called periodically by the HRegionServer for
1630   // upkeep.
1631   //////////////////////////////////////////////////////////////////////////////
1632 
1633   /** @return size of the largest HStore. */
1634   public long getLargestHStoreSize() {
1635     long size = 0;
1636     for (Store h : stores.values()) {
1637       long storeSize = h.getSize();
1638       if (storeSize > size) {
1639         size = storeSize;
1640       }
1641     }
1642     return size;
1643   }
1644 
1645   /*
1646    * Do preparation for pending compaction.
1647    * @throws IOException
1648    */
1649   protected void doRegionCompactionPrep() throws IOException {
1650   }
1651 
1652   @Override
1653   public void triggerMajorCompaction() throws IOException {
1654     for (Store s : getStores()) {
1655       s.triggerMajorCompaction();
1656     }
1657   }
1658 
1659   @Override
1660   public void compact(final boolean majorCompaction) throws IOException {
1661     if (majorCompaction) {
1662       triggerMajorCompaction();
1663     }
1664     for (Store s : getStores()) {
1665       CompactionContext compaction = s.requestCompaction();
1666       if (compaction != null) {
1667         CompactionThroughputController controller = null;
1668         if (rsServices != null) {
1669           controller = CompactionThroughputControllerFactory.create(rsServices, conf);
1670         }
1671         if (controller == null) {
1672           controller = NoLimitCompactionThroughputController.INSTANCE;
1673         }
1674         compact(compaction, s, controller);
1675       }
1676     }
1677   }
1678 
1679   /**
1680    * This is a helper function that compacts all the stores synchronously.
1681    * It is used by utilities and testing.
1682    *
1683    * @throws IOException e
1684    */
1685   public void compactStores() throws IOException {
1686     for (Store s : getStores()) {
1687       CompactionContext compaction = s.requestCompaction();
1688       if (compaction != null) {
1689         compact(compaction, s, NoLimitCompactionThroughputController.INSTANCE);
1690       }
1691     }
1692   }
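       // Usage sketch for the synchronous helpers above (testing/utility context, assuming an
       // open, writable HRegion named "region"):
       //   region.compact(true);    // request a major compaction on every store, then run them
       //   region.compactStores();  // or: run whatever compactions the stores currently request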
1693 
1694   /**
1695    * This is a helper function that compacts the given store.
1696    * It is used by utilities and testing.
1697    *
1698    * @throws IOException e
1699    */
1700   @VisibleForTesting
1701   void compactStore(byte[] family, CompactionThroughputController throughputController)
1702       throws IOException {
1703     Store s = getStore(family);
1704     CompactionContext compaction = s.requestCompaction();
1705     if (compaction != null) {
1706       compact(compaction, s, throughputController);
1707     }
1708   }
1709 
1710   /*
1711    * Called by compaction thread and after region is opened to compact the
1712    * HStores if necessary.
1713    *
1714    * <p>This operation could block for a long time, so don't call it from a
1715    * time-sensitive thread.
1716    *
1717    * Note that no locking is necessary at this level because compaction only
1718    * conflicts with a region split, and that cannot happen because the region
1719    * server does them sequentially and not in parallel.
1720    *
1721    * @param compaction Compaction details, obtained by requestCompaction()
1722    * @return whether the compaction completed
1723    */
1724   public boolean compact(CompactionContext compaction, Store store,
1725       CompactionThroughputController throughputController) throws IOException {
1726     assert compaction != null && compaction.hasSelection();
1727     assert !compaction.getRequest().getFiles().isEmpty();
1728     if (this.closing.get() || this.closed.get()) {
1729       LOG.debug("Skipping compaction on " + this + " because closing/closed");
1730       store.cancelRequestedCompaction(compaction);
1731       return false;
1732     }
1733     MonitoredTask status = null;
1734     boolean requestNeedsCancellation = true;
1735     // block waiting for the lock for compaction
1736     lock.readLock().lock();
1737     try {
1738       byte[] cf = Bytes.toBytes(store.getColumnFamilyName());
1739       if (stores.get(cf) != store) {
1740         LOG.warn("Store " + store.getColumnFamilyName() + " on region " + this
1741             + " has been re-instantiated, cancel this compaction request. "
1742             + " It may be caused by the roll back of split transaction");
1743         return false;
1744       }
1745 
1746       status = TaskMonitor.get().createStatus("Compacting " + store + " in " + this);
1747       if (this.closed.get()) {
1748         String msg = "Skipping compaction on " + this + " because closed";
1749         LOG.debug(msg);
1750         status.abort(msg);
1751         return false;
1752       }
1753       boolean wasStateSet = false;
1754       try {
1755         synchronized (writestate) {
1756           if (writestate.writesEnabled) {
1757             wasStateSet = true;
1758             ++writestate.compacting;
1759           } else {
1760             String msg = "NOT compacting region " + this + ". Writes disabled.";
1761             LOG.info(msg);
1762             status.abort(msg);
1763             return false;
1764           }
1765         }
1766         LOG.info("Starting compaction on " + store + " in region " + this
1767             + (compaction.getRequest().isOffPeak()?" as an off-peak compaction":""));
1768         doRegionCompactionPrep();
1769         try {
1770           status.setStatus("Compacting store " + store);
1771           // We no longer need to cancel the request on the way out of this
1772           // method because Store#compact will clean up unconditionally
1773           requestNeedsCancellation = false;
1774           store.compact(compaction, throughputController);
1775         } catch (InterruptedIOException iioe) {
1776           String msg = "compaction interrupted";
1777           LOG.info(msg, iioe);
1778           status.abort(msg);
1779           return false;
1780         }
1781       } finally {
1782         if (wasStateSet) {
1783           synchronized (writestate) {
1784             --writestate.compacting;
1785             if (writestate.compacting <= 0) {
1786               writestate.notifyAll();
1787             }
1788           }
1789         }
1790       }
1791       status.markComplete("Compaction complete");
1792       return true;
1793     } finally {
1794       try {
1795         if (requestNeedsCancellation) store.cancelRequestedCompaction(compaction);
1796         if (status != null) status.cleanup();
1797       } finally {
1798         lock.readLock().unlock();
1799       }
1800     }
1801   }
1802 
1803   @Override
1804   public FlushResult flush(boolean force) throws IOException {
1805     return flushcache(force, false);
1806   }
1807 
1808   /**
1809    * Flush the cache.
1810    *
1811    * When this method is called the cache will be flushed unless:
1812    * <ol>
1813    *   <li>the cache is empty</li>
1814    *   <li>the region is closed.</li>
1815    *   <li>a flush is already in progress</li>
1816    *   <li>writes are disabled</li>
1817    * </ol>
1818    *
1819    * <p>This method may block for some time, so it should not be called from a
1820    * time-sensitive thread.
1821    * @param forceFlushAllStores whether we want to flush all stores
1822    * @param writeFlushRequestWalMarker whether to write the flush request marker to WAL
1823    * @return whether the flush succeeded and whether the region needs compacting
1824    *
1825    * @throws IOException general io exceptions
1826    * @throws DroppedSnapshotException Thrown when replay of wal is required
1827    * because a Snapshot was not properly persisted.
1828    */
1829   public FlushResult flushcache(boolean forceFlushAllStores, boolean writeFlushRequestWalMarker)
1830       throws IOException {
1831     // fail-fast instead of waiting on the lock
1832     if (this.closing.get()) {
1833       String msg = "Skipping flush on " + this + " because closing";
1834       LOG.debug(msg);
1835       return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
1836     }
1837     MonitoredTask status = TaskMonitor.get().createStatus("Flushing " + this);
1838     status.setStatus("Acquiring readlock on region");
1839     // block waiting for the lock for flushing cache
1840     lock.readLock().lock();
1841     try {
1842       if (this.closed.get()) {
1843         String msg = "Skipping flush on " + this + " because closed";
1844         LOG.debug(msg);
1845         status.abort(msg);
1846         return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
1847       }
1848       if (coprocessorHost != null) {
1849         status.setStatus("Running coprocessor pre-flush hooks");
1850         coprocessorHost.preFlush();
1851       }
1852       // TODO: this should be managed within memstore with the snapshot, updated only after the
1853       // flush is successful
1854       if (numMutationsWithoutWAL.get() > 0) {
1855         numMutationsWithoutWAL.set(0);
1856         dataInMemoryWithoutWAL.set(0);
1857       }
1858       synchronized (writestate) {
1859         if (!writestate.flushing && writestate.writesEnabled) {
1860           this.writestate.flushing = true;
1861         } else {
1862           if (LOG.isDebugEnabled()) {
1863             LOG.debug("NOT flushing memstore for region " + this
1864                 + ", flushing=" + writestate.flushing + ", writesEnabled="
1865                 + writestate.writesEnabled);
1866           }
1867           String msg = "Not flushing since "
1868               + (writestate.flushing ? "already flushing"
1869               : "writes not enabled");
1870           status.abort(msg);
1871           return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
1872         }
1873       }
1874 
1875       try {
1876         Collection<Store> specificStoresToFlush =
1877             forceFlushAllStores ? stores.values() : flushPolicy.selectStoresToFlush();
1878         FlushResult fs = internalFlushcache(specificStoresToFlush,
1879           status, writeFlushRequestWalMarker);
1880 
1881         if (coprocessorHost != null) {
1882           status.setStatus("Running post-flush coprocessor hooks");
1883           coprocessorHost.postFlush();
1884         }
1885 
1886         status.markComplete("Flush successful");
1887         return fs;
1888       } finally {
1889         synchronized (writestate) {
1890           writestate.flushing = false;
1891           this.writestate.flushRequested = false;
1892           writestate.notifyAll();
1893         }
1894       }
1895     } finally {
1896       lock.readLock().unlock();
1897       status.cleanup();
1898     }
1899   }
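       // Usage sketch (assuming an open, writable HRegion named "region"): force-flush all stores
       // without requesting a flush-marker write, then inspect the outcome:
       //   FlushResult res = region.flushcache(true, false);
       //   // the result reports one of: CANNOT_FLUSH, CANNOT_FLUSH_MEMSTORE_EMPTY,
       //   // FLUSHED_NO_COMPACTION_NEEDED, FLUSHED_COMPACTION_NEEDED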
1900 
1901   /**
1902    * Should the store be flushed because it is old enough.
1903    * <p>
1904    * Every FlushPolicy should call this to determine whether a store is old enough to flush
1905    * (unless you always flush all stores). Otherwise the {@link #shouldFlush()} method will
1906    * always return true, which will make a lot of flush requests.
1907    */
1908   boolean shouldFlushStore(Store store) {
1909     long maxFlushedSeqId =
1910         this.wal.getEarliestMemstoreSeqNum(getRegionInfo().getEncodedNameAsBytes(), store
1911             .getFamily().getName()) - 1;
1912     if (maxFlushedSeqId > 0 && maxFlushedSeqId + flushPerChanges < sequenceId.get()) {
1913       if (LOG.isDebugEnabled()) {
1914         LOG.debug("Column Family: " + store.getColumnFamilyName() + " of region " + this
1915             + " will be flushed because its max flushed seqId(" + maxFlushedSeqId
1916             + ") is far away from current(" + sequenceId.get() + "), max allowed is "
1917             + flushPerChanges);
1918       }
1919       return true;
1920     }
1921     if (flushCheckInterval <= 0) {
1922       return false;
1923     }
1924     long now = EnvironmentEdgeManager.currentTime();
1925     if (store.timeOfOldestEdit() < now - flushCheckInterval) {
1926       if (LOG.isDebugEnabled()) {
1927         LOG.debug("Column Family: " + store.getColumnFamilyName() + " of region " + this
1928             + " will be flushed because time of its oldest edit (" + store.timeOfOldestEdit()
1929             + ") is far away from now(" + now + "), max allowed is " + flushCheckInterval);
1930       }
1931       return true;
1932     }
1933     return false;
1934   }
1935 
1936   /**
1937    * Should the memstore be flushed now?
1938    */
1939   boolean shouldFlush() {
1940     // This is a rough measure.
1941     if (this.maxFlushedSeqId > 0
1942           && (this.maxFlushedSeqId + this.flushPerChanges < this.sequenceId.get())) {
1943       return true;
1944     }
1945     long modifiedFlushCheckInterval = flushCheckInterval;
1946     if (getRegionInfo().isMetaRegion() &&
1947         getRegionInfo().getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
1948       modifiedFlushCheckInterval = META_CACHE_FLUSH_INTERVAL;
1949     }
1950     if (modifiedFlushCheckInterval <= 0) { //disabled
1951       return false;
1952     }
1953     long now = EnvironmentEdgeManager.currentTime();
1954     //if we flushed in the recent past, we don't need to do again now
1955     if ((now - getEarliestFlushTimeForAllStores() < modifiedFlushCheckInterval)) {
1956       return false;
1957     }
1958     //since we didn't flush in the recent past, flush now if certain conditions
1959     //are met. Return true on first such memstore hit.
1960     for (Store s : getStores()) {
1961       if (s.timeOfOldestEdit() < now - modifiedFlushCheckInterval) {
1962         // we have an old enough edit in the memstore, flush
1963         return true;
1964       }
1965     }
1966     return false;
1967   }
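       // Worked example of the checks above (using the defaults defined earlier in this class):
       // with flushPerChanges = 30,000,000, the region should flush once its sequenceId has
       // advanced more than 30M edits past maxFlushedSeqId; otherwise, with flushCheckInterval =
       // 3,600,000 ms, it should flush once the last flush is over an hour old and some store
       // still holds an edit older than an hour.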
1968 
1969   /**
1970    * Flushing all stores.
1971    *
1972    * @see #internalFlushcache(Collection, MonitoredTask, boolean)
1973    */
1974   private FlushResult internalFlushcache(MonitoredTask status)
1975       throws IOException {
1976     return internalFlushcache(stores.values(), status, false);
1977   }
1978 
1979   /**
1980    * Flushing given stores.
1981    *
1982    * @see #internalFlushcache(WAL, long, Collection, MonitoredTask, boolean)
1983    */
1984   private FlushResult internalFlushcache(final Collection<Store> storesToFlush,
1985       MonitoredTask status, boolean writeFlushWalMarker) throws IOException {
1986     return internalFlushcache(this.wal, HConstants.NO_SEQNUM, storesToFlush,
1987         status, writeFlushWalMarker);
1988   }
1989 
1990   /**
1991    * Flush the memstore. Flushing the memstore is a little tricky. We have a lot
1992    * of updates in the memstore, all of which have also been written to the wal.
1993    * We need to write those updates in the memstore out to disk, while being
1994    * able to process reads/writes as much as possible during the flush
1995    * operation.
1996    * <p>
1997    * This method may block for some time. Every time you call it, we up the
1998    * region's sequence id even if we don't flush; i.e. the returned sequence id
1999    * will be at least one larger than the last edit applied to this region. The
2000    * returned id does not refer to an actual edit. The returned id can be used
2001    * for, say, installing a bulk loaded file just ahead of the last hfile that
2002    * resulted from this flush, etc.
2003    *
2004    * @param wal
2005    *          Null if we're NOT to go via wal.
2006    * @param myseqid
2007    *          The seqid to use if <code>wal</code> is null writing out flush
2008    *          file.
2009    * @param storesToFlush
2010    *          The list of stores to flush.
2011    * @return object describing the flush's state
2012    * @throws IOException
2013    *           general io exceptions
2014    * @throws DroppedSnapshotException
2015    *           Thrown when replay of wal is required because a Snapshot was not
2016    *           properly persisted.
2017    */
2018   protected FlushResult internalFlushcache(final WAL wal, final long myseqid,
2019       final Collection<Store> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker)
2020           throws IOException {
2021     PrepareFlushResult result
2022       = internalPrepareFlushCache(wal, myseqid, storesToFlush, status, writeFlushWalMarker);
2023     if (result.result == null) {
2024       return internalFlushCacheAndCommit(wal, status, result, storesToFlush);
2025     } else {
2026       return result.result; // early exit due to failure from prepare stage
2027     }
2028   }
2029 
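       // In outline, the two phases below work as follows: internalPrepareFlushCache snapshots
       // each store's memstore under the updates write lock, obtains a flush sequence id and
       // writes a START_FLUSH marker to the WAL; internalFlushCacheAndCommit then writes the
       // snapshots out as hfiles, commits them, and records a COMMIT_FLUSH marker, or writes
       // ABORT_FLUSH and throws DroppedSnapshotException if persisting the snapshot fails.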
2030   protected PrepareFlushResult internalPrepareFlushCache(
2031       final WAL wal, final long myseqid, final Collection<Store> storesToFlush,
2032       MonitoredTask status, boolean writeFlushWalMarker)
2033           throws IOException {
2034 
2035     if (this.rsServices != null && this.rsServices.isAborted()) {
2036       // Don't flush when server aborting, it's unsafe
2037       throw new IOException("Aborting flush because server is aborted...");
2038     }
2039     final long startTime = EnvironmentEdgeManager.currentTime();
2040     // If nothing to flush, return, but we need to safely update the region sequence id
2041     if (this.memstoreSize.get() <= 0) {
2042       // Take an update lock because we are about to change the sequence id and we want the
2043       // sequence id to be at the border of the empty memstore.
2044       MultiVersionConsistencyControl.WriteEntry w = null;
2045       this.updatesLock.writeLock().lock();
2046       try {
2047         if (this.memstoreSize.get() <= 0) {
2048           // Presume that if there are still no edits in the memstore, then there are no edits for
2049           // this region out in the WAL subsystem so no need to do any trickery clearing out
2050           // edits in the WAL system. Up the sequence number so the resulting flush id is for
2051           // sure just beyond the last appended region edit (useful as a marker when bulk loading,
2052           // etc.)
2053           // wal can be null when replaying edits.
2054           if (wal != null) {
2055             w = mvcc.beginMemstoreInsert();
2056             long flushOpSeqId = getNextSequenceId(wal);
2057             FlushResult flushResult = new FlushResultImpl(
2058               FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, flushOpSeqId, "Nothing to flush",
2059               writeFlushRequestMarkerToWAL(wal, writeFlushWalMarker));
2060             w.setWriteNumber(flushOpSeqId);
2061             mvcc.waitForPreviousTransactionsComplete(w);
2062             w = null;
2063             return new PrepareFlushResult(flushResult, myseqid);
2064           } else {
2065             return new PrepareFlushResult(
2066               new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY,
2067                 "Nothing to flush", false),
2068               myseqid);
2069           }
2070         }
2071       } finally {
2072         this.updatesLock.writeLock().unlock();
2073         if (w != null) {
2074           mvcc.advanceMemstore(w);
2075         }
2076       }
2077     }
2078 
2079     if (LOG.isInfoEnabled()) {
2080       LOG.info("Started memstore flush for " + this + ", current region memstore size "
2081           + StringUtils.byteDesc(this.memstoreSize.get()) + ", and " + storesToFlush.size() + "/"
2082           + stores.size() + " column families' memstores are being flushed."
2083           + ((wal != null) ? "" : "; wal is null, using passed sequenceid=" + myseqid));
2084       // only log when we are not flushing all stores.
2085       if (this.stores.size() > storesToFlush.size()) {
2086         for (Store store: storesToFlush) {
2087           LOG.info("Flushing Column Family: " + store.getColumnFamilyName()
2088               + " which was occupying "
2089               + StringUtils.byteDesc(store.getMemStoreSize()) + " of memstore.");
2090         }
2091       }
2092     }
2093     // Stop updates while we snapshot the memstore of all of this region's stores. We only have
2094     // to do this for a moment.  It is quick. We also set the memstore size to zero here before we
2095     // allow updates again so its value will represent the size of the updates received
2096     // during the flush.
2097     MultiVersionConsistencyControl.WriteEntry w = null;
2098     // We have to take an update lock during the snapshot, or else a write could end up in both
2099     // the snapshot and the memstore (which makes atomic rows difficult).
2100     status.setStatus("Obtaining lock to block concurrent updates");
2101     // block waiting for the lock for internal flush
2102     this.updatesLock.writeLock().lock();
2103     status.setStatus("Preparing to flush by snapshotting stores in " +
2104       getRegionInfo().getEncodedName());
2105     long totalFlushableSizeOfFlushableStores = 0;
2106 
2107     Set<byte[]> flushedFamilyNames = new HashSet<byte[]>();
2108     for (Store store: storesToFlush) {
2109       flushedFamilyNames.add(store.getFamily().getName());
2110     }
2111 
2112     TreeMap<byte[], StoreFlushContext> storeFlushCtxs
2113       = new TreeMap<byte[], StoreFlushContext>(Bytes.BYTES_COMPARATOR);
2114     TreeMap<byte[], List<Path>> committedFiles = new TreeMap<byte[], List<Path>>(
2115         Bytes.BYTES_COMPARATOR);
2116     // The sequence id of this flush operation which is used to log FlushMarker and pass to
2117     // createFlushContext to use as the store file's sequence id.
2118     long flushOpSeqId = HConstants.NO_SEQNUM;
2119     // The max flushed sequence id after this flush operation. Used as completeSequenceId which is
2120     // passed to HMaster.
2121     long flushedSeqId = HConstants.NO_SEQNUM;
2122     byte[] encodedRegionName = getRegionInfo().getEncodedNameAsBytes();
2123 
2124     long trxId = 0;
2125     try {
2126       try {
2127         w = mvcc.beginMemstoreInsert();
2128         if (wal != null) {
2129           if (!wal.startCacheFlush(encodedRegionName, flushedFamilyNames)) {
2130             // This should never happen.
2131             String msg = "Flush will not be started for ["
2132                 + this.getRegionInfo().getEncodedName() + "] - because the WAL is closing.";
2133             status.setStatus(msg);
2134             return new PrepareFlushResult(
2135               new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false),
2136               myseqid);
2137           }
2138           flushOpSeqId = getNextSequenceId(wal);
2139           long oldestUnflushedSeqId = wal.getEarliestMemstoreSeqNum(encodedRegionName);
2140           // no oldestUnflushedSeqId means we flushed all stores.
2141           // or the unflushed stores are all empty.
2142           flushedSeqId = (oldestUnflushedSeqId == HConstants.NO_SEQNUM) ? flushOpSeqId
2143               : oldestUnflushedSeqId - 1;
2144         } else {
2145           // use the provided sequence Id as WAL is not being used for this flush.
2146           flushedSeqId = flushOpSeqId = myseqid;
2147         }
2148 
2149         for (Store s : storesToFlush) {
2150           totalFlushableSizeOfFlushableStores += s.getFlushableSize();
2151           storeFlushCtxs.put(s.getFamily().getName(), s.createFlushContext(flushOpSeqId));
2152           committedFiles.put(s.getFamily().getName(), null); // for writing stores to WAL
2153         }
2154 
2155         // write the snapshot start to WAL
2156         if (wal != null && !writestate.readOnly) {
2157           FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.START_FLUSH,
2158             getRegionInfo(), flushOpSeqId, committedFiles);
2159           // no sync. Sync is below where we do not hold the updates lock
2160           trxId = WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
2161             desc, sequenceId, false);
2162         }
2163 
2164         // Prepare flush (take a snapshot)
2165         for (StoreFlushContext flush : storeFlushCtxs.values()) {
2166           flush.prepare();
2167         }
2168       } catch (IOException ex) {
2169         if (wal != null) {
2170           if (trxId > 0) { // check whether we have already written START_FLUSH to WAL
2171             try {
2172               FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH,
2173                 getRegionInfo(), flushOpSeqId, committedFiles);
2174               WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
2175                 desc, sequenceId, false);
2176             } catch (Throwable t) {
2177               LOG.warn("Received unexpected exception trying to write ABORT_FLUSH marker to WAL:" +
2178                   StringUtils.stringifyException(t));
2179               // ignore this since we will be aborting the RS with DSE.
2180             }
2181           }
2182           // we have called wal.startCacheFlush(), now we have to abort it
2183           wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2184           throw ex; // let upper layers deal with it.
2185         }
2186       } finally {
2187         this.updatesLock.writeLock().unlock();
2188       }
2189       String s = "Finished memstore snapshotting " + this +
2190         ", syncing WAL and waiting on mvcc, flushsize=" + totalFlushableSizeOfFlushableStores;
2191       status.setStatus(s);
2192       if (LOG.isTraceEnabled()) LOG.trace(s);
2193       // sync unflushed WAL changes
2194       // see HBASE-8208 for details
2195       if (wal != null) {
2196         try {
2197           wal.sync(); // ensure that flush marker is sync'ed
2198         } catch (IOException ioe) {
2199           LOG.warn("Unexpected exception while wal.sync(), ignoring. Exception: "
2200               + StringUtils.stringifyException(ioe));
2201         }
2202       }
2203 
2204       // wait for all in-progress transactions to commit to WAL before
2205       // we can start the flush. This prevents
2206       // uncommitted transactions from being written into HFiles.
2207       // We have to block before we start the flush, otherwise keys that
2208       // were removed via a rollbackMemstore could be written to Hfiles.
2209       w.setWriteNumber(flushOpSeqId);
2210       mvcc.waitForPreviousTransactionsComplete(w);
2211       // set w to null to prevent mvcc.advanceMemstore from being called again inside finally block
2212       w = null;
2213     } finally {
2214       if (w != null) {
2215         // in case of failure just mark current w as complete
2216         mvcc.advanceMemstore(w);
2217       }
2218     }
2219     return new PrepareFlushResult(storeFlushCtxs, committedFiles, startTime, flushOpSeqId,
2220       flushedSeqId, totalFlushableSizeOfFlushableStores);
2221   }
2222 
2223   /**
2224    * Writes a marker to the WAL indicating a flush is requested but cannot be completed due to
2225    * various reasons. Ignores exceptions from the WAL. Returns whether the write succeeded.
2226    * @param wal
2227    * @return whether WAL write was successful
2228    */
2229   private boolean writeFlushRequestMarkerToWAL(WAL wal, boolean writeFlushWalMarker) {
2230     if (writeFlushWalMarker && wal != null && !writestate.readOnly) {
2231       FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.CANNOT_FLUSH,
2232         getRegionInfo(), -1, new TreeMap<byte[], List<Path>>());
2233       try {
2234         WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
2235           desc, sequenceId, true);
2236         return true;
2237       } catch (IOException e) {
2238         LOG.warn(getRegionInfo().getEncodedName() + " : "
2239             + "Received exception while trying to write the flush request to wal", e);
2240       }
2241     }
2242     return false;
2243   }
2244 
2245   protected FlushResult internalFlushCacheAndCommit(
2246         final WAL wal, MonitoredTask status, final PrepareFlushResult prepareResult,
2247         final Collection<Store> storesToFlush)
2248     throws IOException {
2249 
2250     // prepare flush context is carried via PrepareFlushResult
2251     TreeMap<byte[], StoreFlushContext> storeFlushCtxs = prepareResult.storeFlushCtxs;
2252     TreeMap<byte[], List<Path>> committedFiles = prepareResult.committedFiles;
2253     long startTime = prepareResult.startTime;
2254     long flushOpSeqId = prepareResult.flushOpSeqId;
2255     long flushedSeqId = prepareResult.flushedSeqId;
2256     long totalFlushableSizeOfFlushableStores = prepareResult.totalFlushableSize;
2257 
2258     String s = "Flushing stores of " + this;
2259     status.setStatus(s);
2260     if (LOG.isTraceEnabled()) LOG.trace(s);
2261 
2262     // Any failure from here on out will be catastrophic, requiring a server
2263     // restart so the wal content can be replayed and put back into the memstore.
2264     // Otherwise, the snapshot content, while backed up in the wal, will not
2265     // be part of the currently running server's state.
2266     boolean compactionRequested = false;
2267     try {
2268       // A.  Flush memstore to all the HStores.
2269       // Keep running vector of all store files that includes both old and the
2270       // just-made new flush store file. The new flushed file is still in the
2271       // tmp directory.
2272 
2273       for (StoreFlushContext flush : storeFlushCtxs.values()) {
2274         flush.flushCache(status);
2275       }
2276 
2277       // Switch snapshot (in memstore) -> new hfile (thus causing
2278       // all the store scanners to reset/reseek).
2279       Iterator<Store> it = storesToFlush.iterator();
2280       // stores.values() and storeFlushCtxs have same order
2281       for (StoreFlushContext flush : storeFlushCtxs.values()) {
2282         boolean needsCompaction = flush.commit(status);
2283         if (needsCompaction) {
2284           compactionRequested = true;
2285         }
2286         committedFiles.put(it.next().getFamily().getName(), flush.getCommittedFiles());
2287       }
2288       storeFlushCtxs.clear();
2289 
2290       // Set down the memstore size by amount of flush.
2291       this.addAndGetGlobalMemstoreSize(-totalFlushableSizeOfFlushableStores);
2292 
2293       if (wal != null) {
2294         // write flush marker to WAL. If fail, we should throw DroppedSnapshotException
2295         FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.COMMIT_FLUSH,
2296           getRegionInfo(), flushOpSeqId, committedFiles);
2297         WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
2298           desc, sequenceId, true);
2299       }
2300     } catch (Throwable t) {
2301       // An exception here means that the snapshot was not persisted.
2302       // The wal needs to be replayed so its content is restored to memstore.
2303       // Currently, only a server restart will do this.
2304       // We used to only catch IOEs but its possible that we'd get other
2305       // exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch
2306       // all and sundry.
2307       if (wal != null) {
2308         try {
2309           FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH,
2310             getRegionInfo(), flushOpSeqId, committedFiles);
2311           WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
2312             desc, sequenceId, false);
2313         } catch (Throwable ex) {
2314           LOG.warn(getRegionInfo().getEncodedName() + " : "
2315               + "Received unexpected exception trying to write ABORT_FLUSH marker to WAL:"
2316               + StringUtils.stringifyException(ex));
2317           // ignore this since we will be aborting the RS with DSE.
2318         }
2319         wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2320       }
2321       DroppedSnapshotException dse = new DroppedSnapshotException("region: " +
2322           Bytes.toStringBinary(getRegionInfo().getRegionName()));
2323       dse.initCause(t);
2324       status.abort("Flush failed: " + StringUtils.stringifyException(t));
2325       throw dse;
2326     }
2327 
2328     // If we get to here, the HStores have been written.
2329     if (wal != null) {
2330       wal.completeCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2331     }
2332 
2333     // Record latest flush time
2334     for (Store store: storesToFlush) {
2335       this.lastStoreFlushTimeMap.put(store, startTime);
2336     }
2337 
2338     // Update the oldest unflushed sequence id for region.
2339     this.maxFlushedSeqId = flushedSeqId;
2340 
2341     // Record flush operation sequence id.
2342     this.lastFlushOpSeqId = flushOpSeqId;
2343 
2344     // C. Finally notify anyone waiting on memstore to clear:
2345     // e.g. checkResources().
2346     synchronized (this) {
2347       notifyAll(); // FindBugs NN_NAKED_NOTIFY
2348     }
2349 
2350     long time = EnvironmentEdgeManager.currentTime() - startTime;
2351     long memstoresize = this.memstoreSize.get();
2352     String msg = "Finished memstore flush of ~"
2353         + StringUtils.byteDesc(totalFlushableSizeOfFlushableStores) + "/"
2354         + totalFlushableSizeOfFlushableStores + ", currentsize="
2355         + StringUtils.byteDesc(memstoresize) + "/" + memstoresize
2356         + " for region " + this + " in " + time + "ms, sequenceid="
2357         + flushOpSeqId +  ", compaction requested=" + compactionRequested
2358         + ((wal == null) ? "; wal=null" : "");
2359     LOG.info(msg);
2360     status.setStatus(msg);
2361 
2362     return new FlushResultImpl(compactionRequested ? 
2363         FlushResult.Result.FLUSHED_COMPACTION_NEEDED :
2364           FlushResult.Result.FLUSHED_NO_COMPACTION_NEEDED,
2365         flushOpSeqId);
2366   }
2367 
2368   /**
2369    * Method to safely get the next sequence number.
2370    * @return Next sequence number unassociated with any actual edit.
2371    * @throws IOException
2372    */
2373   @VisibleForTesting
2374   protected long getNextSequenceId(final WAL wal) throws IOException {
2375     WALKey key = this.appendEmptyEdit(wal, null);
2376     return key.getSequenceId();
2377   }
2378 
2379   //////////////////////////////////////////////////////////////////////////////
2380   // get() methods for client use.
2381   //////////////////////////////////////////////////////////////////////////////
2382 
2383   @Override
2384   public Result getClosestRowBefore(final byte [] row, final byte [] family) throws IOException {
2385     if (coprocessorHost != null) {
2386       Result result = new Result();
2387       if (coprocessorHost.preGetClosestRowBefore(row, family, result)) {
2388         return result;
2389       }
2390     }
2391     // look across all the HStores for this region and determine what the
2392     // closest key is across all column families, since the data may be sparse
2393     checkRow(row, "getClosestRowBefore");
2394     startRegionOperation(Operation.GET);
2395     this.readRequestsCount.increment();
2396     try {
2397       Store store = getStore(family);
2398       // get the closest key. (HStore.getRowKeyAtOrBefore can return null)
2399       Cell key = store.getRowKeyAtOrBefore(row);
2400       Result result = null;
2401       if (key != null) {
2402         Get get = new Get(CellUtil.cloneRow(key));
2403         get.addFamily(family);
2404         result = get(get);
2405       }
2406       if (coprocessorHost != null) {
2407         coprocessorHost.postGetClosestRowBefore(row, family, result);
2408       }
2409       return result;
2410     } finally {
2411       closeRegionOperation(Operation.GET);
2412     }
2413   }
2414 
2415   @Override
2416   public RegionScanner getScanner(Scan scan) throws IOException {
2417    return getScanner(scan, null);
2418   }
2419 
2420   protected RegionScanner getScanner(Scan scan,
2421       List<KeyValueScanner> additionalScanners) throws IOException {
2422     startRegionOperation(Operation.SCAN);
2423     try {
2424       // Verify families are all valid
2425       if (!scan.hasFamilies()) {
2426         // Adding all families to scanner
2427         for (byte[] family: this.htableDescriptor.getFamiliesKeys()) {
2428           scan.addFamily(family);
2429         }
2430       } else {
2431         for (byte [] family : scan.getFamilyMap().keySet()) {
2432           checkFamily(family);
2433         }
2434       }
2435       return instantiateRegionScanner(scan, additionalScanners);
2436     } finally {
2437       closeRegionOperation(Operation.SCAN);
2438     }
2439   }
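       // Usage sketch (assuming an open HRegion named "region" and a Scan over existing families):
       //   RegionScanner scanner = region.getScanner(new Scan());
       //   List<Cell> cells = new ArrayList<Cell>();
       //   boolean more;
       //   do {
       //     cells.clear();
       //     more = scanner.next(cells);
       //     // consume the cells for one row here
       //   } while (more);
       //   scanner.close();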
2440 
2441   protected RegionScanner instantiateRegionScanner(Scan scan,
2442       List<KeyValueScanner> additionalScanners) throws IOException {
2443     if (scan.isReversed()) {
2444       if (scan.getFilter() != null) {
2445         scan.getFilter().setReversed(true);
2446       }
2447       return new ReversedRegionScannerImpl(scan, additionalScanners, this);
2448     }
2449     return new RegionScannerImpl(scan, additionalScanners, this);
2450   }
2451 
2452   @Override
2453   public void prepareDelete(Delete delete) throws IOException {
2454     // Check to see if this is a deleteRow insert
2455     if(delete.getFamilyCellMap().isEmpty()){
2456       for(byte [] family : this.htableDescriptor.getFamiliesKeys()){
2457         // Don't eat the timestamp
2458         delete.addFamily(family, delete.getTimeStamp());
2459       }
2460     } else {
2461       for(byte [] family : delete.getFamilyCellMap().keySet()) {
2462         if(family == null) {
2463           throw new NoSuchColumnFamilyException("Empty family is invalid");
2464         }
2465         checkFamily(family);
2466       }
2467     }
2468   }
2469 
2470   @Override
2471   public void delete(Delete delete) throws IOException {
2472     checkReadOnly();
2473     checkResources();
2474     startRegionOperation(Operation.DELETE);
2475     try {
2476       delete.getRow();
2477       // All edits for the given row (across all column families) must happen atomically.
2478       doBatchMutate(delete);
2479     } finally {
2480       closeRegionOperation(Operation.DELETE);
2481     }
2482   }
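       // Usage sketch (assuming an open, writable HRegion named "region"; the row key is
       // hypothetical): a Delete with an empty family map is treated as a whole-row delete
       // across all families (see prepareDelete above):
       //   Delete d = new Delete(Bytes.toBytes("row1"));
       //   region.delete(d);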
2483 
2484   /**
2485    * Row needed by the method below.
2486    */
2487   private static final byte [] FOR_UNIT_TESTS_ONLY = Bytes.toBytes("ForUnitTestsOnly");
2488 
2489   /**
2490    * This is used only by unit tests. Not required to be a public API.
2491    * @param familyMap map of family to edits for the given family.
2492    * @throws IOException
2493    */
2494   void delete(NavigableMap<byte[], List<Cell>> familyMap,
2495       Durability durability) throws IOException {
2496     Delete delete = new Delete(FOR_UNIT_TESTS_ONLY);
2497     delete.setFamilyCellMap(familyMap);
2498     delete.setDurability(durability);
2499     doBatchMutate(delete);
2500   }
2501 
2502   @Override
2503   public void prepareDeleteTimestamps(Mutation mutation, Map<byte[], List<Cell>> familyMap,
2504       byte[] byteNow) throws IOException {
2505     for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
2506 
2507       byte[] family = e.getKey();
2508       List<Cell> cells = e.getValue();
2509       assert cells instanceof RandomAccess;
2510 
2511       Map<byte[], Integer> kvCount = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
2512       int listSize = cells.size();
2513       for (int i=0; i < listSize; i++) {
2514         Cell cell = cells.get(i);
2515         //  Check if time is LATEST, change to time of most recent addition if so
2516         //  This is expensive.
2517         if (cell.getTimestamp() == HConstants.LATEST_TIMESTAMP && CellUtil.isDeleteType(cell)) {
2518           byte[] qual = CellUtil.cloneQualifier(cell);
2519           if (qual == null) qual = HConstants.EMPTY_BYTE_ARRAY;
2520 
2521           Integer count = kvCount.get(qual);
2522           if (count == null) {
2523             kvCount.put(qual, 1);
2524           } else {
2525             kvCount.put(qual, count + 1);
2526           }
2527           count = kvCount.get(qual);
2528 
2529           Get get = new Get(CellUtil.cloneRow(cell));
2530           get.setMaxVersions(count);
2531           get.addColumn(family, qual);
2532           if (coprocessorHost != null) {
2533             if (!coprocessorHost.prePrepareTimeStampForDeleteVersion(mutation, cell,
2534                 byteNow, get)) {
2535               updateDeleteLatestVersionTimeStamp(cell, get, count, byteNow);
2536             }
2537           } else {
2538             updateDeleteLatestVersionTimeStamp(cell, get, count, byteNow);
2539           }
2540         } else {
2541           CellUtil.updateLatestStamp(cell, byteNow, 0);
2542         }
2543       }
2544     }
2545   }
2546 
2547   void updateDeleteLatestVersionTimeStamp(Cell cell, Get get, int count, byte[] byteNow)
2548       throws IOException {
2549     List<Cell> result = get(get, false);
2550 
2551     if (result.size() < count) {
2552       // Nothing to delete
2553       CellUtil.updateLatestStamp(cell, byteNow, 0);
2554       return;
2555     }
2556     if (result.size() > count) {
2557       throw new RuntimeException("Unexpected size: " + result.size());
2558     }
2559     Cell getCell = result.get(count - 1);
2560     CellUtil.setTimestamp(cell, getCell.getTimestamp());
2561   }
2562 
2563   @Override
2564   public void put(Put put) throws IOException {
2565     checkReadOnly();
2566 
2567     // Do a rough check that we have resources to accept a write.  The check is
2568     // 'rough' in that between the resource check and the call to obtain a
2569     // read lock, resources may run out.  For now, the thought is that this
2570     // will be extremely rare; we'll deal with it when it happens.
2571     checkResources();
2572     startRegionOperation(Operation.PUT);
2573     try {
2574       // All edits for the given row (across all column families) must happen atomically.
2575       doBatchMutate(put);
2576     } finally {
2577       closeRegionOperation(Operation.PUT);
2578     }
2579   }
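       // Usage sketch (assuming an open, writable HRegion named "region"; row, family, qualifier
       // and value below are hypothetical):
       //   Put p = new Put(Bytes.toBytes("row1"));
       //   p.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("value"));
       //   region.put(p);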
2580 
2581   /**
2582    * Struct-like class that tracks the progress of a batch operation,
2583    * accumulating status codes and tracking the index at which processing
2584    * is proceeding.
2585    */
2586   private abstract static class BatchOperationInProgress<T> {
2587     T[] operations;
2588     int nextIndexToProcess = 0;
2589     OperationStatus[] retCodeDetails;
2590     WALEdit[] walEditsFromCoprocessors;
2591 
2592     public BatchOperationInProgress(T[] operations) {
2593       this.operations = operations;
2594       this.retCodeDetails = new OperationStatus[operations.length];
2595       this.walEditsFromCoprocessors = new WALEdit[operations.length];
2596       Arrays.fill(this.retCodeDetails, OperationStatus.NOT_RUN);
2597     }
2598 
2599     public abstract Mutation getMutation(int index);
2600     public abstract long getNonceGroup(int index);
2601     public abstract long getNonce(int index);
2602     /** This method is potentially expensive and should only be used for the non-replay CP path. */
2603     public abstract Mutation[] getMutationsForCoprocs();
2604     public abstract boolean isInReplay();
2605     public abstract long getReplaySequenceId();
2606 
2607     public boolean isDone() {
2608       return nextIndexToProcess == operations.length;
2609     }
2610   }
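  // Two concrete batch types follow: MutationBatch for the regular client write path
  // (one nonce/nonceGroup pair shared by the whole batch) and ReplayBatch for WAL replay
  // (per-operation nonces plus the replay sequence id).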
2611 
2612   private static class MutationBatch extends BatchOperationInProgress<Mutation> {
2613     private long nonceGroup;
2614     private long nonce;
2615     public MutationBatch(Mutation[] operations, long nonceGroup, long nonce) {
2616       super(operations);
2617       this.nonceGroup = nonceGroup;
2618       this.nonce = nonce;
2619     }
2620 
2621     @Override
2622     public Mutation getMutation(int index) {
2623       return this.operations[index];
2624     }
2625 
2626     @Override
2627     public long getNonceGroup(int index) {
2628       return nonceGroup;
2629     }
2630 
2631     @Override
2632     public long getNonce(int index) {
2633       return nonce;
2634     }
2635 
2636     @Override
2637     public Mutation[] getMutationsForCoprocs() {
2638       return this.operations;
2639     }
2640 
2641     @Override
2642     public boolean isInReplay() {
2643       return false;
2644     }
2645 
2646     @Override
2647     public long getReplaySequenceId() {
2648       return 0;
2649     }
2650   }
2651 
2652   private static class ReplayBatch extends BatchOperationInProgress<MutationReplay> {
2653     private long replaySeqId = 0;
2654     public ReplayBatch(MutationReplay[] operations, long seqId) {
2655       super(operations);
2656       this.replaySeqId = seqId;
2657     }
2658 
2659     @Override
2660     public Mutation getMutation(int index) {
2661       return this.operations[index].mutation;
2662     }
2663 
2664     @Override
2665     public long getNonceGroup(int index) {
2666       return this.operations[index].nonceGroup;
2667     }
2668 
2669     @Override
2670     public long getNonce(int index) {
2671       return this.operations[index].nonce;
2672     }
2673 
2674     @Override
2675     public Mutation[] getMutationsForCoprocs() {
2676       assert false;
2677       throw new RuntimeException("Should not be called for replay batch");
2678     }
2679 
2680     @Override
2681     public boolean isInReplay() {
2682       return true;
2683     }
2684 
2685     @Override
2686     public long getReplaySequenceId() {
2687       return this.replaySeqId;
2688     }
2689   }
2690 
2691   @Override
2692   public OperationStatus[] batchMutate(Mutation[] mutations, long nonceGroup, long nonce)
2693       throws IOException {
2694       // As it stands, this is used for two things:
2695       //  * batchMutate with a single mutation - put/delete, either standalone or from checkAndMutate.
2696       //  * coprocessor calls (see e.g. BulkDeleteEndpoint).
2697     // So nonces are not really ever used by HBase. They could be by coprocs, and checkAnd...
2698     return batchMutate(new MutationBatch(mutations, nonceGroup, nonce));
2699   }
2700 
2701   public OperationStatus[] batchMutate(Mutation[] mutations) throws IOException {
2702     return batchMutate(mutations, HConstants.NO_NONCE, HConstants.NO_NONCE);
2703   }
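  // Illustrative call (a sketch; the mutations are built elsewhere):
  //   OperationStatus[] statuses = region.batchMutate(new Mutation[] { put, delete });
  //   // statuses[i].getOperationStatusCode() reports SUCCESS, FAILURE, BAD_FAMILY, etc. per op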
2704 
2705   @Override
2706   public OperationStatus[] batchReplay(MutationReplay[] mutations, long replaySeqId)
2707       throws IOException {
2708     if (!RegionReplicaUtil.isDefaultReplica(getRegionInfo())
2709         && replaySeqId < lastReplayedOpenRegionSeqId) {
2710       // if it is a secondary replica we should ignore these entries silently
2711       // since they are coming out of order
2712       if (LOG.isTraceEnabled()) {
2713         LOG.trace(getRegionInfo().getEncodedName() + " : "
2714           + "Skipping " + mutations.length + " mutations with replaySeqId=" + replaySeqId
2715           + " which is less than lastReplayedOpenRegionSeqId=" + lastReplayedOpenRegionSeqId);
2716         for (MutationReplay mut : mutations) {
2717           LOG.trace(getRegionInfo().getEncodedName() + " : Skipping : " + mut.mutation);
2718         }
2719       }
2720 
2721       OperationStatus[] statuses = new OperationStatus[mutations.length];
2722       for (int i = 0; i < statuses.length; i++) {
2723         statuses[i] = OperationStatus.SUCCESS;
2724       }
2725       return statuses;
2726     }
2727     return batchMutate(new ReplayBatch(mutations, replaySeqId));
2728   }
2729 
2730   /**
2731    * Perform a batch of mutations.
2732    * It supports only Put and Delete mutations; any other type is marked as a failure and skipped.
2733    * @param batchOp contains the list of mutations
2734    * @return an array of OperationStatus which internally contains the
2735    *         OperationStatusCode and the exceptionMessage if any.
2736    * @throws IOException
2737    */
2738   OperationStatus[] batchMutate(BatchOperationInProgress<?> batchOp) throws IOException {
2739     boolean initialized = false;
2740     Operation op = batchOp.isInReplay() ? Operation.REPLAY_BATCH_MUTATE : Operation.BATCH_MUTATE;
2741     startRegionOperation(op);
2742     try {
2743       while (!batchOp.isDone()) {
2744         if (!batchOp.isInReplay()) {
2745           checkReadOnly();
2746         }
2747         checkResources();
2748 
2749         if (!initialized) {
2750           this.writeRequestsCount.add(batchOp.operations.length);
2751           if (!batchOp.isInReplay()) {
2752             doPreMutationHook(batchOp);
2753           }
2754           initialized = true;
2755         }
2756         long addedSize = doMiniBatchMutation(batchOp);
2757         long newSize = this.addAndGetGlobalMemstoreSize(addedSize);
2758         if (isFlushSize(newSize)) {
2759           requestFlush();
2760         }
2761       }
2762     } finally {
2763       closeRegionOperation(op);
2764     }
2765     return batchOp.retCodeDetails;
2766   }
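  // The loop above chunks the work: each doMiniBatchMutation() call processes as many
  // operations as it could obtain row locks for, advancing nextIndexToProcess, and a flush
  // is requested whenever the accumulated memstore size crosses the flush threshold.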
2767 
2768 
2769   private void doPreMutationHook(BatchOperationInProgress<?> batchOp)
2770       throws IOException {
2771     /* Run coprocessor pre hook outside of locks to avoid deadlock */
2772     WALEdit walEdit = new WALEdit();
2773     if (coprocessorHost != null) {
2774       for (int i = 0 ; i < batchOp.operations.length; i++) {
2775         Mutation m = batchOp.getMutation(i);
2776         if (m instanceof Put) {
2777           if (coprocessorHost.prePut((Put) m, walEdit, m.getDurability())) {
2778             // pre hook says skip this Put
2779             // mark as success and skip in doMiniBatchMutation
2780             batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
2781           }
2782         } else if (m instanceof Delete) {
2783           Delete curDel = (Delete) m;
2784           if (curDel.getFamilyCellMap().isEmpty()) {
2785             // handle deleting a row case
2786             prepareDelete(curDel);
2787           }
2788           if (coprocessorHost.preDelete(curDel, walEdit, m.getDurability())) {
2789             // pre hook says skip this Delete
2790             // mark as success and skip in doMiniBatchMutation
2791             batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
2792           }
2793         } else {
2794           // If a mutation other than a Put or Delete (e.g. an Append) is passed to batchMutate,
2795           // mark its return code as a failure so that it will not be considered in
2796           // doMiniBatchMutation.
2797           batchOp.retCodeDetails[i] = new OperationStatus(OperationStatusCode.FAILURE,
2798               "Put/Delete mutations only supported in batchMutate() now");
2799         }
2800         if (!walEdit.isEmpty()) {
2801           batchOp.walEditsFromCoprocessors[i] = walEdit;
2802           walEdit = new WALEdit();
2803         }
2804       }
2805     }
2806   }
2807 
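  // doMiniBatchMutation works through a contiguous slice of the batch:
  //   STEP 1 acquire row locks, STEP 2 resolve LATEST_TIMESTAMP cells, STEP 3 write to the
  //   memstore, STEPS 4-5 build and append the WAL edit, STEP 6 release locks, STEP 7 sync
  //   the WAL, STEP 8 advance MVCC, STEP 9 run post coprocessor hooks.
  // If anything fails between the memstore write and the WAL sync, the finally block rolls
  // the memstore writes back.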
2808   @SuppressWarnings("unchecked")
2809   private long doMiniBatchMutation(BatchOperationInProgress<?> batchOp) throws IOException {
2810     boolean isInReplay = batchOp.isInReplay();
2811     // variable to note if all Put items are for the same CF -- metrics related
2812     boolean putsCfSetConsistent = true;
2813     //The set of columnFamilies first seen for Put.
2814     Set<byte[]> putsCfSet = null;
2815     // variable to note if all Delete items are for the same CF -- metrics related
2816     boolean deletesCfSetConsistent = true;
2817     //The set of columnFamilies first seen for Delete.
2818     Set<byte[]> deletesCfSet = null;
2819 
2820     long currentNonceGroup = HConstants.NO_NONCE, currentNonce = HConstants.NO_NONCE;
2821     WALEdit walEdit = new WALEdit(isInReplay);
2822     MultiVersionConsistencyControl.WriteEntry w = null;
2823     long txid = 0;
2824     boolean doRollBackMemstore = false;
2825     boolean locked = false;
2826 
2827     /** Keep track of the locks we hold so we can release them in finally clause */
2828     List<RowLock> acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.operations.length);
2829     // reference family maps directly so coprocessors can mutate them if desired
2830     Map<byte[], List<Cell>>[] familyMaps = new Map[batchOp.operations.length];
2831     List<Cell> memstoreCells = new ArrayList<Cell>();
2832     // We try to set up a batch in the range [firstIndex,lastIndexExclusive)
2833     int firstIndex = batchOp.nextIndexToProcess;
2834     int lastIndexExclusive = firstIndex;
2835     boolean success = false;
2836     int noOfPuts = 0, noOfDeletes = 0;
2837     WALKey walKey = null;
2838     long mvccNum = 0;
2839     try {
2840       // ------------------------------------
2841       // STEP 1. Try to acquire as many locks as we can, and ensure
2842       // we acquire at least one.
2843       // ----------------------------------
2844       int numReadyToWrite = 0;
2845       long now = EnvironmentEdgeManager.currentTime();
2846       while (lastIndexExclusive < batchOp.operations.length) {
2847         Mutation mutation = batchOp.getMutation(lastIndexExclusive);
2848         boolean isPutMutation = mutation instanceof Put;
2849 
2850         Map<byte[], List<Cell>> familyMap = mutation.getFamilyCellMap();
2851         // store the family map reference to allow for mutations
2852         familyMaps[lastIndexExclusive] = familyMap;
2853 
2854         // skip anything that "ran" already
2855         if (batchOp.retCodeDetails[lastIndexExclusive].getOperationStatusCode()
2856             != OperationStatusCode.NOT_RUN) {
2857           lastIndexExclusive++;
2858           continue;
2859         }
2860 
2861         try {
2862           if (isPutMutation) {
2863             // Check the families in the put. If bad, skip this one.
2864             if (isInReplay) {
2865               removeNonExistentColumnFamilyForReplay(familyMap);
2866             } else {
2867               checkFamilies(familyMap.keySet());
2868             }
2869             checkTimestamps(mutation.getFamilyCellMap(), now);
2870           } else {
2871             prepareDelete((Delete) mutation);
2872           }
2873           checkRow(mutation.getRow(), "doMiniBatchMutation");
2874         } catch (NoSuchColumnFamilyException nscf) {
2875           LOG.warn("No such column family in batch mutation", nscf);
2876           batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
2877               OperationStatusCode.BAD_FAMILY, nscf.getMessage());
2878           lastIndexExclusive++;
2879           continue;
2880         } catch (FailedSanityCheckException fsce) {
2881           LOG.warn("Batch Mutation did not pass sanity check", fsce);
2882           batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
2883               OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage());
2884           lastIndexExclusive++;
2885           continue;
2886         } catch (WrongRegionException we) {
2887           LOG.warn("Batch mutation had a row that does not belong to this region", we);
2888           batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
2889               OperationStatusCode.SANITY_CHECK_FAILURE, we.getMessage());
2890           lastIndexExclusive++;
2891           continue;
2892         }
2893 
2894         // If we haven't got any rows in our batch, we should block to
2895         // get the next one.
2896         boolean shouldBlock = numReadyToWrite == 0;
2897         RowLock rowLock = null;
2898         try {
2899           rowLock = getRowLockInternal(mutation.getRow(), shouldBlock);
2900         } catch (IOException ioe) {
2901           LOG.warn("Failed getting lock in batch put, row="
2902             + Bytes.toStringBinary(mutation.getRow()), ioe);
2903         }
2904         if (rowLock == null) {
2905           // We failed to grab another lock
2906           assert !shouldBlock : "Should never fail to get lock when blocking";
2907           break; // stop acquiring more rows for this batch
2908         } else {
2909           acquiredRowLocks.add(rowLock);
2910         }
2911 
2912         lastIndexExclusive++;
2913         numReadyToWrite++;
2914 
2915         if (isPutMutation) {
2916           // If column families stay consistent throughout all of the
2917           // individual puts then metrics can be reported as a multiput across
2918           // column families in the first put.
2919           if (putsCfSet == null) {
2920             putsCfSet = mutation.getFamilyCellMap().keySet();
2921           } else {
2922             putsCfSetConsistent = putsCfSetConsistent
2923                 && mutation.getFamilyCellMap().keySet().equals(putsCfSet);
2924           }
2925         } else {
2926           if (deletesCfSet == null) {
2927             deletesCfSet = mutation.getFamilyCellMap().keySet();
2928           } else {
2929             deletesCfSetConsistent = deletesCfSetConsistent
2930                 && mutation.getFamilyCellMap().keySet().equals(deletesCfSet);
2931           }
2932         }
2933       }
2934 
2935       // we should record the timestamp only after we have acquired the rowLock,
2936       // otherwise, newer puts/deletes are not guaranteed to have a newer timestamp
2937       now = EnvironmentEdgeManager.currentTime();
2938       byte[] byteNow = Bytes.toBytes(now);
2939 
2940       // Nothing to put/delete -- an exception in the above such as NoSuchColumnFamily?
2941       if (numReadyToWrite <= 0) return 0L;
2942 
2943       // We've now grabbed as many mutations off the list as we can
2944 
2945       // ------------------------------------
2946       // STEP 2. Update any LATEST_TIMESTAMP timestamps
2947       // ----------------------------------
2948       for (int i = firstIndex; !isInReplay && i < lastIndexExclusive; i++) {
2949         // skip invalid
2950         if (batchOp.retCodeDetails[i].getOperationStatusCode()
2951             != OperationStatusCode.NOT_RUN) continue;
2952 
2953         Mutation mutation = batchOp.getMutation(i);
2954         if (mutation instanceof Put) {
2955           updateCellTimestamps(familyMaps[i].values(), byteNow);
2956           noOfPuts++;
2957         } else {
2958           prepareDeleteTimestamps(mutation, familyMaps[i], byteNow);
2959           noOfDeletes++;
2960         }
2961         rewriteCellTags(familyMaps[i], mutation);
2962       }
2963 
2964       lock(this.updatesLock.readLock(), numReadyToWrite);
2965       locked = true;
2966       if(isInReplay) {
2967         mvccNum = batchOp.getReplaySequenceId();
2968       } else {
2969         mvccNum = MultiVersionConsistencyControl.getPreAssignedWriteNumber(this.sequenceId);
2970       }
2971       //
2972       // ------------------------------------
2973       // Acquire the latest mvcc number
2974       // ----------------------------------
2975       w = mvcc.beginMemstoreInsertWithSeqNum(mvccNum);
2976 
2977       // calling the pre CP hook for batch mutation
2978       if (!isInReplay && coprocessorHost != null) {
2979         MiniBatchOperationInProgress<Mutation> miniBatchOp =
2980           new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
2981           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
2982         if (coprocessorHost.preBatchMutate(miniBatchOp)) return 0L;
2983       }
2984 
2985       // ------------------------------------
2986       // STEP 3. Write back to memstore
2987       // Write to memstore. It is ok to write to memstore
2988       // first without updating the WAL because we do not roll
2989       // forward the memstore MVCC. The MVCC will be moved up when
2990       // the complete operation is done. These changes are not yet
2991       // visible to scanners till we update the MVCC. The MVCC is
2992       // moved only when the sync is complete.
2993       // ----------------------------------
2994       long addedSize = 0;
2995       for (int i = firstIndex; i < lastIndexExclusive; i++) {
2996         if (batchOp.retCodeDetails[i].getOperationStatusCode()
2997             != OperationStatusCode.NOT_RUN) {
2998           continue;
2999         }
3000         doRollBackMemstore = true; // If we have a failure, we need to clean what we wrote
3001         addedSize += applyFamilyMapToMemstore(familyMaps[i], mvccNum, memstoreCells, isInReplay);
3002       }
3003 
3004       // ------------------------------------
3005       // STEP 4. Build WAL edit
3006       // ----------------------------------
3007       Durability durability = Durability.USE_DEFAULT;
3008       for (int i = firstIndex; i < lastIndexExclusive; i++) {
3009         // Skip operations that were determined to be invalid during preprocessing
3010         if (batchOp.retCodeDetails[i].getOperationStatusCode()
3011             != OperationStatusCode.NOT_RUN) {
3012           continue;
3013         }
3014         batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
3015 
3016         Mutation m = batchOp.getMutation(i);
3017         Durability tmpDur = getEffectiveDurability(m.getDurability());
3018         if (tmpDur.ordinal() > durability.ordinal()) {
3019           durability = tmpDur;
3020         }
3021         if (tmpDur == Durability.SKIP_WAL) {
3022           recordMutationWithoutWal(m.getFamilyCellMap());
3023           continue;
3024         }
3025 
3026         long nonceGroup = batchOp.getNonceGroup(i), nonce = batchOp.getNonce(i);
3027         // In replay, the batch may contain multiple nonces. If so, write WALEdit for each.
3028         // Given how nonces are originally written, these should be contiguous.
3029         // They don't have to be; it will still work, we will just write more WALEdits than needed.
3030         if (nonceGroup != currentNonceGroup || nonce != currentNonce) {
3031           if (walEdit.size() > 0) {
3032             assert isInReplay;
3033             if (!isInReplay) {
3034               throw new IOException("Multiple nonces per batch and not in replay");
3035             }
3036             // txid should always increase, so having the one from the last call is ok.
3037             // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
3038             walKey = new ReplayHLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
3039               this.htableDescriptor.getTableName(), now, m.getClusterIds(),
3040               currentNonceGroup, currentNonce);
3041             txid = this.wal.append(this.htableDescriptor,  this.getRegionInfo(),  walKey,
3042               walEdit, getSequenceId(), true, null);
3043             walEdit = new WALEdit(isInReplay);
3044             walKey = null;
3045           }
3046           currentNonceGroup = nonceGroup;
3047           currentNonce = nonce;
3048         }
3049 
3050         // Add WAL edits by CP
3051         WALEdit fromCP = batchOp.walEditsFromCoprocessors[i];
3052         if (fromCP != null) {
3053           for (Cell cell : fromCP.getCells()) {
3054             walEdit.add(cell);
3055           }
3056         }
3057         addFamilyMapToWALEdit(familyMaps[i], walEdit);
3058       }
3059 
3060       // -------------------------
3061       // STEP 5. Append the final edit to WAL. Do not sync wal.
3062       // -------------------------
3063       Mutation mutation = batchOp.getMutation(firstIndex);
3064       if (isInReplay) {
3065         // use wal key from the original
3066         walKey = new ReplayHLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
3067           this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now,
3068           mutation.getClusterIds(), currentNonceGroup, currentNonce);
3069         long replaySeqId = batchOp.getReplaySequenceId();
3070         walKey.setOrigLogSeqNum(replaySeqId);
3071 
3072         // ensure that the sequence id of the region is at least as big as orig log seq id
3073         while (true) {
3074           long seqId = getSequenceId().get();
3075           if (seqId >= replaySeqId) break;
3076           if (getSequenceId().compareAndSet(seqId, replaySeqId)) break;
3077         }
3078       }
3079       if (walEdit.size() > 0) {
3080         if (!isInReplay) {
3081           // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
3082           walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
3083               this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now,
3084               mutation.getClusterIds(), currentNonceGroup, currentNonce);
3085         }
3086 
3087         txid = this.wal.append(this.htableDescriptor, this.getRegionInfo(), walKey, walEdit,
3088           getSequenceId(), true, memstoreCells);
3089       }
3090       if(walKey == null){
3091         // Append a faked WALEdit in order for SKIP_WAL updates to get mvcc assigned
3092         walKey = this.appendEmptyEdit(this.wal, memstoreCells);
3093       }
3094 
3095       // -------------------------------
3096       // STEP 6. Release row locks, etc.
3097       // -------------------------------
3098       if (locked) {
3099         this.updatesLock.readLock().unlock();
3100         locked = false;
3101       }
3102       releaseRowLocks(acquiredRowLocks);
3103 
3104       // -------------------------
3105       // STEP 7. Sync wal.
3106       // -------------------------
3107       if (txid != 0) {
3108         syncOrDefer(txid, durability);
3109       }
3110 
3111       doRollBackMemstore = false;
3112       // calling the post CP hook for batch mutation
3113       if (!isInReplay && coprocessorHost != null) {
3114         MiniBatchOperationInProgress<Mutation> miniBatchOp =
3115           new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
3116           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
3117         coprocessorHost.postBatchMutate(miniBatchOp);
3118       }
3119 
3120 
3121       // ------------------------------------------------------------------
3122       // STEP 8. Advance mvcc. This will make these changes visible to scanners and getters.
3123       // ------------------------------------------------------------------
3124       if (w != null) {
3125         mvcc.completeMemstoreInsertWithSeqNum(w, walKey);
3126         w = null;
3127       }
3128 
3129       // ------------------------------------
3130       // STEP 9. Run coprocessor post hooks. This should be done after the wal is
3131       // synced so that the coprocessor contract is adhered to.
3132       // ------------------------------------
3133       if (!isInReplay && coprocessorHost != null) {
3134         for (int i = firstIndex; i < lastIndexExclusive; i++) {
3135           // only for successful operations
3136           if (batchOp.retCodeDetails[i].getOperationStatusCode()
3137               != OperationStatusCode.SUCCESS) {
3138             continue;
3139           }
3140           Mutation m = batchOp.getMutation(i);
3141           if (m instanceof Put) {
3142             coprocessorHost.postPut((Put) m, walEdit, m.getDurability());
3143           } else {
3144             coprocessorHost.postDelete((Delete) m, walEdit, m.getDurability());
3145           }
3146         }
3147       }
3148 
3149       success = true;
3150       return addedSize;
3151     } finally {
3152       // if the wal sync was unsuccessful, remove keys from memstore
3153       if (doRollBackMemstore) {
3154         rollbackMemstore(memstoreCells);
3155       }
3156       if (w != null) {
3157         mvcc.completeMemstoreInsertWithSeqNum(w, walKey);
3158       }
3159 
3160       if (locked) {
3161         this.updatesLock.readLock().unlock();
3162       }
3163       releaseRowLocks(acquiredRowLocks);
3164 
3165       // See if the column families were consistent through the whole thing.
3166       // If they were, keep them. If they were not, pass a null;
3167       // null will be treated as unknown.
3168       // The total time taken may involve both Puts and Deletes.
3169       // Split the time between puts and deletes based on the total number of Puts and Deletes.
3170 
3171       if (noOfPuts > 0) {
3172         // There were some Puts in the batch.
3173         if (this.metricsRegion != null) {
3174           this.metricsRegion.updatePut();
3175         }
3176       }
3177       if (noOfDeletes > 0) {
3178         // There were some Deletes in the batch.
3179         if (this.metricsRegion != null) {
3180           this.metricsRegion.updateDelete();
3181         }
3182       }
3183       if (!success) {
3184         for (int i = firstIndex; i < lastIndexExclusive; i++) {
3185           if (batchOp.retCodeDetails[i].getOperationStatusCode() == OperationStatusCode.NOT_RUN) {
3186             batchOp.retCodeDetails[i] = OperationStatus.FAILURE;
3187           }
3188         }
3189       }
3190       if (coprocessorHost != null && !batchOp.isInReplay()) {
3191         // call the coprocessor hook to do any finalization steps
3192         // after the put is done
3193         MiniBatchOperationInProgress<Mutation> miniBatchOp =
3194             new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
3195                 batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex,
3196                 lastIndexExclusive);
3197         coprocessorHost.postBatchMutateIndispensably(miniBatchOp, success);
3198       }
3199 
3200       batchOp.nextIndexToProcess = lastIndexExclusive;
3201     }
3202   }
3203 
3204   /**
3205    * Returns effective durability from the passed durability and
3206    * the table descriptor.
3207    */
3208   protected Durability getEffectiveDurability(Durability d) {
3209     return d == Durability.USE_DEFAULT ? this.durability : d;
3210   }
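  // For example, a mutation left at Durability.USE_DEFAULT inherits the table's configured
  // durability, while an explicit per-mutation setting such as SKIP_WAL takes precedence.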
3211 
3212   //TODO: gets/puts and deletes should be refactored a bit so that
3213   //the lock is obtained beforehand and simply passed into
3214   //the methods. In the case of checkAndMutate you could then just do lockRow,
3215   //get, put, unlockRow or something.
3216 
3217   @Override
3218   public boolean checkAndMutate(byte [] row, byte [] family, byte [] qualifier,
3219       CompareOp compareOp, ByteArrayComparable comparator, Mutation w,
3220       boolean writeToWAL)
3221   throws IOException{
3222     checkReadOnly();
3223     //TODO, add check for value length or maybe even better move this to the
3224     //client if this becomes a global setting
3225     checkResources();
3226     boolean isPut = w instanceof Put;
3227     if (!isPut && !(w instanceof Delete))
3228       throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action must " +
3229           "be Put or Delete");
3230     if (!Bytes.equals(row, w.getRow())) {
3231       throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action's " +
3232           "getRow must match the passed row");
3233     }
3234 
3235     startRegionOperation();
3236     try {
3237       Get get = new Get(row);
3238       checkFamily(family);
3239       get.addColumn(family, qualifier);
3240 
3241       // Lock row - note that doBatchMutate will relock this row if called
3242       RowLock rowLock = getRowLock(get.getRow());
3243       // wait for all previous transactions to complete (with lock held)
3244       mvcc.waitForPreviousTransactionsComplete();
3245       try {
3246         if (this.getCoprocessorHost() != null) {
3247           Boolean processed = null;
3248           if (w instanceof Put) {
3249             processed = this.getCoprocessorHost().preCheckAndPutAfterRowLock(row, family,
3250                 qualifier, compareOp, comparator, (Put) w);
3251           } else if (w instanceof Delete) {
3252             processed = this.getCoprocessorHost().preCheckAndDeleteAfterRowLock(row, family,
3253                 qualifier, compareOp, comparator, (Delete) w);
3254           }
3255           if (processed != null) {
3256             return processed;
3257           }
3258         }
3259         List<Cell> result = get(get, false);
3260 
3261         boolean valueIsNull = comparator.getValue() == null ||
3262           comparator.getValue().length == 0;
3263         boolean matches = false;
3264         if (result.size() == 0 && valueIsNull) {
3265           matches = true;
3266         } else if (result.size() > 0 && result.get(0).getValueLength() == 0 &&
3267             valueIsNull) {
3268           matches = true;
3269         } else if (result.size() == 1 && !valueIsNull) {
3270           Cell kv = result.get(0);
3271           int compareResult = comparator.compareTo(kv.getValueArray(),
3272               kv.getValueOffset(), kv.getValueLength());
3273           switch (compareOp) {
3274           case LESS:
3275             matches = compareResult < 0;
3276             break;
3277           case LESS_OR_EQUAL:
3278             matches = compareResult <= 0;
3279             break;
3280           case EQUAL:
3281             matches = compareResult == 0;
3282             break;
3283           case NOT_EQUAL:
3284             matches = compareResult != 0;
3285             break;
3286           case GREATER_OR_EQUAL:
3287             matches = compareResult >= 0;
3288             break;
3289           case GREATER:
3290             matches = compareResult > 0;
3291             break;
3292           default:
3293             throw new RuntimeException("Unknown Compare op " + compareOp.name());
3294           }
3295         }
3296         // If it matches, apply the new Put or Delete.
3297         if (matches) {
3298           // All edits for the given row (across all column families) must
3299           // happen atomically.
3300           doBatchMutate(w);
3301           this.checkAndMutateChecksPassed.increment();
3302           return true;
3303         }
3304         this.checkAndMutateChecksFailed.increment();
3305         return false;
3306       } finally {
3307         rowLock.release();
3308       }
3309     } finally {
3310       closeRegionOperation();
3311     }
3312   }
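  // Illustrative call (a sketch; CompareOp and BinaryComparator come from the filter package):
  //   boolean applied = region.checkAndMutate(row, family, qualifier, CompareOp.EQUAL,
  //       new BinaryComparator(expectedValue), put, true);
  //   // returns false when the stored value does not satisfy the comparison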
3313 
3314   //TODO: gets/puts and deletes should be refactored a bit so that
3315   //the lock is obtained beforehand and simply passed into
3316   //the methods. In the case of checkAndMutate you could then just do lockRow,
3317   //get, put, unlockRow or something.
3318 
3319   @Override
3320   public boolean checkAndRowMutate(byte [] row, byte [] family, byte [] qualifier,
3321       CompareOp compareOp, ByteArrayComparable comparator, RowMutations rm,
3322       boolean writeToWAL) throws IOException {
3323     checkReadOnly();
3324     //TODO, add check for value length or maybe even better move this to the
3325     //client if this becomes a global setting
3326     checkResources();
3327 
3328     startRegionOperation();
3329     try {
3330       Get get = new Get(row);
3331       checkFamily(family);
3332       get.addColumn(family, qualifier);
3333 
3334       // Lock row - note that doBatchMutate will relock this row if called
3335       RowLock rowLock = getRowLock(get.getRow());
3336       // wait for all previous transactions to complete (with lock held)
3337       mvcc.waitForPreviousTransactionsComplete();
3338       try {
3339         List<Cell> result = get(get, false);
3340 
3341         boolean valueIsNull = comparator.getValue() == null ||
3342             comparator.getValue().length == 0;
3343         boolean matches = false;
3344         if (result.size() == 0 && valueIsNull) {
3345           matches = true;
3346         } else if (result.size() > 0 && result.get(0).getValueLength() == 0 &&
3347             valueIsNull) {
3348           matches = true;
3349         } else if (result.size() == 1 && !valueIsNull) {
3350           Cell kv = result.get(0);
3351           int compareResult = comparator.compareTo(kv.getValueArray(),
3352               kv.getValueOffset(), kv.getValueLength());
3353           switch (compareOp) {
3354           case LESS:
3355             matches = compareResult < 0;
3356             break;
3357           case LESS_OR_EQUAL:
3358             matches = compareResult <= 0;
3359             break;
3360           case EQUAL:
3361             matches = compareResult == 0;
3362             break;
3363           case NOT_EQUAL:
3364             matches = compareResult != 0;
3365             break;
3366           case GREATER_OR_EQUAL:
3367             matches = compareResult >= 0;
3368             break;
3369           case GREATER:
3370             matches = compareResult > 0;
3371             break;
3372           default:
3373             throw new RuntimeException("Unknown Compare op " + compareOp.name());
3374           }
3375         }
3376         // If it matches, apply the given RowMutations.
3377         if (matches) {
3378           // All edits for the given row (across all column families) must
3379           // happen atomically.
3380           mutateRow(rm);
3381           this.checkAndMutateChecksPassed.increment();
3382           return true;
3383         }
3384         this.checkAndMutateChecksFailed.increment();
3385         return false;
3386       } finally {
3387         rowLock.release();
3388       }
3389     } finally {
3390       closeRegionOperation();
3391     }
3392   }
3393 
3394   private void doBatchMutate(Mutation mutation) throws IOException {
3395     // Currently this is only called for puts and deletes, so no nonces.
3396     OperationStatus[] batchMutate = this.batchMutate(new Mutation[] { mutation });
3397     if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.SANITY_CHECK_FAILURE)) {
3398       throw new FailedSanityCheckException(batchMutate[0].getExceptionMsg());
3399     } else if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.BAD_FAMILY)) {
3400       throw new NoSuchColumnFamilyException(batchMutate[0].getExceptionMsg());
3401     }
3402   }
3403 
3404   /**
3405    * Complete taking the snapshot on the region. Writes the region info and adds references to the
3406    * working snapshot directory.
3407    *
3408    * TODO for api consistency, consider adding another version with no {@link ForeignExceptionSnare}
3409    * arg.  (In the future other cancellable HRegion methods could eventually add a
3410    * {@link ForeignExceptionSnare}, or we could do something fancier).
3411    *
3412    * @param desc snapshot description object
3413    * @param exnSnare ForeignExceptionSnare that captures external exceptions in case we need to
3414    *   bail out.  This is allowed to be null and will just be ignored in that case.
3415    * @throws IOException if there is an external or internal error causing the snapshot to fail
3416    */
3417   public void addRegionToSnapshot(SnapshotDescription desc,
3418       ForeignExceptionSnare exnSnare) throws IOException {
3419     Path rootDir = FSUtils.getRootDir(conf);
3420     Path snapshotDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(desc, rootDir);
3421 
3422     SnapshotManifest manifest = SnapshotManifest.create(conf, getFilesystem(),
3423                                                         snapshotDir, desc, exnSnare);
3424     manifest.addRegion(this);
3425   }
3426 
3427   @Override
3428   public void updateCellTimestamps(final Iterable<List<Cell>> cellItr, final byte[] now)
3429       throws IOException {
3430     for (List<Cell> cells: cellItr) {
3431       if (cells == null) continue;
3432       assert cells instanceof RandomAccess;
3433       int listSize = cells.size();
3434       for (int i = 0; i < listSize; i++) {
3435         CellUtil.updateLatestStamp(cells.get(i), now, 0);
3436       }
3437     }
3438   }
3439 
3440   /**
3441    * Possibly rewrite incoming cell tags.
3442    */
3443   void rewriteCellTags(Map<byte[], List<Cell>> familyMap, final Mutation m) {
3444     // Check if we have any work to do and early out otherwise
3445     // Update these checks as more logic is added here
3446 
3447     if (m.getTTL() == Long.MAX_VALUE) {
3448       return;
3449     }
3450 
3451     // From this point we know we have some work to do
3452 
3453     for (Map.Entry<byte[], List<Cell>> e: familyMap.entrySet()) {
3454       List<Cell> cells = e.getValue();
3455       assert cells instanceof RandomAccess;
3456       int listSize = cells.size();
3457       for (int i = 0; i < listSize; i++) {
3458         Cell cell = cells.get(i);
3459         List<Tag> newTags = new ArrayList<Tag>();
3460         Iterator<Tag> tagIterator = CellUtil.tagsIterator(cell.getTagsArray(),
3461           cell.getTagsOffset(), cell.getTagsLength());
3462 
3463         // Carry forward existing tags
3464 
3465         while (tagIterator.hasNext()) {
3466 
3467           // Add any filters or tag specific rewrites here
3468 
3469           newTags.add(tagIterator.next());
3470         }
3471 
3472         // Cell TTL handling
3473 
3474         // Check again if we need to add a cell TTL because early out logic
3475         // above may change when there are more tag based features in core.
3476         if (m.getTTL() != Long.MAX_VALUE) {
3477           // Add a cell TTL tag
3478           newTags.add(new Tag(TagType.TTL_TAG_TYPE, Bytes.toBytes(m.getTTL())));
3479         }
3480 
3481         // Rewrite the cell with the updated set of tags
3482 
3483         cells.set(i, new KeyValue(cell.getRowArray(), cell.getRowOffset(), cell.getRowLength(),
3484           cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength(),
3485           cell.getQualifierArray(), cell.getQualifierOffset(), cell.getQualifierLength(),
3486           cell.getTimestamp(), KeyValue.Type.codeToType(cell.getTypeByte()),
3487           cell.getValueArray(), cell.getValueOffset(), cell.getValueLength(),
3488           newTags));
3489       }
3490     }
3491   }
3492 
3493   /*
3494    * Check if we have the resources to support an update.
3495    *
3496    * We throw RegionTooBusyException if we are above the memstore limit
3497    * and expect the client to retry using some kind of backoff.
3498    */
3499   private void checkResources() throws RegionTooBusyException {
3500     // If catalog region, do not impose resource constraints or block updates.
3501     if (this.getRegionInfo().isMetaRegion()) return;
3502 
3503     if (this.memstoreSize.get() > this.blockingMemStoreSize) {
3504       blockedRequestsCount.increment();
3505       requestFlush();
3506       throw new RegionTooBusyException("Above memstore limit, " +
3507           "regionName=" + (this.getRegionInfo() == null ? "unknown" :
3508           this.getRegionInfo().getRegionNameAsString()) +
3509           ", server=" + (this.getRegionServerServices() == null ? "unknown" :
3510           this.getRegionServerServices().getServerName()) +
3511           ", memstoreSize=" + memstoreSize.get() +
3512           ", blockingMemStoreSize=" + blockingMemStoreSize);
3513     }
3514   }
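  // A region above blockingMemStoreSize therefore both requests a flush and pushes the
  // retry back to the client via RegionTooBusyException.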
3515 
3516   /**
3517    * @throws IOException Throws exception if region is in read-only mode.
3518    */
3519   protected void checkReadOnly() throws IOException {
3520     if (isReadOnly()) {
3521       throw new DoNotRetryIOException("region is read only");
3522     }
3523   }
3524 
3525   protected void checkReadsEnabled() throws IOException {
3526     if (!this.writestate.readsEnabled) {
3527       throw new IOException(getRegionInfo().getEncodedName()
3528         + ": The region's reads are disabled. Cannot serve the request");
3529     }
3530   }
3531 
3532   public void setReadsEnabled(boolean readsEnabled) {
3533    if (readsEnabled && !this.writestate.readsEnabled) {
3534      LOG.info(getRegionInfo().getEncodedName() + " : Enabling reads for region.");
3535     }
3536     this.writestate.setReadsEnabled(readsEnabled);
3537   }
3538 
3539   /**
3540    * Add updates first to the wal and then add values to memstore.
3541    * Warning: Assumption is caller has lock on passed in row.
3542    * @param edits Cell updates by column
3543    * @throws IOException
3544    */
3545   private void put(final byte [] row, byte [] family, List<Cell> edits)
3546   throws IOException {
3547     NavigableMap<byte[], List<Cell>> familyMap;
3548     familyMap = new TreeMap<byte[], List<Cell>>(Bytes.BYTES_COMPARATOR);
3549 
3550     familyMap.put(family, edits);
3551     Put p = new Put(row);
3552     p.setFamilyCellMap(familyMap);
3553     doBatchMutate(p);
3554   }
3555 
3556   /**
3557    * Atomically apply the given map of family->edits to the memstore.
3558    * This handles the consistency control on its own, but the caller
3559    * should already have locked updatesLock.readLock(). This also does
3560    * <b>not</b> check the families for validity.
3561    *
3562    * @param familyMap Map of kvs per family
3563    * @param mvccNum The MVCC for this transaction.
3564    * @param isInReplay true when adding replayed KVs into memstore
3565    * @return the additional memory usage of the memstore caused by the
3566    * new entries.
3567    */
3568   private long applyFamilyMapToMemstore(Map<byte[], List<Cell>> familyMap,
3569     long mvccNum, List<Cell> memstoreCells, boolean isInReplay) throws IOException {
3570     long size = 0;
3571 
3572     for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
3573       byte[] family = e.getKey();
3574       List<Cell> cells = e.getValue();
3575       assert cells instanceof RandomAccess;
3576       Store store = getStore(family);
3577       int listSize = cells.size();
3578       for (int i=0; i < listSize; i++) {
3579         Cell cell = cells.get(i);
3580         CellUtil.setSequenceId(cell, mvccNum);
3581         Pair<Long, Cell> ret = store.add(cell);
3582         size += ret.getFirst();
3583         memstoreCells.add(ret.getSecond());
3584         if(isInReplay) {
3585           // set memstore newly added cells with replay mvcc number
3586           CellUtil.setSequenceId(ret.getSecond(), mvccNum);
3587         }
3588       }
3589     }
3590 
3591     return size;
3592   }
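  // The size returned here is accumulated by doMiniBatchMutation() and ultimately fed into
  // addAndGetGlobalMemstoreSize(); the cells collected in memstoreCells are what
  // rollbackMemstore() removes if the subsequent WAL append or sync fails.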
3593 
3594   /**
3595    * Remove all the cells in the given list from the memstore. This method is
3596    * called when a Put/Delete has updated the memstore but subsequently fails to
3597    * update the wal, and is used to roll the memstore back.
3598    */
3599   private void rollbackMemstore(List<Cell> memstoreCells) {
3600     int kvsRolledback = 0;
3601 
3602     for (Cell cell : memstoreCells) {
3603       byte[] family = CellUtil.cloneFamily(cell);
3604       Store store = getStore(family);
3605       store.rollback(cell);
3606       kvsRolledback++;
3607     }
3608     LOG.debug("rollbackMemstore rolled back " + kvsRolledback);
3609   }
3610 
3611   @Override
3612   public void checkFamilies(Collection<byte[]> families) throws NoSuchColumnFamilyException {
3613     for (byte[] family : families) {
3614       checkFamily(family);
3615     }
3616   }
3617 
3618   /**
3619    * During replay, there may be column families that were removed between the region
3620    * server failure and the replay.
3621    */
3622   private void removeNonExistentColumnFamilyForReplay(
3623       final Map<byte[], List<Cell>> familyMap) {
3624     List<byte[]> nonExistentList = null;
3625     for (byte[] family : familyMap.keySet()) {
3626       if (!this.htableDescriptor.hasFamily(family)) {
3627         if (nonExistentList == null) {
3628           nonExistentList = new ArrayList<byte[]>();
3629         }
3630         nonExistentList.add(family);
3631       }
3632     }
3633     if (nonExistentList != null) {
3634       for (byte[] family : nonExistentList) {
3635         // Perhaps schema was changed between crash and replay
3636         LOG.info("No family for " + Bytes.toString(family) + ", omitting from replay.");
3637         familyMap.remove(family);
3638       }
3639     }
3640   }
3641 
3642   @Override
3643   public void checkTimestamps(final Map<byte[], List<Cell>> familyMap, long now)
3644       throws FailedSanityCheckException {
3645     if (timestampSlop == HConstants.LATEST_TIMESTAMP) {
3646       return;
3647     }
3648     long maxTs = now + timestampSlop;
3649     for (List<Cell> kvs : familyMap.values()) {
3650       assert kvs instanceof RandomAccess;
3651       int listSize  = kvs.size();
3652       for (int i=0; i < listSize; i++) {
3653         Cell cell = kvs.get(i);
3654         // See if the user-supplied timestamp is out of range (LATEST_TIMESTAMP means the server-side time will be used)
3655         long ts = cell.getTimestamp();
3656         if (ts != HConstants.LATEST_TIMESTAMP && ts > maxTs) {
3657           throw new FailedSanityCheckException("Timestamp for KV out of range "
3658               + cell + " (too.new=" + timestampSlop + ")");
3659         }
3660       }
3661     }
3662   }
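  // For example, with a timestampSlop of 2000 ms a client-supplied timestamp more than two
  // seconds ahead of the server clock fails the sanity check; LATEST_TIMESTAMP is exempt.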
3663 
3664   /**
3665    * Append the given map of family->edits to a WALEdit data structure.
3666    * This does not write to the WAL itself.
3667    * @param familyMap map of family->edits
3668    * @param walEdit the destination entry to append into
3669    */
3670   private void addFamilyMapToWALEdit(Map<byte[], List<Cell>> familyMap,
3671       WALEdit walEdit) {
3672     for (List<Cell> edits : familyMap.values()) {
3673       assert edits instanceof RandomAccess;
3674       int listSize = edits.size();
3675       for (int i=0; i < listSize; i++) {
3676         Cell cell = edits.get(i);
3677         walEdit.add(cell);
3678       }
3679     }
3680   }
3681 
3682   private void requestFlush() {
3683     if (this.rsServices == null) {
3684       return;
3685     }
3686     synchronized (writestate) {
3687       if (this.writestate.isFlushRequested()) {
3688         return;
3689       }
3690       writestate.flushRequested = true;
3691     }
3692     // Make request outside of synchronize block; HBASE-818.
3693     this.rsServices.getFlushRequester().requestFlush(this, false);
3694     if (LOG.isDebugEnabled()) {
3695       LOG.debug("Flush requested on " + this);
3696     }
3697   }
3698 
3699   /*
3700    * @param size
3701    * @return True if size is over the flush threshold
3702    */
3703   private boolean isFlushSize(final long size) {
3704     return size > this.memstoreFlushSize;
3705   }
3706 
3707   /**
3708    * Read the edits put under this region by wal splitting process.  Put
3709    * the recovered edits back up into this region.
3710    *
3711    * <p>We can ignore any wal message that has a sequence ID that's equal to or
3712    * lower than minSeqId.  (Because we know such messages are already
3713    * reflected in the HFiles.)
3714    *
3715    * <p>While this is running we are putting pressure on memory yet we are
3716    * outside of our usual accounting because we are not yet an onlined region
3717    * (this stuff is being run as part of Region initialization).  This means
3718    * that if we're up against global memory limits, we'll not be flagged to flush
3719    * because we are not online. We can't be flushed by the usual mechanisms anyway;
3720    * we're not yet online so our relative sequenceids are not yet aligned with
3721    * WAL sequenceids -- not till we come up online, post processing of split
3722    * edits.
3723    *
3724    * <p>But to help relieve memory pressure, at least manage our own heap size by
3725    * flushing if we are in excess of per-region limits.  When flushing, though, we have
3726    * to be careful to avoid using the regionserver/wal sequenceid.  It runs
3727    * on a different line from what is going on here in this region context, so if we
3728    * crashed while replaying these edits, but in the midst had a flush that used the
3729    * regionserver wal with a sequenceid in excess of where this region and its
3730    * split editlogs are, then we could miss edits the
3731    * next time we go to recover. So, we have to flush inline, using seqids that
3732    * make sense in this single-region context only -- until we are online.
3733    *
3734    * @param maxSeqIdInStores Any edit found in the split editlogs needs to be in excess of
3735    * the maxSeqId for its store in order to be applied; otherwise it is skipped.
3736    * @return the sequence id of the last edit added to this region out of the
3737    * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
3738    * @throws UnsupportedEncodingException
3739    * @throws IOException
3740    */
3741   protected long replayRecoveredEditsIfAny(final Path regiondir,
3742       Map<byte[], Long> maxSeqIdInStores,
3743       final CancelableProgressable reporter, final MonitoredTask status)
3744       throws IOException {
3745     long minSeqIdForTheRegion = -1;
3746     for (Long maxSeqIdInStore : maxSeqIdInStores.values()) {
3747       if (maxSeqIdInStore < minSeqIdForTheRegion || minSeqIdForTheRegion == -1) {
3748         minSeqIdForTheRegion = maxSeqIdInStore;
3749       }
3750     }
3751     long seqid = minSeqIdForTheRegion;
3752 
3753     FileSystem fs = this.fs.getFileSystem();
3754     NavigableSet<Path> files = WALSplitter.getSplitEditFilesSorted(fs, regiondir);
3755     if (LOG.isDebugEnabled()) {
3756       LOG.debug("Found " + (files == null ? 0 : files.size())
3757         + " recovered edits file(s) under " + regiondir);
3758     }
3759 
3760     if (files == null || files.isEmpty()) return seqid;
3761 
3762     for (Path edits: files) {
3763       if (edits == null || !fs.exists(edits)) {
3764         LOG.warn("Null or non-existent edits file: " + edits);
3765         continue;
3766       }
3767       if (isZeroLengthThenDelete(fs, edits)) continue;
3768 
3769       long maxSeqId;
3770       String fileName = edits.getName();
3771       maxSeqId = Math.abs(Long.parseLong(fileName));
3772       if (maxSeqId <= minSeqIdForTheRegion) {
3773         if (LOG.isDebugEnabled()) {
3774           String msg = "Maximum sequenceid for this wal is " + maxSeqId
3775             + " and minimum sequenceid for the region is " + minSeqIdForTheRegion
3776             + ", skipped the whole file, path=" + edits;
3777           LOG.debug(msg);
3778         }
3779         continue;
3780       }
3781 
3782       try {
3783         // replay the edits. Replay can return -1 if everything is skipped, only update
3784         // if seqId is greater
3785         seqid = Math.max(seqid, replayRecoveredEdits(edits, maxSeqIdInStores, reporter));
3786       } catch (IOException e) {
3787         boolean skipErrors = conf.getBoolean(
3788             HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS,
3789             conf.getBoolean(
3790                 "hbase.skip.errors",
3791                 HConstants.DEFAULT_HREGION_EDITS_REPLAY_SKIP_ERRORS));
3792         if (conf.get("hbase.skip.errors") != null) {
3793           LOG.warn(
3794               "The property 'hbase.skip.errors' has been deprecated. Please use " +
3795               HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + " instead.");
3796         }
3797         if (skipErrors) {
3798           Path p = WALSplitter.moveAsideBadEditsFile(fs, edits);
3799           LOG.error(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS
3800               + "=true so continuing. Renamed " + edits +
3801               " as " + p, e);
3802         } else {
3803           throw e;
3804         }
3805       }
3806     }
3807     // The edits size added into rsAccounting during this replaying will not
3808     // be required any more. So just clear it.
3809     if (this.rsAccounting != null) {
3810       this.rsAccounting.clearRegionReplayEditsSize(getRegionInfo().getRegionName());
3811     }
3812     if (seqid > minSeqIdForTheRegion) {
3813       // Then we added some edits to memory. Flush and cleanup split edit files.
3814       internalFlushcache(null, seqid, stores.values(), status, false);
3815     }
3816     // Now delete the content of recovered edits.  We're done w/ them.
3817     if (files.size() > 0 && this.conf.getBoolean("hbase.region.archive.recovered.edits", false)) {
3818       // For debugging data loss issues!
3819       // If this flag is set, make use of the hfile archiving by making recovered.edits a fake
3820       // column family. Have to fake out file type too by casting our recovered.edits as storefiles
3821       String fakeFamilyName = WALSplitter.getRegionDirRecoveredEditsDir(regiondir).getName();
3822       Set<StoreFile> fakeStoreFiles = new HashSet<StoreFile>(files.size());
3823       for (Path file: files) {
3824         fakeStoreFiles.add(new StoreFile(getRegionFileSystem().getFileSystem(), file, this.conf,
3825           null, null));
3826       }
3827       getRegionFileSystem().removeStoreFiles(fakeFamilyName, fakeStoreFiles);
3828     } else {
3829       for (Path file: files) {
3830         if (!fs.delete(file, false)) {
3831           LOG.error("Failed delete of " + file);
3832         } else {
3833           LOG.debug("Deleted recovered.edits file=" + file);
3834         }
3835       }
3836     }
3837     return seqid;
3838   }
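  // Recovered-edits files are cleaned up in one of two ways above: archived as fake store
  // files when "hbase.region.archive.recovered.edits" is set (useful when debugging data
  // loss), or simply deleted.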
3839 
3840   /*
3841    * @param edits File of recovered edits.
3842    * @param maxSeqIdInStores Maximum sequenceid found in each store.  Edits in wal
3843    * must be larger than this to be replayed for each store.
3844    * @param reporter
3845    * @return the sequence id of the last edit added to this region out of the
3846    * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
3847    * @throws IOException
3848    */
3849   private long replayRecoveredEdits(final Path edits,
3850       Map<byte[], Long> maxSeqIdInStores, final CancelableProgressable reporter)
3851     throws IOException {
3852     String msg = "Replaying edits from " + edits;
3853     LOG.info(msg);
3854     MonitoredTask status = TaskMonitor.get().createStatus(msg);
3855     FileSystem fs = this.fs.getFileSystem();
3856 
3857     status.setStatus("Opening recovered edits");
3858     WAL.Reader reader = null;
3859     try {
3860       reader = WALFactory.createReader(fs, edits, conf);
3861       long currentEditSeqId = -1;
3862       long currentReplaySeqId = -1;
3863       long firstSeqIdInLog = -1;
3864       long skippedEdits = 0;
3865       long editsCount = 0;
3866       long intervalEdits = 0;
3867       WAL.Entry entry;
3868       Store store = null;
3869       boolean reported_once = false;
3870       ServerNonceManager ng = this.rsServices == null ? null : this.rsServices.getNonceManager();
3871 
3872       try {
3873         // How many edits seen before we check elapsed time
3874         int interval = this.conf.getInt("hbase.hstore.report.interval.edits", 2000);
3875         // How often to send a progress report (default 1/2 master timeout)
3876         int period = this.conf.getInt("hbase.hstore.report.period", 300000);
3877         long lastReport = EnvironmentEdgeManager.currentTime();
3878 
3879         while ((entry = reader.next()) != null) {
3880           WALKey key = entry.getKey();
3881           WALEdit val = entry.getEdit();
3882 
3883           if (ng != null) { // ng is null in some tests, or when nonces are disabled
3884             ng.reportOperationFromWal(key.getNonceGroup(), key.getNonce(), key.getWriteTime());
3885           }
3886 
3887           if (reporter != null) {
3888             intervalEdits += val.size();
3889             if (intervalEdits >= interval) {
3890               // Number of edits interval reached
3891               intervalEdits = 0;
3892               long cur = EnvironmentEdgeManager.currentTime();
3893               if (lastReport + period <= cur) {
3894                 status.setStatus("Replaying edits..." +
3895                     " skipped=" + skippedEdits +
3896                     " edits=" + editsCount);
3897                 // Timeout reached
3898                 if(!reporter.progress()) {
3899                   msg = "Progressable reporter failed, stopping replay";
3900                   LOG.warn(msg);
3901                   status.abort(msg);
3902                   throw new IOException(msg);
3903                 }
3904                 reported_once = true;
3905                 lastReport = cur;
3906               }
3907             }
3908           }
3909 
3910           if (firstSeqIdInLog == -1) {
3911             firstSeqIdInLog = key.getLogSeqNum();
3912           }
3913           if (currentEditSeqId > key.getLogSeqNum()) {
3914             // when this condition is true, it means we have a serious defect because we need to
3915             // maintain increasing SeqId for WAL edits per region
3916             LOG.error(getRegionInfo().getEncodedName() + " : "
3917                  + "Found decreasing SeqId. PreId=" + currentEditSeqId + " key=" + key
3918                 + "; edit=" + val);
3919           } else {
3920             currentEditSeqId = key.getLogSeqNum();
3921           }
3922           currentReplaySeqId = (key.getOrigLogSeqNum() > 0) ?
3923             key.getOrigLogSeqNum() : currentEditSeqId;
3924 
3925           // Start coprocessor replay here. The coprocessor is for each WALEdit
3926           // instead of a KeyValue.
3927           if (coprocessorHost != null) {
3928             status.setStatus("Running pre-WAL-restore hook in coprocessors");
3929             if (coprocessorHost.preWALRestore(this.getRegionInfo(), key, val)) {
3930               // if bypass this wal entry, ignore it ...
3931               continue;
3932             }
3933           }
3934           // Check this edit is for this region.
3935           if (!Bytes.equals(key.getEncodedRegionName(),
3936               this.getRegionInfo().getEncodedNameAsBytes())) {
3937             skippedEdits++;
3938             continue;
3939           }
3940 
3941           boolean flush = false;
3942           for (Cell cell: val.getCells()) {
3943             // Check this edit is for me. Also, guard against writing the special
3944             // METACOLUMN info such as HBASE::CACHEFLUSH entries
3945             if (CellUtil.matchingFamily(cell, WALEdit.METAFAMILY)) {
3946               //this is a special edit, we should handle it
3947               CompactionDescriptor compaction = WALEdit.getCompaction(cell);
3948               if (compaction != null) {
3949                 //replay the compaction
3950                 replayWALCompactionMarker(compaction, false, true, Long.MAX_VALUE);
3951               }
3952               skippedEdits++;
3953               continue;
3954             }
3955             // Figure which store the edit is meant for.
3956             if (store == null || !CellUtil.matchingFamily(cell, store.getFamily().getName())) {
3957               store = getStore(cell);
3958             }
3959             if (store == null) {
3960               // This should never happen.  Perhaps schema was changed between
3961               // crash and redeploy?
3962               LOG.warn("No family for " + cell);
3963               skippedEdits++;
3964               continue;
3965             }
3966             // Now, figure if we should skip this edit.
3967             if (key.getLogSeqNum() <= maxSeqIdInStores.get(store.getFamily()
3968                 .getName())) {
3969               skippedEdits++;
3970               continue;
3971             }
3972             CellUtil.setSequenceId(cell, currentReplaySeqId);
3973 
3974             // Once we are over the limit, restoreEdit will keep returning true to
3975             // flush -- but don't flush until we've played all the kvs that make up
3976             // the WALEdit.
3977             flush |= restoreEdit(store, cell);
3978             editsCount++;
3979           }
3980           if (flush) {
3981             internalFlushcache(null, currentEditSeqId, stores.values(), status, false);
3982           }
3983 
3984           if (coprocessorHost != null) {
3985             coprocessorHost.postWALRestore(this.getRegionInfo(), key, val);
3986           }
3987         }
3988       } catch (EOFException eof) {
3989         Path p = WALSplitter.moveAsideBadEditsFile(fs, edits);
3990         msg = "Encountered EOF. Most likely due to Master failure during " +
3991             "wal splitting, so we have this data in another edit.  " +
3992             "Continuing, but renaming " + edits + " as " + p;
3993         LOG.warn(msg, eof);
3994         status.abort(msg);
3995       } catch (IOException ioe) {
3996         // If the IOE resulted from bad file format,
3997         // then this problem is idempotent and retrying won't help
3998         if (ioe.getCause() instanceof ParseException) {
3999           Path p = WALSplitter.moveAsideBadEditsFile(fs, edits);
4000           msg = "File corruption encountered!  " +
4001               "Continuing, but renaming " + edits + " as " + p;
4002           LOG.warn(msg, ioe);
4003           status.setStatus(msg);
4004         } else {
4005           status.abort(StringUtils.stringifyException(ioe));
4006           // other IO errors may be transient (bad network connection,
4007           // checksum exception on one datanode, etc).  throw & retry
4008           throw ioe;
4009         }
4010       }
4011       if (reporter != null && !reported_once) {
4012         reporter.progress();
4013       }
4014       msg = "Applied " + editsCount + ", skipped " + skippedEdits +
4015         ", firstSequenceIdInLog=" + firstSeqIdInLog +
4016         ", maxSequenceIdInLog=" + currentEditSeqId + ", path=" + edits;
4017       status.markComplete(msg);
4018       LOG.debug(msg);
4019       return currentEditSeqId;
4020     } finally {
4021       status.cleanup();
4022       if (reader != null) {
4023          reader.close();
4024       }
4025     }
4026   }
4027 
4028   /**
4029    * Call to complete a compaction. Its for the case where we find in the WAL a compaction
4030    * that was not finished.  We could find one recovering a WAL after a regionserver crash.
4031    * See HBASE-2331.
4032    */
4033   void replayWALCompactionMarker(CompactionDescriptor compaction, boolean pickCompactionFiles,
4034       boolean removeFiles, long replaySeqId)
4035       throws IOException {
4036     checkTargetRegion(compaction.getEncodedRegionName().toByteArray(),
4037       "Compaction marker from WAL ", compaction);
4038 
4039     synchronized (writestate) {
4040       if (replaySeqId < lastReplayedOpenRegionSeqId) {
4041         LOG.warn(getRegionInfo().getEncodedName() + " : "
4042             + "Skipping replaying compaction event :" + TextFormat.shortDebugString(compaction)
4043             + " because its sequence id " + replaySeqId + " is smaller than this region's "
4044             + "lastReplayedOpenRegionSeqId of " + lastReplayedOpenRegionSeqId);
4045         return;
4046       }
4047       if (replaySeqId < lastReplayedCompactionSeqId) {
4048         LOG.warn(getRegionInfo().getEncodedName() + " : "
4049             + "Skipping replaying compaction event :" + TextFormat.shortDebugString(compaction)
4050             + " because its sequence id " + replaySeqId + " is smaller than this region's "
4051             + "lastReplayedCompactionSeqId of " + lastReplayedCompactionSeqId);
4052         return;
4053       } else {
4054         lastReplayedCompactionSeqId = replaySeqId;
4055       }
4056 
4057       if (LOG.isDebugEnabled()) {
4058         LOG.debug(getRegionInfo().getEncodedName() + " : "
4059             + "Replaying compaction marker " + TextFormat.shortDebugString(compaction)
4060             + " with seqId=" + replaySeqId + " and lastReplayedOpenRegionSeqId="
4061             + lastReplayedOpenRegionSeqId);
4062       }
4063 
4064       startRegionOperation(Operation.REPLAY_EVENT);
4065       try {
4066         Store store = this.getStore(compaction.getFamilyName().toByteArray());
4067         if (store == null) {
4068           LOG.warn(getRegionInfo().getEncodedName() + " : "
4069               + "Found Compaction WAL edit for deleted family:"
4070               + Bytes.toString(compaction.getFamilyName().toByteArray()));
4071           return;
4072         }
4073         store.replayCompactionMarker(compaction, pickCompactionFiles, removeFiles);
4074         logRegionFiles();
4075       } catch (FileNotFoundException ex) {
4076         LOG.warn(getRegionInfo().getEncodedName() + " : "
4077             + "At least one of the store files in compaction: "
4078             + TextFormat.shortDebugString(compaction)
4079             + " doesn't exist any more. Skip loading the file(s)", ex);
4080       } finally {
4081         closeRegionOperation(Operation.REPLAY_EVENT);
4082       }
4083     }
4084   }
4085 
4086   void replayWALFlushMarker(FlushDescriptor flush, long replaySeqId) throws IOException {
4087     checkTargetRegion(flush.getEncodedRegionName().toByteArray(),
4088       "Flush marker from WAL ", flush);
4089 
4090     if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4091       return; // if primary nothing to do
4092     }
4093 
4094     if (LOG.isDebugEnabled()) {
4095       LOG.debug(getRegionInfo().getEncodedName() + " : "
4096           + "Replaying flush marker " + TextFormat.shortDebugString(flush));
4097     }
4098 
4099     startRegionOperation(Operation.REPLAY_EVENT); // use region close lock to guard against close
4100     try {
4101       FlushAction action = flush.getAction();
4102       switch (action) {
4103       case START_FLUSH:
4104         replayWALFlushStartMarker(flush);
4105         break;
4106       case COMMIT_FLUSH:
4107         replayWALFlushCommitMarker(flush);
4108         break;
4109       case ABORT_FLUSH:
4110         replayWALFlushAbortMarker(flush);
4111         break;
4112       case CANNOT_FLUSH:
4113         replayWALFlushCannotFlushMarker(flush, replaySeqId);
4114         break;
4115       default:
4116         LOG.warn(getRegionInfo().getEncodedName() + " : " +
4117           "Received a flush event with unknown action, ignoring. " +
4118           TextFormat.shortDebugString(flush));
4119         break;
4120       }
4121 
4122       logRegionFiles();
4123     } finally {
4124       closeRegionOperation(Operation.REPLAY_EVENT);
4125     }
4126   }
4127 
4128   /** Replay the flush marker from primary region by creating a corresponding snapshot of
4129    * the store memstores, only if the memstores do not have a higher seqId from an earlier wal
4130    * edit (because the events may be coming out of order).
4131    */
4132   @VisibleForTesting
4133   PrepareFlushResult replayWALFlushStartMarker(FlushDescriptor flush) throws IOException {
4134     long flushSeqId = flush.getFlushSequenceNumber();
4135 
4136     HashSet<Store> storesToFlush = new HashSet<Store>();
4137     for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) {
4138       byte[] family = storeFlush.getFamilyName().toByteArray();
4139       Store store = getStore(family);
4140       if (store == null) {
4141         LOG.warn(getRegionInfo().getEncodedName() + " : "
4142           + "Received a flush start marker from primary, but the family is not found. Ignoring"
4143           + " StoreFlushDescriptor:" + TextFormat.shortDebugString(storeFlush));
4144         continue;
4145       }
4146       storesToFlush.add(store);
4147     }
4148 
4149     MonitoredTask status = TaskMonitor.get().createStatus("Preparing flush " + this);
4150 
4151     // we will use writestate as a coarse-grain lock for all the replay events
4152     // (flush, compaction, region open etc)
4153     synchronized (writestate) {
4154       try {
4155         if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) {
4156           LOG.warn(getRegionInfo().getEncodedName() + " : "
4157               + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
4158               + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
4159               + " of " + lastReplayedOpenRegionSeqId);
4160           return null;
4161         }
4162         if (numMutationsWithoutWAL.get() > 0) {
4163           numMutationsWithoutWAL.set(0);
4164           dataInMemoryWithoutWAL.set(0);
4165         }
4166 
4167         if (!writestate.flushing) {
4168           // we do not have an active snapshot or a corresponding this.prepareFlushResult. This means
4169           // we can just snapshot our memstores and continue as normal.
4170 
4171           // invoke internalPrepareFlushCache. Send null as wal since we do not want the flush events in wal
4172           PrepareFlushResult prepareResult = internalPrepareFlushCache(null,
4173             flushSeqId, storesToFlush, status, false);
4174           if (prepareResult.result == null) {
4175             // save the PrepareFlushResult so that we can use it later from commit flush
4176             this.writestate.flushing = true;
4177             this.prepareFlushResult = prepareResult;
4178             status.markComplete("Flush prepare successful");
4179             if (LOG.isDebugEnabled()) {
4180               LOG.debug(getRegionInfo().getEncodedName() + " : "
4181                   + " Prepared flush with seqId:" + flush.getFlushSequenceNumber());
4182             }
4183           } else {
4184             // special case empty memstore. We will still save the flush result in this case, since
4185             // our memstore is empty, but the primary is still flushing
4186             if (prepareResult.getResult().getResult() ==
4187                   FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) {
4188               this.writestate.flushing = true;
4189               this.prepareFlushResult = prepareResult;
4190               if (LOG.isDebugEnabled()) {
4191                 LOG.debug(getRegionInfo().getEncodedName() + " : "
4192                   + " Prepared empty flush with seqId:" + flush.getFlushSequenceNumber());
4193               }
4194             }
4195             status.abort("Flush prepare failed with " + prepareResult.result);
4196             // nothing much to do. prepare flush failed for some reason.
4197           }
4198           return prepareResult;
4199         } else {
4200           // we already have an active snapshot.
4201           if (flush.getFlushSequenceNumber() == this.prepareFlushResult.flushOpSeqId) {
4202             // They define the same flush. Log and continue.
4203             LOG.warn(getRegionInfo().getEncodedName() + " : "
4204                 + "Received a flush prepare marker with the same seqId: "
4205                 + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4206                 + prepareFlushResult.flushOpSeqId + ". Ignoring");
4207             // ignore
4208           } else if (flush.getFlushSequenceNumber() < this.prepareFlushResult.flushOpSeqId) {
4209             // We received a flush with a smaller seqNum than what we have prepared. We can only
4210             // ignore this prepare flush request.
4211             LOG.warn(getRegionInfo().getEncodedName() + " : "
4212                 + "Received a flush prepare marker with a smaller seqId: "
4213                 + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4214                 + prepareFlushResult.flushOpSeqId + ". Ignoring");
4215             // ignore
4216           } else {
4217             // We received a flush with a larger seqNum than what we have prepared
4218             LOG.warn(getRegionInfo().getEncodedName() + " : "
4219                 + "Received a flush prepare marker with a larger seqId: "
4220                 + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4221                 + prepareFlushResult.flushOpSeqId + ". Ignoring");
4222             // We do not have multiple active snapshots in the memstore or a way to merge current
4223             // memstore snapshot with the contents and resnapshot for now. We cannot take
4224             // another snapshot and drop the previous one because that will cause temporary
4225             // data loss in the secondary. So we ignore this for now, deferring the resolution
4226             // to happen when we see the corresponding flush commit marker. If we have a memstore
4227             // snapshot with x, and later received another prepare snapshot with y (where x < y),
4228             // when we see flush commit for y, we will drop snapshot for x, and can also drop all
4229             // the memstore edits if everything in memstore is < y. This is the usual case for
4230             // RS crash + recovery where we might see consecutive prepare flush wal markers.
4231             // Otherwise, this will cause more memory to be used in the secondary replica until a
4232             // further prepare + commit flush is seen and replayed.
4233           }
4234         }
4235       } finally {
4236         status.cleanup();
4237         writestate.notifyAll();
4238       }
4239     }
4240     return null;
4241   }
4242 
4243   @VisibleForTesting
4244   void replayWALFlushCommitMarker(FlushDescriptor flush) throws IOException {
4245     MonitoredTask status = TaskMonitor.get().createStatus("Committing flush " + this);
4246 
4247     // check whether we have the memstore snapshot with the corresponding seqId. Replay to
4248     // secondary region replicas are in order, except for when the region moves or when the
4249     // region server crashes. In those cases, we may receive replay requests out of order from
4250     // the original seqIds.
4251     synchronized (writestate) {
4252       try {
4253         if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) {
4254           LOG.warn(getRegionInfo().getEncodedName() + " : "
4255             + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
4256             + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
4257             + " of " + lastReplayedOpenRegionSeqId);
4258           return;
4259         }
4260 
4261         if (writestate.flushing) {
4262           PrepareFlushResult prepareFlushResult = this.prepareFlushResult;
4263           if (flush.getFlushSequenceNumber() == prepareFlushResult.flushOpSeqId) {
4264             if (LOG.isDebugEnabled()) {
4265               LOG.debug(getRegionInfo().getEncodedName() + " : "
4266                   + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber()
4267                   + " and a previous prepared snapshot was found");
4268             }
4269             // This is the regular case where we received commit flush after prepare flush
4270             // corresponding to the same seqId.
4271             replayFlushInStores(flush, prepareFlushResult, true);
4272 
4273             // Set down the memstore size by amount of flush.
4274             this.addAndGetGlobalMemstoreSize(-prepareFlushResult.totalFlushableSize);
4275 
4276             this.prepareFlushResult = null;
4277             writestate.flushing = false;
4278           } else if (flush.getFlushSequenceNumber() < prepareFlushResult.flushOpSeqId) {
4279             // This should not happen normally. However, let's be safe and guard against these cases
4280             // we received a flush commit with a smaller seqId than what we have prepared
4281             // we will pick the flush file up from this commit (if we have not seen it), but we
4282             // will not drop the memstore
4283             LOG.warn(getRegionInfo().getEncodedName() + " : "
4284                 + "Received a flush commit marker with smaller seqId: "
4285                 + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: "
4286                 + prepareFlushResult.flushOpSeqId + ". Picking up new file, but not dropping"
4287                 + " prepared memstore snapshot");
4288             replayFlushInStores(flush, prepareFlushResult, false);
4289 
4290             // snapshot is not dropped, so memstore sizes should not be decremented
4291             // we still have the prepared snapshot, flushing should still be true
4292           } else {
4293             // This should not happen normally. However, let's be safe and guard against these cases
4294             // we received a flush commit with a larger seqId than what we have prepared
4295             // we will pick the flush file for this. We will also obtain the updates lock and
4296             // look for contents of the memstore to see whether we have edits after this seqId.
4297             // If not, we will drop all the memstore edits and the snapshot as well.
4298             LOG.warn(getRegionInfo().getEncodedName() + " : "
4299                 + "Received a flush commit marker with larger seqId: "
4300                 + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: " +
4301                 prepareFlushResult.flushOpSeqId + ". Picking up new file and dropping prepared"
4302                 +" memstore snapshot");
4303 
4304             replayFlushInStores(flush, prepareFlushResult, true);
4305 
4306             // Set down the memstore size by amount of flush.
4307             this.addAndGetGlobalMemstoreSize(-prepareFlushResult.totalFlushableSize);
4308 
4309             // Inspect the memstore contents to see whether the memstore contains only edits
4310             // with seqId smaller than the flush seqId. If so, we can discard those edits.
4311             dropMemstoreContentsForSeqId(flush.getFlushSequenceNumber(), null);
4312 
4313             this.prepareFlushResult = null;
4314             writestate.flushing = false;
4315           }
4316           // If we were waiting to observe a flush or region open event before serving reads (so
4317           // that we do not show partial data after a secondary region crash), we can allow reads
4318           // now. We can only be sure that we are not showing partial data (for example skipping
4319           // some previous edits) once we observe a full flush start and flush commit. So if we
4320           // were not able to find a previous flush we will not enable reads now.
4321           this.setReadsEnabled(true);
4322         } else {
4323           LOG.warn(getRegionInfo().getEncodedName() + " : "
4324               + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber()
4325               + ", but no previous prepared snapshot was found");
4326           // There is no corresponding prepare snapshot from before.
4327           // We will pick up the new flushed file
4328           replayFlushInStores(flush, null, false);
4329 
4330           // Inspect the memstore contents to see whether the memstore contains only edits
4331           // with seqId smaller than the flush seqId. If so, we can discard those edits.
4332           dropMemstoreContentsForSeqId(flush.getFlushSequenceNumber(), null);
4333         }
4334 
4335         status.markComplete("Flush commit successful");
4336 
4337         // Update the last flushed sequence id for region.
4338         this.maxFlushedSeqId = flush.getFlushSequenceNumber();
4339 
4340         // advance the mvcc read point so that the new flushed file is visible.
4341         // there may be some in-flight transactions, but they won't be made visible since they are
4342         // either greater than flush seq number or they were already dropped via flush.
4343         // TODO: If we are using FlushAllStoresPolicy, then this can make edits visible from other
4344         // stores while they are still in flight because the flush commit marker will not contain
4345         // flushes from ALL stores.
4346         getMVCC().advanceMemstoreReadPointIfNeeded(flush.getFlushSequenceNumber());
4347 
4348       } catch (FileNotFoundException ex) {
4349         LOG.warn(getRegionInfo().getEncodedName() + " : "
4350             + "At least one of the store files in flush: " + TextFormat.shortDebugString(flush)
4351             + " doesn't exist any more. Skip loading the file(s)", ex);
4352       }
4353       finally {
4354         status.cleanup();
4355         writestate.notifyAll();
4356       }
4357     }
4358 
4359     // C. Finally notify anyone waiting on memstore to clear:
4360     // e.g. checkResources().
4361     synchronized (this) {
4362       notifyAll(); // FindBugs NN_NAKED_NOTIFY
4363     }
4364   }
4365 
4366   /**
4367    * Replays the given flush descriptor by opening the flush files in stores and dropping the
4368    * memstore snapshots if requested.
4369    * @param flush the flush commit descriptor received from the primary region
4370    * @param prepareFlushResult the result of a previously replayed flush prepare, or null if none
4371    * @param dropMemstoreSnapshot whether the prepared memstore snapshot should be dropped
4372    * @throws IOException
4373    */
4374   private void replayFlushInStores(FlushDescriptor flush, PrepareFlushResult prepareFlushResult,
4375       boolean dropMemstoreSnapshot)
4376       throws IOException {
4377     for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) {
4378       byte[] family = storeFlush.getFamilyName().toByteArray();
4379       Store store = getStore(family);
4380       if (store == null) {
4381         LOG.warn(getRegionInfo().getEncodedName() + " : "
4382             + "Received a flush commit marker from primary, but the family is not found. "
4383             + "Ignoring StoreFlushDescriptor:" + storeFlush);
4384         continue;
4385       }
4386       List<String> flushFiles = storeFlush.getFlushOutputList();
4387       StoreFlushContext ctx = null;
4388       long startTime = EnvironmentEdgeManager.currentTime();
4389       if (prepareFlushResult == null || prepareFlushResult.storeFlushCtxs == null) {
4390         ctx = store.createFlushContext(flush.getFlushSequenceNumber());
4391       } else {
4392         ctx = prepareFlushResult.storeFlushCtxs.get(family);
4393         startTime = prepareFlushResult.startTime;
4394       }
4395 
4396       if (ctx == null) {
4397         LOG.warn(getRegionInfo().getEncodedName() + " : "
4398             + "Unexpected: flush commit marker received from store "
4399             + Bytes.toString(family) + " but no associated flush context. Ignoring");
4400         continue;
4401       }
4402 
4403       ctx.replayFlush(flushFiles, dropMemstoreSnapshot); // replay the flush
4404 
4405       // Record latest flush time
4406       this.lastStoreFlushTimeMap.put(store, startTime);
4407     }
4408   }
4409 
4410   /**
4411    * Drops the memstore contents after replaying a flush descriptor or region open event
4412    * if the memstore edits have seqNums smaller than the given seq id
4413    * @param seqId the flush or region open sequence id up to which memstore contents may be dropped
        * @param store the store to drop contents for, or null to consider every store in the region
4414    * @throws IOException
4415    */
4416   private long dropMemstoreContentsForSeqId(long seqId, Store store) throws IOException {
4417     long totalFreedSize = 0;
4418     this.updatesLock.writeLock().lock();
4419     try {
4420       mvcc.waitForPreviousTransactionsComplete();
4421       long currentSeqId = getSequenceId().get();
4422       if (seqId >= currentSeqId) {
4423         // then we can drop the memstore contents since everything is below this seqId
4424         LOG.info(getRegionInfo().getEncodedName() + " : "
4425             + "Dropping memstore contents as well since replayed flush seqId: "
4426             + seqId + " is greater than current seqId:" + currentSeqId);
4427 
4428         // Prepare flush (take a snapshot) and then abort (drop the snapshot)
4429         if (store == null ) {
4430           for (Store s : stores.values()) {
4431             totalFreedSize += doDropStoreMemstoreContentsForSeqId(s, currentSeqId);
4432           }
4433         } else {
4434           totalFreedSize += doDropStoreMemstoreContentsForSeqId(store, currentSeqId);
4435         }
4436       } else {
4437         LOG.info(getRegionInfo().getEncodedName() + " : "
4438             + "Not dropping memstore contents since replayed flush seqId: "
4439             + seqId + " is smaller than current seqId:" + currentSeqId);
4440       }
4441     } finally {
4442       this.updatesLock.writeLock().unlock();
4443     }
4444     return totalFreedSize;
4445   }
4446 
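       /**
        * Discards the current memstore contents of the given store by preparing a flush snapshot
        * and immediately aborting it; the store's flushable size is subtracted from the global
        * memstore size and returned as the amount of heap freed.
        */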
4447   private long doDropStoreMemstoreContentsForSeqId(Store s, long currentSeqId) throws IOException {
4448     long snapshotSize = s.getFlushableSize();
4449     this.addAndGetGlobalMemstoreSize(-snapshotSize);
4450     StoreFlushContext ctx = s.createFlushContext(currentSeqId);
4451     ctx.prepare();
4452     ctx.abort();
4453     return snapshotSize;
4454   }
4455 
4456   private void replayWALFlushAbortMarker(FlushDescriptor flush) {
4457     // nothing to do for now. A flush abort will cause a RS abort which means that the region
4458     // will be opened somewhere else later. We will see the region open event soon, and replaying
4459     // that will drop the snapshot
4460   }
4461 
4462   private void replayWALFlushCannotFlushMarker(FlushDescriptor flush, long replaySeqId) {
4463     synchronized (writestate) {
4464       if (this.lastReplayedOpenRegionSeqId > replaySeqId) {
4465         LOG.warn(getRegionInfo().getEncodedName() + " : "
4466           + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
4467           + " because its sequence id " + replaySeqId + " is smaller than this region's "
4468           + "lastReplayedOpenRegionSeqId of " + lastReplayedOpenRegionSeqId);
4469         return;
4470       }
4471 
4472       // If we were waiting to observe a flush or region open event before serving reads (to avoid
4473       // showing partial data after a secondary region crash), we can allow reads now. This event
4474       // means that the primary was not able to flush because its memstore was empty when we
4475       // requested the flush. By the time we observe this, we are guaranteed to have an up to date
4476       // seqId with our previous assignment.
4477       this.setReadsEnabled(true);
4478     }
4479   }
4480 
4481   @VisibleForTesting
4482   PrepareFlushResult getPrepareFlushResult() {
4483     return prepareFlushResult;
4484   }
4485 
4486   void replayWALRegionEventMarker(RegionEventDescriptor regionEvent) throws IOException {
4487     checkTargetRegion(regionEvent.getEncodedRegionName().toByteArray(),
4488       "RegionEvent marker from WAL ", regionEvent);
4489 
4490     startRegionOperation(Operation.REPLAY_EVENT);
4491     try {
4492       if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4493         return; // if primary nothing to do
4494       }
4495 
4496       if (regionEvent.getEventType() == EventType.REGION_CLOSE) {
4497         // nothing to do on REGION_CLOSE for now.
4498         return;
4499       }
4500       if (regionEvent.getEventType() != EventType.REGION_OPEN) {
4501         LOG.warn(getRegionInfo().getEncodedName() + " : "
4502             + "Unknown region event received, ignoring :"
4503             + TextFormat.shortDebugString(regionEvent));
4504         return;
4505       }
4506 
4507       if (LOG.isDebugEnabled()) {
4508         LOG.debug(getRegionInfo().getEncodedName() + " : "
4509           + "Replaying region open event marker " + TextFormat.shortDebugString(regionEvent));
4510       }
4511 
4512       // we will use writestate as a coarse-grain lock for all the replay events
4513       synchronized (writestate) {
4514         // Replication can deliver events out of order when primary region moves or the region
4515         // server crashes, since there is no coordination between replication of different wal files
4516         // belonging to different region servers. We have to safeguard against this case by using
4517         // region open event's seqid. Since this is the first event that the region puts (after
4518         // possibly flushing recovered.edits), after seeing this event, we can ignore every edit
4519         // smaller than this seqId
4520         if (this.lastReplayedOpenRegionSeqId <= regionEvent.getLogSequenceNumber()) {
4521           this.lastReplayedOpenRegionSeqId = regionEvent.getLogSequenceNumber();
4522         } else {
4523           LOG.warn(getRegionInfo().getEncodedName() + " : "
4524             + "Skipping replaying region event :" + TextFormat.shortDebugString(regionEvent)
4525             + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
4526             + " of " + lastReplayedOpenRegionSeqId);
4527           return;
4528         }
4529 
4530         // region open lists all the files that the region has at the time of the opening. Just pick
4531         // all the files and drop prepared flushes and empty memstores
4532         for (StoreDescriptor storeDescriptor : regionEvent.getStoresList()) {
4533           // stores of primary may be different now
4534           byte[] family = storeDescriptor.getFamilyName().toByteArray();
4535           Store store = getStore(family);
4536           if (store == null) {
4537             LOG.warn(getRegionInfo().getEncodedName() + " : "
4538                 + "Received a region open marker from primary, but the family is not found. "
4539                 + "Ignoring. StoreDescriptor:" + storeDescriptor);
4540             continue;
4541           }
4542 
4543           long storeSeqId = store.getMaxSequenceId();
4544           List<String> storeFiles = storeDescriptor.getStoreFileList();
4545           try {
4546             store.refreshStoreFiles(storeFiles); // replace the files with the new ones
4547           } catch (FileNotFoundException ex) {
4548             LOG.warn(getRegionInfo().getEncodedName() + " : "
4549                     + "At least one of the store files: " + storeFiles
4550                     + " doesn't exist any more. Skip loading the file(s)", ex);
4551             continue;
4552           }
4553           if (store.getMaxSequenceId() != storeSeqId) {
4554             // Record latest flush time if we picked up new files
4555             lastStoreFlushTimeMap.put(store, EnvironmentEdgeManager.currentTime());
4556           }
4557 
4558           if (writestate.flushing) {
4559             // only drop memstore snapshots if they are smaller than last flush for the store
4560             if (this.prepareFlushResult.flushOpSeqId <= regionEvent.getLogSequenceNumber()) {
4561               StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null ?
4562                   null : this.prepareFlushResult.storeFlushCtxs.get(family);
4563               if (ctx != null) {
4564                 long snapshotSize = store.getFlushableSize();
4565                 ctx.abort();
4566                 this.addAndGetGlobalMemstoreSize(-snapshotSize);
4567                 this.prepareFlushResult.storeFlushCtxs.remove(family);
4568               }
4569             }
4570           }
4571 
4572           // Drop the memstore contents if they are now smaller than the latest seen flushed file
4573           dropMemstoreContentsForSeqId(regionEvent.getLogSequenceNumber(), store);
4574           if (storeSeqId > this.maxFlushedSeqId) {
4575             this.maxFlushedSeqId = storeSeqId;
4576           }
4577         }
4578 
4579         // if all stores ended up dropping their snapshots, we can safely drop the
4580         // prepareFlushResult
4581         dropPrepareFlushIfPossible();
4582 
4583         // advance the mvcc read point so that the new flushed file is visible.
4584         // there may be some in-flight transactions, but they won't be made visible since they are
4585         // either greater than flush seq number or they were already dropped via flush.
4586         getMVCC().advanceMemstoreReadPointIfNeeded(this.maxFlushedSeqId);
4587 
4588         // If we were waiting to observe a flush or region open event before serving reads (to
4589         // avoid showing partial data after a secondary region crash), we can allow reads now.
4590         this.setReadsEnabled(true);
4591 
4592         // C. Finally notify anyone waiting on memstore to clear:
4593         // e.g. checkResources().
4594         synchronized (this) {
4595           notifyAll(); // FindBugs NN_NAKED_NOTIFY
4596         }
4597       }
4598       logRegionFiles();
4599     } finally {
4600       closeRegionOperation(Operation.REPLAY_EVENT);
4601     }
4602   }
4603 
4604   void replayWALBulkLoadEventMarker(WALProtos.BulkLoadDescriptor bulkLoadEvent) throws IOException {
4605     checkTargetRegion(bulkLoadEvent.getEncodedRegionName().toByteArray(),
4606       "BulkLoad marker from WAL ", bulkLoadEvent);
4607 
4608     if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4609       return; // if primary nothing to do
4610     }
4611 
4612     if (LOG.isDebugEnabled()) {
4613       LOG.debug(getRegionInfo().getEncodedName() + " : "
4614               +  "Replaying bulkload event marker " + TextFormat.shortDebugString(bulkLoadEvent));
4615     }
4616     // check if multiple families involved
4617     boolean multipleFamilies = false;
4618     byte[] family = null;
4619     for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) {
4620       byte[] fam = storeDescriptor.getFamilyName().toByteArray();
4621       if (family == null) {
4622         family = fam;
4623       } else if (!Bytes.equals(family, fam)) {
4624         multipleFamilies = true;
4625         break;
4626       }
4627     }
4628 
4629     startBulkRegionOperation(multipleFamilies);
4630     try {
4631       // we will use writestate as a coarse-grain lock for all the replay events
4632       synchronized (writestate) {
4633         // Replication can deliver events out of order when primary region moves or the region
4634         // server crashes, since there is no coordination between replication of different wal files
4635         // belonging to different region servers. We have to safeguard against this case by using
4636         // region open event's seqid. Since this is the first event that the region puts (after
4637         // possibly flushing recovered.edits), after seeing this event, we can ignore every edit
4638         // smaller than this seqId
4639         if (bulkLoadEvent.getBulkloadSeqNum() >= 0
4640             && this.lastReplayedOpenRegionSeqId >= bulkLoadEvent.getBulkloadSeqNum()) {
4641           LOG.warn(getRegionInfo().getEncodedName() + " : "
4642               + "Skipping replaying bulkload event :"
4643               + TextFormat.shortDebugString(bulkLoadEvent)
4644               + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
4645               + " =" + lastReplayedOpenRegionSeqId);
4646 
4647           return;
4648         }
4649 
4650         for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) {
4651           // stores of primary may be different now
4652           family = storeDescriptor.getFamilyName().toByteArray();
4653           Store store = getStore(family);
4654           if (store == null) {
4655             LOG.warn(getRegionInfo().getEncodedName() + " : "
4656                     + "Received a bulk load marker from primary, but the family is not found. "
4657                     + "Ignoring. StoreDescriptor:" + storeDescriptor);
4658             continue;
4659           }
4660 
4661           List<String> storeFiles = storeDescriptor.getStoreFileList();
4662           for (String storeFile : storeFiles) {
4663             StoreFileInfo storeFileInfo = null;
4664             try {
4665               storeFileInfo = fs.getStoreFileInfo(Bytes.toString(family), storeFile);
4666               store.bulkLoadHFile(storeFileInfo);
4667             } catch(FileNotFoundException ex) {
4668               LOG.warn(getRegionInfo().getEncodedName() + " : "
4669                       + ((storeFileInfo != null) ? storeFileInfo.toString() :
4670                             (new Path(Bytes.toString(family), storeFile)).toString())
4671                       + " doesn't exist any more. Skip loading the file");
4672             }
4673           }
4674         }
4675       }
4676       if (bulkLoadEvent.getBulkloadSeqNum() > 0) {
4677         getMVCC().advanceMemstoreReadPointIfNeeded(bulkLoadEvent.getBulkloadSeqNum());
4678       }
4679     } finally {
4680       closeBulkRegionOperation();
4681     }
4682   }
4683 
4684   /**
4685    * If all stores ended up dropping their snapshots, we can safely drop the prepareFlushResult
4686    */
4687   private void dropPrepareFlushIfPossible() {
4688     if (writestate.flushing) {
4689       boolean canDrop = true;
4690       if (prepareFlushResult.storeFlushCtxs != null) {
4691         for (Entry<byte[], StoreFlushContext> entry
4692             : prepareFlushResult.storeFlushCtxs.entrySet()) {
4693           Store store = getStore(entry.getKey());
4694           if (store == null) {
4695             continue;
4696           }
4697           if (store.getSnapshotSize() > 0) {
4698             canDrop = false;
4699             break;
4700           }
4701         }
4702       }
4703 
4704       // this means that all the stores in the region have finished flushing, but the WAL marker
4705       // may not have been written or we did not receive it yet.
4706       if (canDrop) {
4707         writestate.flushing = false;
4708         this.prepareFlushResult = null;
4709       }
4710     }
4711   }
4712 
4713   @Override
4714   public boolean refreshStoreFiles() throws IOException {
4715     if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4716       return false; // if primary nothing to do
4717     }
4718 
4719     if (LOG.isDebugEnabled()) {
4720       LOG.debug(getRegionInfo().getEncodedName() + " : "
4721           + "Refreshing store files to see whether we can free up memstore");
4722     }
4723 
4724     long totalFreedSize = 0;
4725 
4726     long smallestSeqIdInStores = Long.MAX_VALUE;
4727 
4728     startRegionOperation(); // obtain region close lock
4729     try {
4730       synchronized (writestate) {
4731         for (Store store : getStores()) {
4732           // TODO: some stores might see new data from flush, while others do not which
4733           // MIGHT break atomic edits across column families.
4734           long maxSeqIdBefore = store.getMaxSequenceId();
4735 
4736           // refresh the store files. This is similar to observing a region open wal marker.
4737           store.refreshStoreFiles();
4738 
4739           long storeSeqId = store.getMaxSequenceId();
4740           if (storeSeqId < smallestSeqIdInStores) {
4741             smallestSeqIdInStores = storeSeqId;
4742           }
4743 
4744           // see whether we can drop the memstore or the snapshot
4745           if (storeSeqId > maxSeqIdBefore) {
4746 
4747             if (writestate.flushing) {
4748               // only drop memstore snapshots if they are smaller than last flush for the store
4749               if (this.prepareFlushResult.flushOpSeqId <= storeSeqId) {
4750                 StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null ?
4751                     null : this.prepareFlushResult.storeFlushCtxs.get(store.getFamily().getName());
4752                 if (ctx != null) {
4753                   long snapshotSize = store.getFlushableSize();
4754                   ctx.abort();
4755                   this.addAndGetGlobalMemstoreSize(-snapshotSize);
4756                   this.prepareFlushResult.storeFlushCtxs.remove(store.getFamily().getName());
4757                   totalFreedSize += snapshotSize;
4758                 }
4759               }
4760             }
4761 
4762             // Drop the memstore contents if they are now smaller than the latest seen flushed file
4763             totalFreedSize += dropMemstoreContentsForSeqId(storeSeqId, store);
4764           }
4765         }
4766 
4767         // if all stores ended up dropping their snapshots, we can safely drop the
4768         // prepareFlushResult
4769         dropPrepareFlushIfPossible();
4770 
4771         // advance the mvcc read point so that the new flushed files are visible.
4772         // there may be some in-flight transactions, but they won't be made visible since they are
4773         // either greater than flush seq number or they were already picked up via flush.
4774         for (Store s : getStores()) {
4775           getMVCC().advanceMemstoreReadPointIfNeeded(s.getMaxMemstoreTS());
4776         }
4777 
4778         // smallestSeqIdInStores is the seqId that we have a corresponding hfile for. We can safely
4779         // skip any edit replayed in the future that has a smaller seqId than this. We are
4780         // updating lastReplayedOpenRegionSeqId so that we can skip all edits for which we have
4781         // already picked up the flush files
4782         if (this.lastReplayedOpenRegionSeqId < smallestSeqIdInStores) {
4783           this.lastReplayedOpenRegionSeqId = smallestSeqIdInStores;
4784         }
4785       }
4786       // C. Finally notify anyone waiting on memstore to clear:
4787       // e.g. checkResources().
4788       synchronized (this) {
4789         notifyAll(); // FindBugs NN_NAKED_NOTIFY
4790       }
4791       return totalFreedSize > 0;
4792     } finally {
4793       closeRegionOperation();
4794     }
4795   }
4796 
4797   private void logRegionFiles() {
4798     if (LOG.isTraceEnabled()) {
4799       LOG.trace(getRegionInfo().getEncodedName() + " : Store files for region: ");
4800       for (Store s : stores.values()) {
4801         for (StoreFile sf : s.getStorefiles()) {
4802           LOG.trace(getRegionInfo().getEncodedName() + " : " + sf);
4803         }
4804       }
4805     }
4806   }
4807 
4808   /** Checks whether the given regionName is either equal to our region's name, or, for a
4809    * secondary replica, equal to the name of the corresponding primary region.
4810    */
4811   private void checkTargetRegion(byte[] encodedRegionName, String exceptionMsg, Object payload)
4812       throws WrongRegionException {
4813     if (Bytes.equals(this.getRegionInfo().getEncodedNameAsBytes(), encodedRegionName)) {
4814       return;
4815     }
4816 
4817     if (!RegionReplicaUtil.isDefaultReplica(this.getRegionInfo()) &&
4818         Bytes.equals(encodedRegionName,
4819           this.fs.getRegionInfoForFS().getEncodedNameAsBytes())) {
4820       return;
4821     }
4822 
4823     throw new WrongRegionException(exceptionMsg + payload
4824       + " targetted for region " + Bytes.toStringBinary(encodedRegionName)
4825       + " does not match this region: " + this.getRegionInfo());
4826   }
4827 
4828   /**
4829    * Used by tests
4830    * @param s Store to add edit to.
4831    * @param cell Cell to add.
4832    * @return True if we should flush.
4833    */
4834   protected boolean restoreEdit(final Store s, final Cell cell) {
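         // Store.add returns a pair whose first element is the change in memstore heap size for
         // this cell; the same delta feeds the region server's replay-edits accounting below.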
4835     long kvSize = s.add(cell).getFirst();
4836     if (this.rsAccounting != null) {
4837       rsAccounting.addAndGetRegionReplayEditsSize(getRegionInfo().getRegionName(), kvSize);
4838     }
4839     return isFlushSize(this.addAndGetGlobalMemstoreSize(kvSize));
4840   }
4841 
4842   /*
4843    * @param fs
4844    * @param p File to check.
4845    * @return True if file was zero-length (and if so, we'll delete it in here).
4846    * @throws IOException
4847    */
4848   private static boolean isZeroLengthThenDelete(final FileSystem fs, final Path p)
4849       throws IOException {
4850     FileStatus stat = fs.getFileStatus(p);
4851     if (stat.getLen() > 0) return false;
4852     LOG.warn("File " + p + " is zero-length, deleting.");
4853     fs.delete(p, false);
4854     return true;
4855   }
4856 
4857   protected HStore instantiateHStore(final HColumnDescriptor family) throws IOException {
4858     return new HStore(this, family, this.conf);
4859   }
4860 
4861   @Override
4862   public Store getStore(final byte[] column) {
4863     return this.stores.get(column);
4864   }
4865 
4866   /**
4867    * Return HStore instance. Does not do any copy: as the number of stores is limited, we
4868    *  iterate over them.
4869    */
4870   private Store getStore(Cell cell) {
4871     for (Map.Entry<byte[], Store> famStore : stores.entrySet()) {
4872       if (Bytes.equals(
4873           cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength(),
4874           famStore.getKey(), 0, famStore.getKey().length)) {
4875         return famStore.getValue();
4876       }
4877     }
4878 
4879     return null;
4880   }
4881 
4882   @Override
4883   public List<Store> getStores() {
4884     List<Store> list = new ArrayList<Store>(stores.size());
4885     list.addAll(stores.values());
4886     return list;
4887   }
4888 
4889   @Override
4890   public List<String> getStoreFileList(final byte [][] columns)
4891     throws IllegalArgumentException {
4892     List<String> storeFileNames = new ArrayList<String>();
4893     synchronized(closeLock) {
4894       for(byte[] column : columns) {
4895         Store store = this.stores.get(column);
4896         if (store == null) {
4897           throw new IllegalArgumentException("No column family : " +
4898               Bytes.toStringBinary(column) + " available");
4899         }
4900         for (StoreFile storeFile: store.getStorefiles()) {
4901           storeFileNames.add(storeFile.getPath().toString());
4902         }
4903 
4904         logRegionFiles();
4905       }
4906     }
4907     return storeFileNames;
4908   }
4909 
4910   //////////////////////////////////////////////////////////////////////////////
4911   // Support code
4912   //////////////////////////////////////////////////////////////////////////////
4913 
4914   /** Make sure this is a valid row for the HRegion */
4915   void checkRow(final byte [] row, String op) throws IOException {
4916     if (!rowIsInRange(getRegionInfo(), row)) {
4917       throw new WrongRegionException("Requested row out of range for " +
4918           op + " on HRegion " + this + ", startKey='" +
4919           Bytes.toStringBinary(getRegionInfo().getStartKey()) + "', getEndKey()='" +
4920           Bytes.toStringBinary(getRegionInfo().getEndKey()) + "', row='" +
4921           Bytes.toStringBinary(row) + "'");
4922     }
4923   }
4924 
4925   @Override
4926   public RowLock getRowLock(byte[] row, boolean waitForLock) throws IOException {
4927     startRegionOperation();
4928     try {
4929       return getRowLockInternal(row, waitForLock);
4930     } finally {
4931       closeRegionOperation();
4932     }
4933   }
4934 
4935   /**
4936    * A version of getRowLock(byte[], boolean) to use when a region operation has already been
4937    * started (the calling thread has already acquired the region-close-guard lock).
4938    */
4939   protected RowLock getRowLockInternal(byte[] row, boolean waitForLock) throws IOException {
4940     HashedBytes rowKey = new HashedBytes(row);
4941     RowLockContext rowLockContext = new RowLockContext(rowKey);
4942 
4943     // loop until we acquire the row lock (unless !waitForLock)
4944     while (true) {
4945       RowLockContext existingContext = lockedRows.putIfAbsent(rowKey, rowLockContext);
4946       if (existingContext == null) {
4947         // Row is not already locked by any thread, use newly created context.
4948         break;
4949       } else if (existingContext.ownedByCurrentThread()) {
4950         // Row is already locked by current thread, reuse existing context instead.
4951         rowLockContext = existingContext;
4952         break;
4953       } else {
4954         if (!waitForLock) {
4955           return null;
4956         }
4957         TraceScope traceScope = null;
4958         try {
4959           if (Trace.isTracing()) {
4960             traceScope = Trace.startSpan("HRegion.getRowLockInternal");
4961           }
4962           // Row is already locked by some other thread, give up or wait for it
4963           if (!existingContext.latch.await(this.rowLockWaitDuration, TimeUnit.MILLISECONDS)) {
4964             if(traceScope != null) {
4965               traceScope.getSpan().addTimelineAnnotation("Failed to get row lock");
4966             }
4967             throw new IOException("Timed out waiting for lock for row: " + rowKey);
4968           }
4969           if (traceScope != null) traceScope.close();
4970           traceScope = null;
4971         } catch (InterruptedException ie) {
4972           LOG.warn("Thread interrupted waiting for lock on row: " + rowKey);
4973           InterruptedIOException iie = new InterruptedIOException();
4974           iie.initCause(ie);
4975           throw iie;
4976         } finally {
4977           if (traceScope != null) traceScope.close();
4978         }
4979       }
4980     }
4981 
4982     // allocate new lock for this thread
4983     return rowLockContext.newLock();
4984   }
4985 
4986   /**
4987    * Acquires a lock on the given row.
4988    * The same thread may acquire multiple locks on the same row.
4989    * @return the acquired row lock
4990    * @throws IOException if the lock could not be acquired after waiting
4991    */
4992   public RowLock getRowLock(byte[] row) throws IOException {
4993     return getRowLock(row, true);
4994   }
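       // Illustrative caller pattern (not part of this file): acquire the lock, mutate, and always
       // release it in a finally block so other threads waiting on the row are unblocked, e.g.
       //   RowLock lock = region.getRowLock(row);
       //   try { /* mutate the row */ } finally { lock.release(); }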
4995 
4996   @Override
4997   public void releaseRowLocks(List<RowLock> rowLocks) {
4998     if (rowLocks != null) {
4999       for (RowLock rowLock : rowLocks) {
5000         rowLock.release();
5001       }
5002       rowLocks.clear();
5003     }
5004   }
5005 
5006   /**
5007    * Determines whether multiple column families are present
5008    * Precondition: familyPaths is not null
5009    *
5010    * @param familyPaths List of Pair<byte[] column family, String hfilePath>
5011    */
5012   private static boolean hasMultipleColumnFamilies(Collection<Pair<byte[], String>> familyPaths) {
5013     boolean multipleFamilies = false;
5014     byte[] family = null;
5015     for (Pair<byte[], String> pair : familyPaths) {
5016       byte[] fam = pair.getFirst();
5017       if (family == null) {
5018         family = fam;
5019       } else if (!Bytes.equals(family, fam)) {
5020         multipleFamilies = true;
5021         break;
5022       }
5023     }
5024     return multipleFamilies;
5025   }
5026 
5027   @Override
5028   public boolean bulkLoadHFiles(Collection<Pair<byte[], String>> familyPaths, boolean assignSeqId,
5029       BulkLoadListener bulkLoadListener) throws IOException {
5030     long seqId = -1;
5031     Map<byte[], List<Path>> storeFiles = new TreeMap<byte[], List<Path>>(Bytes.BYTES_COMPARATOR);
5032     Preconditions.checkNotNull(familyPaths);
5033     // we need writeLock for multi-family bulk load
5034     startBulkRegionOperation(hasMultipleColumnFamilies(familyPaths));
5035     try {
5036       this.writeRequestsCount.increment();
5037 
5038       // A split may have happened between when the split keys were gathered and when the
5039       // HRegion's write lock was taken.  We need to validate that each HFile still fits in
5040       // this region before attempting to bulk load any of them
5041       List<IOException> ioes = new ArrayList<IOException>();
5042       List<Pair<byte[], String>> failures = new ArrayList<Pair<byte[], String>>();
5043       for (Pair<byte[], String> p : familyPaths) {
5044         byte[] familyName = p.getFirst();
5045         String path = p.getSecond();
5046 
5047         Store store = getStore(familyName);
5048         if (store == null) {
5049           IOException ioe = new org.apache.hadoop.hbase.DoNotRetryIOException(
5050               "No such column family " + Bytes.toStringBinary(familyName));
5051           ioes.add(ioe);
5052         } else {
5053           try {
5054             store.assertBulkLoadHFileOk(new Path(path));
5055           } catch (WrongRegionException wre) {
5056             // recoverable (file doesn't fit in region)
5057             failures.add(p);
5058           } catch (IOException ioe) {
5059             // unrecoverable (hdfs problem)
5060             ioes.add(ioe);
5061           }
5062         }
5063       }
5064 
5065       // validation failed because of some sort of IO problem.
5066       if (ioes.size() != 0) {
5067         IOException e = MultipleIOException.createIOException(ioes);
5068         LOG.error("There were one or more IO errors when checking if the bulk load is ok.", e);
5069         throw e;
5070       }
5071 
5072       // validation failed, bail out before doing anything permanent.
5073       if (failures.size() != 0) {
5074         StringBuilder list = new StringBuilder();
5075         for (Pair<byte[], String> p : failures) {
5076           list.append("\n").append(Bytes.toString(p.getFirst())).append(" : ")
5077               .append(p.getSecond());
5078         }
5079         // problem when validating
5080         LOG.warn("There was a recoverable bulk load failure likely due to a" +
5081             " split.  These (family, HFile) pairs were not loaded: " + list);
5082         return false;
5083       }
5084 
5085       // We need to assign a sequential ID that's in between two memstores in order to preserve
5086       // the guarantee that all the edits lower than the highest sequential ID from all the
5087       // HFiles are flushed on disk. See HBASE-10958.  The sequence id returned when we flush is
5088       // guaranteed to be one beyond the file made when we flushed (or if nothing to flush, it is
5089       // a sequence id that we can be sure is beyond the last hfile written).
5090       if (assignSeqId) {
5091         FlushResult fs = flushcache(true, false);
5092         if (fs.isFlushSucceeded()) {
5093           seqId = ((FlushResultImpl)fs).flushSequenceId;
5094         } else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) {
5095           seqId = ((FlushResultImpl)fs).flushSequenceId;
5096         } else {
5097           throw new IOException("Could not bulk load with an assigned sequential ID because the "+
5098             "flush didn't run. Reason for not flushing: " + ((FlushResultImpl)fs).failureReason);
5099         }
5100       }
5101 
5102       for (Pair<byte[], String> p : familyPaths) {
5103         byte[] familyName = p.getFirst();
5104         String path = p.getSecond();
5105         Store store = getStore(familyName);
5106         try {
5107           String finalPath = path;
5108           if (bulkLoadListener != null) {
5109             finalPath = bulkLoadListener.prepareBulkLoad(familyName, path);
5110           }
5111           Path committedStoreFile = store.bulkLoadHFile(finalPath, seqId);
5112 
5113           if (storeFiles.containsKey(familyName)) {
5114             storeFiles.get(familyName).add(committedStoreFile);
5115           } else {
5116             List<Path> storeFileNames = new ArrayList<Path>();
5117             storeFileNames.add(committedStoreFile);
5118             storeFiles.put(familyName, storeFileNames);
5119           }
5120           if (bulkLoadListener != null) {
5121             bulkLoadListener.doneBulkLoad(familyName, path);
5122           }
5123         } catch (IOException ioe) {
5124           // A failure here can cause an atomicity violation that we currently
5125           // cannot recover from since it is likely a failed HDFS operation.
5126 
5127           // TODO Need a better story for reverting partial failures due to HDFS.
5128           LOG.error("There was a partial failure due to IO when attempting to" +
5129               " load " + Bytes.toString(p.getFirst()) + " : " + p.getSecond(), ioe);
5130           if (bulkLoadListener != null) {
5131             try {
5132               bulkLoadListener.failedBulkLoad(familyName, path);
5133             } catch (Exception ex) {
5134               LOG.error("Error while calling failedBulkLoad for family " +
5135                   Bytes.toString(familyName) + " with path " + path, ex);
5136             }
5137           }
5138           throw ioe;
5139         }
5140       }
5141 
5142       return true;
5143     } finally {
5144       if (wal != null && !storeFiles.isEmpty()) {
5145         // write a bulk load event marker if any hfiles were committed, even on partial failure
5146         try {
5147           WALProtos.BulkLoadDescriptor loadDescriptor = ProtobufUtil.toBulkLoadDescriptor(
5148               this.getRegionInfo().getTable(),
5149               ByteStringer.wrap(this.getRegionInfo().getEncodedNameAsBytes()), storeFiles, seqId);
5150           WALUtil.writeBulkLoadMarkerAndSync(wal, this.htableDescriptor, getRegionInfo(),
5151               loadDescriptor, sequenceId);
5152         } catch (IOException ioe) {
5153           if (this.rsServices != null) {
5154             // Have to abort region server because some hfiles have been loaded but we can't write
5155             // the event into WAL
5156             this.rsServices.abort("Failed to write bulk load event into WAL.", ioe);
5157           }
5158         }
5159       }
5160 
5161       closeBulkRegionOperation();
5162     }
5163   }
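
  // Illustrative usage sketch (not part of the original source): how a caller might invoke
  // bulkLoadHFiles above. Assumes an open HRegion "region", a column family byte[] "fam",
  // and a staged HFile path; all of these names are hypothetical.
  //
  //   List<Pair<byte[], String>> familyPaths = new ArrayList<Pair<byte[], String>>();
  //   familyPaths.add(new Pair<byte[], String>(fam, "/staging/fam/hfile-0001"));
  //   boolean loaded = region.bulkLoadHFiles(familyPaths, true, null); // no BulkLoadListener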
5164 
5165   @Override
5166   public boolean equals(Object o) {
5167     return o instanceof HRegion && Bytes.equals(getRegionInfo().getRegionName(),
5168                                                 ((HRegion) o).getRegionInfo().getRegionName());
5169   }
5170 
5171   @Override
5172   public int hashCode() {
5173     return Bytes.hashCode(getRegionInfo().getRegionName());
5174   }
5175 
5176   @Override
5177   public String toString() {
5178     return getRegionInfo().getRegionNameAsString();
5179   }
5180 
5181   /**
5182    * RegionScannerImpl is used to combine scanners from multiple Stores (aka column families).
5183    */
5184   class RegionScannerImpl implements RegionScanner {
5185     // Package local for testability
5186     KeyValueHeap storeHeap = null;
5187     /** Heap of key-values that are not essential for the provided filters and are thus read
5188      * on demand, if on-demand column family loading is enabled.*/
5189     KeyValueHeap joinedHeap = null;
5190     /**
5191      * If the joined heap data gathering is interrupted due to scan limits, this will
5192      * contain the row for which we are populating the values.*/
5193     protected Cell joinedContinuationRow = null;
5194     protected final byte[] stopRow;
5195     private final FilterWrapper filter;
5196     private ScannerContext defaultScannerContext;
5197     protected int isScan;
5198     private boolean filterClosed = false;
5199     private long readPt;
5200     private long maxResultSize;
5201     protected HRegion region;
5202     protected KVComparator comparator;
5203 
5204     @Override
5205     public HRegionInfo getRegionInfo() {
5206       return region.getRegionInfo();
5207     }
5208 
5209     RegionScannerImpl(Scan scan, List<KeyValueScanner> additionalScanners, HRegion region)
5210         throws IOException {
5211 
5212       this.region = region;
5213       this.maxResultSize = scan.getMaxResultSize();
5214       if (scan.hasFilter()) {
5215         this.filter = new FilterWrapper(scan.getFilter());
5216       } else {
5217         this.filter = null;
5218       }
5219       this.comparator = region.getCellCompartor();
5220 
5221       /**
5222        * By default, calls to next/nextRaw must enforce the batch limit. Thus, construct a default
5223        * scanner context that can be used to enforce the batch limit in the event that a
5224        * ScannerContext is not specified during an invocation of next/nextRaw
5225        */
5226       defaultScannerContext = ScannerContext.newBuilder().setBatchLimit(scan.getBatch()).build();
5227 
5228       if (Bytes.equals(scan.getStopRow(), HConstants.EMPTY_END_ROW) && !scan.isGetScan()) {
5229         this.stopRow = null;
5230       } else {
5231         this.stopRow = scan.getStopRow();
5232       }
5233       // If we are doing a get, we want the range to be [startRow,endRow]. Normally
5234       // it is [startRow,endRow), and if startRow == endRow we would get nothing.
5235       this.isScan = scan.isGetScan() ? -1 : 0;
5236 
5237       // synchronize on scannerReadPoints so that nobody calculates
5238       // getSmallestReadPoint, before scannerReadPoints is updated.
5239       IsolationLevel isolationLevel = scan.getIsolationLevel();
5240       synchronized(scannerReadPoints) {
5241         this.readPt = getReadpoint(isolationLevel);
5242         scannerReadPoints.put(this, this.readPt);
5243       }
5244 
5245       // Here we separate all scanners into two lists - scanners that provide data required
5246       // by the filter to operate (scanners list) and all others (joinedScanners list).
5247       List<KeyValueScanner> scanners = new ArrayList<KeyValueScanner>();
5248       List<KeyValueScanner> joinedScanners = new ArrayList<KeyValueScanner>();
5249       if (additionalScanners != null) {
5250         scanners.addAll(additionalScanners);
5251       }
5252 
5253       for (Map.Entry<byte[], NavigableSet<byte[]>> entry :
5254           scan.getFamilyMap().entrySet()) {
5255         Store store = stores.get(entry.getKey());
5256         KeyValueScanner scanner = store.getScanner(scan, entry.getValue(), this.readPt);
5257         if (this.filter == null || !scan.doLoadColumnFamiliesOnDemand()
5258           || this.filter.isFamilyEssential(entry.getKey())) {
5259           scanners.add(scanner);
5260         } else {
5261           joinedScanners.add(scanner);
5262         }
5263       }
5264       initializeKVHeap(scanners, joinedScanners, region);
5265     }
5266 
5267     protected void initializeKVHeap(List<KeyValueScanner> scanners,
5268         List<KeyValueScanner> joinedScanners, HRegion region)
5269         throws IOException {
5270       this.storeHeap = new KeyValueHeap(scanners, comparator);
5271       if (!joinedScanners.isEmpty()) {
5272         this.joinedHeap = new KeyValueHeap(joinedScanners, comparator);
5273       }
5274     }
5275 
5276     @Override
5277     public long getMaxResultSize() {
5278       return maxResultSize;
5279     }
5280 
5281     @Override
5282     public long getMvccReadPoint() {
5283       return this.readPt;
5284     }
5285 
5286     @Override
5287     public int getBatch() {
5288       return this.defaultScannerContext.getBatchLimit();
5289     }
5290 
5291     /**
5292      * Reset both the filter and the old filter.
5293      *
5294      * @throws IOException in case a filter raises an I/O exception.
5295      */
5296     protected void resetFilters() throws IOException {
5297       if (filter != null) {
5298         filter.reset();
5299       }
5300     }
5301 
5302     @Override
5303     public boolean next(List<Cell> outResults)
5304         throws IOException {
5305       // apply the batching limit by default
5306       return next(outResults, defaultScannerContext);
5307     }
5308 
5309     @Override
5310     public synchronized boolean next(List<Cell> outResults, ScannerContext scannerContext) throws IOException {
5311       if (this.filterClosed) {
5312         throw new UnknownScannerException("Scanner was closed (timed out?) " +
5313             "after we renewed it. Could be caused by a very slow scanner " +
5314             "or a lengthy garbage collection");
5315       }
5316       startRegionOperation(Operation.SCAN);
5317       readRequestsCount.increment();
5318       try {
5319         return nextRaw(outResults, scannerContext);
5320       } finally {
5321         closeRegionOperation(Operation.SCAN);
5322       }
5323     }
5324 
5325     @Override
5326     public boolean nextRaw(List<Cell> outResults) throws IOException {
5327       // Use the RegionScanner's context by default
5328       return nextRaw(outResults, defaultScannerContext);
5329     }
5330 
5331     @Override
5332     public boolean nextRaw(List<Cell> outResults, ScannerContext scannerContext)
5333         throws IOException {
5334       if (storeHeap == null) {
5335         // scanner is closed
5336         throw new UnknownScannerException("Scanner was closed");
5337       }
5338       boolean moreValues;
5339       if (outResults.isEmpty()) {
5340         // Usually outResults is empty. This is true when next is called
5341         // to handle scan or get operation.
5342         moreValues = nextInternal(outResults, scannerContext);
5343       } else {
5344         List<Cell> tmpList = new ArrayList<Cell>();
5345         moreValues = nextInternal(tmpList, scannerContext);
5346         outResults.addAll(tmpList);
5347       }
5348       
5349       // If the size limit was reached it means a partial Result is being returned. Returning a
5350       // partial Result means that we should not reset the filters; filters should only be reset in
5351       // between rows
5352       if (!scannerContext.partialResultFormed()) resetFilters();
5353 
5354       if (isFilterDoneInternal()) {
5355         moreValues = false;
5356       }
5357       return moreValues;
5358     }
5359 
5360     /**
5361      * @return true if more cells exist after this batch, false if scanner is done
5362      */
5363     private boolean populateFromJoinedHeap(List<Cell> results, ScannerContext scannerContext)
5364             throws IOException {
5365       assert joinedContinuationRow != null;
5366       boolean moreValues =
5367           populateResult(results, this.joinedHeap, scannerContext,
5368           joinedContinuationRow.getRowArray(), joinedContinuationRow.getRowOffset(),
5369           joinedContinuationRow.getRowLength());
5370 
5371       if (!scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
5372         // We are done with this row, reset the continuation.
5373         joinedContinuationRow = null;
5374       }
5375       // As the data is obtained from two independent heaps, we need to
5376       // ensure that result list is sorted, because Result relies on that.
5377       Collections.sort(results, comparator);
5378       return moreValues;
5379     }
5380 
5381     /**
5382      * Fetches records with currentRow into the results list, until the next row, the batchLimit
5383      * (if not -1) is reached, or the remainingResultSize (if not -1) is reached.
5384      * @param heap KeyValueHeap to fetch data from. It must be positioned on the correct row before the call.
5385      * @param scannerContext
5386      * @param currentRow Byte array with key we are fetching.
5387      * @param offset offset for currentRow
5388      * @param length length for currentRow
5389      * @return state of last call to {@link KeyValueHeap#next()}
5390      */
5391     private boolean populateResult(List<Cell> results, KeyValueHeap heap,
5392         ScannerContext scannerContext, byte[] currentRow, int offset, short length)
5393         throws IOException {
5394       Cell nextKv;
5395       boolean moreCellsInRow = false;
5396       boolean tmpKeepProgress = scannerContext.getKeepProgress();
5397       // Scanning between column families and thus the scope is between cells
5398       LimitScope limitScope = LimitScope.BETWEEN_CELLS;
5399       do {
5400         // We want to maintain any progress that is made towards the limits while scanning across
5401         // different column families. To do this, we toggle the keep progress flag on during calls
5402         // to the StoreScanner to ensure that any progress made thus far is not wiped away.
5403         scannerContext.setKeepProgress(true);
5404         heap.next(results, scannerContext);
5405         scannerContext.setKeepProgress(tmpKeepProgress);
5406 
5407         nextKv = heap.peek();
5408         moreCellsInRow = moreCellsInRow(nextKv, currentRow, offset, length);
5409 
5410         if (scannerContext.checkBatchLimit(limitScope)) {
5411           return scannerContext.setScannerState(NextState.BATCH_LIMIT_REACHED).hasMoreValues();
5412         } else if (scannerContext.checkSizeLimit(limitScope)) {
5413           ScannerContext.NextState state =
5414               moreCellsInRow ? NextState.SIZE_LIMIT_REACHED_MID_ROW : NextState.SIZE_LIMIT_REACHED;
5415           return scannerContext.setScannerState(state).hasMoreValues();
5416         } else if (scannerContext.checkTimeLimit(limitScope)) {
5417           ScannerContext.NextState state =
5418               moreCellsInRow ? NextState.TIME_LIMIT_REACHED_MID_ROW : NextState.TIME_LIMIT_REACHED;
5419           return scannerContext.setScannerState(state).hasMoreValues();
5420         }
5421       } while (moreCellsInRow);
5422 
5423       return nextKv != null;
5424     }
5425 
5426     /**
5427      * Based on the nextKv in the heap, and the current row, decide whether or not there are more
5428      * cells to be read in the heap. If the row of the nextKv in the heap matches the current row
5429      * then there are more cells to be read in the row.
5430      * @param nextKv
5431      * @param currentRow
5432      * @param offset
5433      * @param length
5434      * @return true When there are more cells in the row to be read
5435      */
5436     private boolean moreCellsInRow(final Cell nextKv, byte[] currentRow, int offset,
5437         short length) {
5438       return nextKv != null && CellUtil.matchingRow(nextKv, currentRow, offset, length);
5439     }
5440 
5441     /*
5442      * @return True if a filter has ruled that the scanner is over, i.e. done.
5443      */
5444     @Override
5445     public synchronized boolean isFilterDone() throws IOException {
5446       return isFilterDoneInternal();
5447     }
5448 
5449     private boolean isFilterDoneInternal() throws IOException {
5450       return this.filter != null && this.filter.filterAllRemaining();
5451     }
5452 
5453     private boolean nextInternal(List<Cell> results, ScannerContext scannerContext)
5454         throws IOException {
5455       if (!results.isEmpty()) {
5456         throw new IllegalArgumentException("First parameter should be an empty list");
5457       }
5458       if (scannerContext == null) {
5459         throw new IllegalArgumentException("Scanner context cannot be null");
5460       }
5461       RpcCallContext rpcCall = RpcServer.getCurrentCall();
5462 
5463       // Save the initial progress from the Scanner context in these local variables. The progress
5464       // may need to be reset a few times if rows are being filtered out so we save the initial
5465       // progress.
5466       int initialBatchProgress = scannerContext.getBatchProgress();
5467       long initialSizeProgress = scannerContext.getSizeProgress();
5468       long initialTimeProgress = scannerContext.getTimeProgress();
5469 
5470       // The loop here is used only when at some point during the next we determine
5471       // that due to effects of filters or otherwise, we have an empty row in the result.
5472       // Then we loop and try again. Otherwise, we must get out on the first iteration via return,
5473       // "true" if there's more data to read, "false" if there isn't (storeHeap is at a stop row,
5474       // and joinedHeap has no more data to read for the last row (if set, joinedContinuationRow).
5475       while (true) {
5476         // Starting to scan a new row. Reset the scanner progress according to whether or not
5477         // progress should be kept.
5478         if (scannerContext.getKeepProgress()) {
5479           // Progress should be kept. Reset to initial values seen at start of method invocation.
5480           scannerContext
5481               .setProgress(initialBatchProgress, initialSizeProgress, initialTimeProgress);
5482         } else {
5483           scannerContext.clearProgress();
5484         }
5485 
5486         if (rpcCall != null) {
5487           // If a user specifies a too-restrictive or too-slow scanner, the
5488           // client might time out and disconnect while the server side
5489           // is still processing the request. We should abort aggressively
5490           // in that case.
5491           long afterTime = rpcCall.disconnectSince();
5492           if (afterTime >= 0) {
5493             throw new CallerDisconnectedException(
5494                 "Aborting on region " + getRegionInfo().getRegionNameAsString() + ", call " +
5495                     this + " after " + afterTime + " ms, since " +
5496                     "caller disconnected");
5497           }
5498         }
5499 
5500         // Let's see what we have in the storeHeap.
5501         Cell current = this.storeHeap.peek();
5502 
5503         byte[] currentRow = null;
5504         int offset = 0;
5505         short length = 0;
5506         if (current != null) {
5507           currentRow = current.getRowArray();
5508           offset = current.getRowOffset();
5509           length = current.getRowLength();
5510         }
5511 
5512         boolean stopRow = isStopRow(currentRow, offset, length);
5513         // When hasFilterRow is true, it means that all the cells for a particular row must be
5514         // read before a filtering decision can be made. This means that filters where hasFilterRow
5515         // is true run the risk of encountering out of memory errors in the case that they are
5516         // applied to a table that has very large rows.
5517         boolean hasFilterRow = this.filter != null && this.filter.hasFilterRow();
5518 
5519         // If filter#hasFilterRow is true, partial results are not allowed since allowing them
5520         // would prevent the filters from being evaluated. Thus, if it is true, change the
5521         // scope of any limits that could potentially create partial results to
5522         // LimitScope.BETWEEN_ROWS so that those limits are not reached mid-row
5523         if (hasFilterRow) {
5524           if (LOG.isTraceEnabled()) {
5525             LOG.trace("filter#hasFilterRow is true which prevents partial results from being"
5526                 + " formed. Changing scope of limits that may create partials");
5527           }
5528           scannerContext.setSizeLimitScope(LimitScope.BETWEEN_ROWS);
5529           scannerContext.setTimeLimitScope(LimitScope.BETWEEN_ROWS);
5530         }
5531 
5532         // Check if we were getting data from the joinedHeap and hit the limit.
5533         // If not, then it's main path - getting results from storeHeap.
5534         if (joinedContinuationRow == null) {
5535           // First, check if we are at a stop row. If so, there are no more results.
5536           if (stopRow) {
5537             if (hasFilterRow) {
5538               filter.filterRowCells(results);
5539             }
5540             return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
5541           }
5542 
5543           // Check if rowkey filter wants to exclude this row. If so, loop to next.
5544           // Technically, if we hit limits before on this row, we don't need this call.
5545           if (filterRowKey(currentRow, offset, length)) {
5546             boolean moreRows = nextRow(currentRow, offset, length);
5547             if (!moreRows) {
5548               return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
5549             }
5550             results.clear();
5551             continue;
5552           }
5553 
5554           // Ok, we are good, let's try to get some results from the main heap.
5555           populateResult(results, this.storeHeap, scannerContext, currentRow, offset, length);
5556 
5557           if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
5558             if (hasFilterRow) {
5559               throw new IncompatibleFilterException(
5560                   "Filter whose hasFilterRow() returns true is incompatible with scans that must "
5561                       + " stop mid-row because of a limit. ScannerContext:" + scannerContext);
5562             }
5563             return true;
5564           }
5565 
5566           Cell nextKv = this.storeHeap.peek();
5567           stopRow = nextKv == null ||
5568               isStopRow(nextKv.getRowArray(), nextKv.getRowOffset(), nextKv.getRowLength());
5569           // save that the row was empty before filters were applied to it.
5570           final boolean isEmptyRow = results.isEmpty();
5571 
5572           // We have the part of the row necessary for filtering (all of it, usually).
5573           // First filter with the filterRow(List).
5574           FilterWrapper.FilterRowRetCode ret = FilterWrapper.FilterRowRetCode.NOT_CALLED;
5575           if (hasFilterRow) {
5576             ret = filter.filterRowCellsWithRet(results);
5577 
5578             // We don't know how the results have changed after being filtered. Must set progress
5579             // according to contents of results now. However, a change in the results should not
5580             // affect the time progress. Thus preserve whatever time progress has been made
5581             long timeProgress = scannerContext.getTimeProgress();
5582             if (scannerContext.getKeepProgress()) {
5583               scannerContext.setProgress(initialBatchProgress, initialSizeProgress,
5584                 initialTimeProgress);
5585             } else {
5586               scannerContext.clearProgress();
5587             }
5588             scannerContext.setTimeProgress(timeProgress);
5589             scannerContext.incrementBatchProgress(results.size());
5590             for (Cell cell : results) {
5591               scannerContext.incrementSizeProgress(CellUtil.estimatedHeapSizeOfWithoutTags(cell));
5592             }
5593           }
5594 
5595           if ((isEmptyRow || ret == FilterWrapper.FilterRowRetCode.EXCLUDE) || filterRow()) {
5596             results.clear();
5597             boolean moreRows = nextRow(currentRow, offset, length);
5598             if (!moreRows) {
5599               return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
5600             }
5601 
5602             // This row was totally filtered out, if this is NOT the last row,
5603             // we should continue on. Otherwise, nothing else to do.
5604             if (!stopRow) continue;
5605             return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
5606           }
5607 
5608           // Ok, we are done with storeHeap for this row.
5609           // Now we may need to fetch additional, non-essential data into row.
5610           // These values are not needed for filter to work, so we postpone their
5611           // fetch to (possibly) reduce amount of data loads from disk.
5612           if (this.joinedHeap != null) {
5613             boolean mayHaveData = joinedHeapMayHaveData(currentRow, offset, length);
5614             if (mayHaveData) {
5615               joinedContinuationRow = current;
5616               populateFromJoinedHeap(results, scannerContext);
5617 
5618               if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
5619                 return true;
5620               }
5621             }
5622           }
5623         } else {
5624           // Populating from the joined heap was stopped by limits, populate some more.
5625           populateFromJoinedHeap(results, scannerContext);
5626           if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
5627             return true;
5628           }
5629         }
5630         // We may have just called populateFromJoinedHeap and hit the limits. If that is
5631         // the case, we need to call it again on the next next() invocation.
5632         if (joinedContinuationRow != null) {
5633           return scannerContext.setScannerState(NextState.MORE_VALUES).hasMoreValues();
5634         }
5635 
5636         // Finally, we are done with both joinedHeap and storeHeap.
5637         // Double check to prevent empty rows from appearing in result. It could be
5638         // the case when SingleColumnValueExcludeFilter is used.
5639         if (results.isEmpty()) {
5640           boolean moreRows = nextRow(currentRow, offset, length);
5641           if (!moreRows) {
5642             return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
5643           }
5644           if (!stopRow) continue;
5645         }
5646 
5647         // We are done. Return the result.
5648         if (stopRow) {
5649           return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
5650         } else {
5651           return scannerContext.setScannerState(NextState.MORE_VALUES).hasMoreValues();
5652         }
5653       }
5654     }
5655 
5656     /**
5657      * @param currentRow
5658      * @param offset
5659      * @param length
5660      * @return true when the joined heap may have data for the current row
5661      * @throws IOException
5662      */
5663     private boolean joinedHeapMayHaveData(byte[] currentRow, int offset, short length)
5664         throws IOException {
5665       Cell nextJoinedKv = joinedHeap.peek();
5666       boolean matchCurrentRow =
5667           nextJoinedKv != null && CellUtil.matchingRow(nextJoinedKv, currentRow, offset, length);
5668       boolean matchAfterSeek = false;
5669 
5670       // If the next value in the joined heap does not match the current row, try to seek to the
5671       // correct row
5672       if (!matchCurrentRow) {
5673         Cell firstOnCurrentRow = KeyValueUtil.createFirstOnRow(currentRow, offset, length);
5674         boolean seekSuccessful = this.joinedHeap.requestSeek(firstOnCurrentRow, true, true);
5675         matchAfterSeek =
5676             seekSuccessful && joinedHeap.peek() != null
5677                 && CellUtil.matchingRow(joinedHeap.peek(), currentRow, offset, length);
5678       }
5679 
5680       return matchCurrentRow || matchAfterSeek;
5681     }
5682 
5683     /**
5684      * This function maintains backward compatibility for 0.94 filters. HBASE-6429 combines
5685      * both the filterRow() and filterRow(List<KeyValue> kvs) functions. Code written for 0.94 or
5686      * older may not implement hasFilterRow() as HBASE-6429 expects, because the 0.94 hasFilterRow()
5687      * only returns true when filterRow(List<KeyValue> kvs) is overridden, not filterRow(). For such
5688      * filters, filterRow() would otherwise be skipped, so it is invoked explicitly here.
5689      */
5690     private boolean filterRow() throws IOException {
5691       // when hasFilterRow returns true, filter.filterRow() will be called automatically inside
5692       // filterRowCells(List<Cell> kvs) so we skip that scenario here.
5693       return filter != null && (!filter.hasFilterRow())
5694           && filter.filterRow();
5695     }
5696 
5697     private boolean filterRowKey(byte[] row, int offset, short length) throws IOException {
5698       return filter != null
5699           && filter.filterRowKey(row, offset, length);
5700     }
5701 
5702     protected boolean nextRow(byte [] currentRow, int offset, short length) throws IOException {
5703       assert this.joinedContinuationRow == null: "Trying to go to next row during joinedHeap read.";
5704       Cell next;
5705       while ((next = this.storeHeap.peek()) != null &&
5706              CellUtil.matchingRow(next, currentRow, offset, length)) {
5707         this.storeHeap.next(MOCKED_LIST);
5708       }
5709       resetFilters();
5710       // Calling the hook in CP which allows it to do a fast forward
5711       return this.region.getCoprocessorHost() == null
5712           || this.region.getCoprocessorHost()
5713               .postScannerFilterRow(this, currentRow, offset, length);
5714     }
5715 
5716     protected boolean isStopRow(byte[] currentRow, int offset, short length) {
5717       return currentRow == null ||
5718           (stopRow != null &&
5719           comparator.compareRows(stopRow, 0, stopRow.length,
5720             currentRow, offset, length) <= isScan);
5721     }
5722 
5723     @Override
5724     public synchronized void close() {
5725       if (storeHeap != null) {
5726         storeHeap.close();
5727         storeHeap = null;
5728       }
5729       if (joinedHeap != null) {
5730         joinedHeap.close();
5731         joinedHeap = null;
5732       }
5733       // no need to synchronize here.
5734       scannerReadPoints.remove(this);
5735       this.filterClosed = true;
5736     }
5737 
5738     KeyValueHeap getStoreHeapForTesting() {
5739       return storeHeap;
5740     }
5741 
5742     @Override
5743     public synchronized boolean reseek(byte[] row) throws IOException {
5744       if (row == null) {
5745         throw new IllegalArgumentException("Row cannot be null.");
5746       }
5747       boolean result = false;
5748       startRegionOperation();
5749       try {
5750         KeyValue kv = KeyValueUtil.createFirstOnRow(row);
5751         // use request seek to make use of the lazy seek option. See HBASE-5520
5752         result = this.storeHeap.requestSeek(kv, true, true);
5753         if (this.joinedHeap != null) {
5754           result = this.joinedHeap.requestSeek(kv, true, true) || result;
5755         }
5756       } finally {
5757         closeRegionOperation();
5758       }
5759       return result;
5760     }
5761   }
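
  // Illustrative scan-loop sketch (not part of the original source): how server-side code typically
  // drives a RegionScanner such as RegionScannerImpl above. Assumes an open HRegion "region" and a
  // configured Scan "scan"; both names are hypothetical.
  //
  //   RegionScanner scanner = region.getScanner(scan);
  //   try {
  //     List<Cell> cells = new ArrayList<Cell>();
  //     boolean moreRows;
  //     do {
  //       cells.clear();
  //       moreRows = scanner.next(cells);   // enforces the default batch limit per call
  //       // process the cells returned for the current row here
  //     } while (moreRows);
  //   } finally {
  //     scanner.close();
  //   }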
5762 
5763   // Utility methods
5764   /**
5765    * A utility method to create new instances of HRegion based on the
5766    * {@link HConstants#REGION_IMPL} configuration property.
5767    * @param tableDir qualified path of directory where region should be located,
5768    * usually the table directory.
5769    * @param wal The WAL is the outbound log for any updates to the HRegion.
5770    * The wal file is a logfile from the previous execution that's
5771    * custom-computed for this HRegion. The HRegionServer computes and sorts the
5772    * appropriate wal info for this HRegion. If there is a previous file
5773    * (implying that the HRegion has been written-to before), then read it from
5774    * the supplied path.
5775    * @param fs is the filesystem.
5776    * @param conf is global configuration settings.
5777    * @param regionInfo - HRegionInfo that describes the region
5778    *
5779    * @param htd the table descriptor
5780    * @return the new instance
5781    */
5782   static HRegion newHRegion(Path tableDir, WAL wal, FileSystem fs,
5783       Configuration conf, HRegionInfo regionInfo, final HTableDescriptor htd,
5784       RegionServerServices rsServices) {
5785     try {
5786       @SuppressWarnings("unchecked")
5787       Class<? extends HRegion> regionClass =
5788           (Class<? extends HRegion>) conf.getClass(HConstants.REGION_IMPL, HRegion.class);
5789 
5790       Constructor<? extends HRegion> c =
5791           regionClass.getConstructor(Path.class, WAL.class, FileSystem.class,
5792               Configuration.class, HRegionInfo.class, HTableDescriptor.class,
5793               RegionServerServices.class);
5794 
5795       return c.newInstance(tableDir, wal, fs, conf, regionInfo, htd, rsServices);
5796     } catch (Throwable e) {
5797       // todo: what should I throw here?
5798       throw new IllegalStateException("Could not instantiate a region instance.", e);
5799     }
5800   }
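
  // Illustrative sketch (not part of the original source): plugging a custom region implementation
  // into newHRegion above via the HConstants.REGION_IMPL property. "MyRegion" is a hypothetical
  // HRegion subclass that provides the seven-argument constructor reflected on above.
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   conf.setClass(HConstants.REGION_IMPL, MyRegion.class, HRegion.class);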
5801 
5802   /**
5803    * Convenience method creating new HRegions. Used by createTable.
5804    *
5805    * @param info Info for region to create.
5806    * @param rootDir Root directory for HBase instance
5807    * @param wal shared WAL
5808    * @param initialize - true to initialize the region
5809    * @return new HRegion
5810    * @throws IOException
5811    */
5812   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
5813                                       final Configuration conf,
5814                                       final HTableDescriptor hTableDescriptor,
5815                                       final WAL wal,
5816                                       final boolean initialize)
5817       throws IOException {
5818     LOG.info("creating HRegion " + info.getTable().getNameAsString()
5819         + " HTD == " + hTableDescriptor + " RootDir = " + rootDir +
5820         " Table name == " + info.getTable().getNameAsString());
5821     FileSystem fs = FileSystem.get(conf);
5822     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
5823     HRegionFileSystem.createRegionOnFileSystem(conf, fs, tableDir, info);
5824     HRegion region = HRegion.newHRegion(tableDir,
5825         wal, fs, conf, info, hTableDescriptor, null);
5826     if (initialize) {
5827       // If initializing, set the sequenceId. It is also required by WALPerformanceEvaluation when
5828       // verifying the WALEdits.
5829       region.setSequenceId(region.initialize(null));
5830     }
5831     return region;
5832   }
5833 
5834   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
5835                                       final Configuration conf,
5836                                       final HTableDescriptor hTableDescriptor,
5837                                       final WAL wal)
5838     throws IOException {
5839     return createHRegion(info, rootDir, conf, hTableDescriptor, wal, true);
5840   }
5841 
5842 
5843   /**
5844    * Open a Region.
5845    * @param info Info for region to be opened.
5846    * @param wal WAL for region to use. This method will call
5847    * WAL#setSequenceNumber(long) passing the result of the call to
5848    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
5849    * up.  HRegionServer does this every time it opens a new region.
5850    * @return new HRegion
5851    *
5852    * @throws IOException
5853    */
5854   public static HRegion openHRegion(final HRegionInfo info,
5855       final HTableDescriptor htd, final WAL wal,
5856       final Configuration conf)
5857   throws IOException {
5858     return openHRegion(info, htd, wal, conf, null, null);
5859   }
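
  // Illustrative sketch (not part of the original source): opening and closing a region with the
  // overload above. Assumes an existing HRegionInfo "info", HTableDescriptor "htd" and WAL "wal"
  // for that region; all three names are hypothetical.
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   HRegion region = HRegion.openHRegion(info, htd, wal, conf);
  //   try {
  //     // read from / write to the region here
  //   } finally {
  //     region.close();
  //   }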
5860 
5861   /**
5862    * Open a Region.
5863    * @param info Info for region to be opened
5864    * @param htd the table descriptor
5865    * @param wal WAL for region to use. This method will call
5866    * WAL#setSequenceNumber(long) passing the result of the call to
5867    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
5868    * up.  HRegionServer does this every time it opens a new region.
5869    * @param conf The Configuration object to use.
5870    * @param rsServices An interface we can request flushes against.
5871    * @param reporter An interface we can report progress against.
5872    * @return new HRegion
5873    *
5874    * @throws IOException
5875    */
5876   public static HRegion openHRegion(final HRegionInfo info,
5877     final HTableDescriptor htd, final WAL wal, final Configuration conf,
5878     final RegionServerServices rsServices,
5879     final CancelableProgressable reporter)
5880   throws IOException {
5881     return openHRegion(FSUtils.getRootDir(conf), info, htd, wal, conf, rsServices, reporter);
5882   }
5883 
5884   /**
5885    * Open a Region.
5886    * @param rootDir Root directory for HBase instance
5887    * @param info Info for region to be opened.
5888    * @param htd the table descriptor
5889    * @param wal WAL for region to use. This method will call
5890    * WAL#setSequenceNumber(long) passing the result of the call to
5891    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
5892    * up.  HRegionServer does this every time it opens a new region.
5893    * @param conf The Configuration object to use.
5894    * @return new HRegion
5895    * @throws IOException
5896    */
5897   public static HRegion openHRegion(Path rootDir, final HRegionInfo info,
5898       final HTableDescriptor htd, final WAL wal, final Configuration conf)
5899   throws IOException {
5900     return openHRegion(rootDir, info, htd, wal, conf, null, null);
5901   }
5902 
5903   /**
5904    * Open a Region.
5905    * @param rootDir Root directory for HBase instance
5906    * @param info Info for region to be opened.
5907    * @param htd the table descriptor
5908    * @param wal WAL for region to use. This method will call
5909    * WAL#setSequenceNumber(long) passing the result of the call to
5910    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
5911    * up.  HRegionServer does this every time it opens a new region.
5912    * @param conf The Configuration object to use.
5913    * @param rsServices An interface we can request flushes against.
5914    * @param reporter An interface we can report progress against.
5915    * @return new HRegion
5916    * @throws IOException
5917    */
5918   public static HRegion openHRegion(final Path rootDir, final HRegionInfo info,
5919       final HTableDescriptor htd, final WAL wal, final Configuration conf,
5920       final RegionServerServices rsServices,
5921       final CancelableProgressable reporter)
5922   throws IOException {
5923     FileSystem fs = null;
5924     if (rsServices != null) {
5925       fs = rsServices.getFileSystem();
5926     }
5927     if (fs == null) {
5928       fs = FileSystem.get(conf);
5929     }
5930     return openHRegion(conf, fs, rootDir, info, htd, wal, rsServices, reporter);
5931   }
5932 
5933   /**
5934    * Open a Region.
5935    * @param conf The Configuration object to use.
5936    * @param fs Filesystem to use
5937    * @param rootDir Root directory for HBase instance
5938    * @param info Info for region to be opened.
5939    * @param htd the table descriptor
5940    * @param wal WAL for region to use. This method will call
5941    * WAL#setSequenceNumber(long) passing the result of the call to
5942    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
5943    * up.  HRegionServer does this every time it opens a new region.
5944    * @return new HRegion
5945    * @throws IOException
5946    */
5947   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
5948       final Path rootDir, final HRegionInfo info, final HTableDescriptor htd, final WAL wal)
5949       throws IOException {
5950     return openHRegion(conf, fs, rootDir, info, htd, wal, null, null);
5951   }
5952 
5953   /**
5954    * Open a Region.
5955    * @param conf The Configuration object to use.
5956    * @param fs Filesystem to use
5957    * @param rootDir Root directory for HBase instance
5958    * @param info Info for region to be opened.
5959    * @param htd the table descriptor
5960    * @param wal WAL for region to use. This method will call
5961    * WAL#setSequenceNumber(long) passing the result of the call to
5962    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
5963    * up.  HRegionServer does this every time it opens a new region.
5964    * @param rsServices An interface we can request flushes against.
5965    * @param reporter An interface we can report progress against.
5966    * @return new HRegion
5967    * @throws IOException
5968    */
5969   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
5970       final Path rootDir, final HRegionInfo info, final HTableDescriptor htd, final WAL wal,
5971       final RegionServerServices rsServices, final CancelableProgressable reporter)
5972       throws IOException {
5973     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
5974     return openHRegion(conf, fs, rootDir, tableDir, info, htd, wal, rsServices, reporter);
5975   }
5976 
5977   /**
5978    * Open a Region.
5979    * @param conf The Configuration object to use.
5980    * @param fs Filesystem to use
5981    * @param rootDir Root directory for HBase instance
5982    * @param info Info for region to be opened.
5983    * @param htd the table descriptor
5984    * @param wal WAL for region to use. This method will call
5985    * WAL#setSequenceNumber(long) passing the result of the call to
5986    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
5987    * up.  HRegionServer does this every time it opens a new region.
5988    * @param rsServices An interface we can request flushes against.
5989    * @param reporter An interface we can report progress against.
5990    * @return new HRegion
5991    * @throws IOException
5992    */
5993   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
5994       final Path rootDir, final Path tableDir, final HRegionInfo info, final HTableDescriptor htd,
5995       final WAL wal, final RegionServerServices rsServices,
5996       final CancelableProgressable reporter)
5997       throws IOException {
5998     if (info == null) throw new NullPointerException("Passed region info is null");
5999     if (LOG.isDebugEnabled()) {
6000       LOG.debug("Opening region: " + info);
6001     }
6002     HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, rsServices);
6003     return r.openHRegion(reporter);
6004   }
6005 
6006 
6007   /**
6008    * Useful when reopening a closed region (normally for unit tests)
6009    * @param other original object
6010    * @param reporter An interface we can report progress against.
6011    * @return new HRegion
6012    * @throws IOException
6013    */
6014   public static HRegion openHRegion(final HRegion other, final CancelableProgressable reporter)
6015       throws IOException {
6016     HRegionFileSystem regionFs = other.getRegionFileSystem();
6017     HRegion r = newHRegion(regionFs.getTableDir(), other.getWAL(), regionFs.getFileSystem(),
6018         other.baseConf, other.getRegionInfo(), other.getTableDesc(), null);
6019     return r.openHRegion(reporter);
6020   }
6021 
6022   public static Region openHRegion(final Region other, final CancelableProgressable reporter)
6023         throws IOException {
6024     return openHRegion((HRegion)other, reporter);
6025   }
6026 
6027   /**
6028    * Open HRegion.
6029    * Calls initialize and sets sequenceId.
6030    * @return Returns <code>this</code>
6031    * @throws IOException
6032    */
6033   protected HRegion openHRegion(final CancelableProgressable reporter)
6034   throws IOException {
6035     // Refuse to open the region if we are missing local compression support
6036     checkCompressionCodecs();
6037     // Refuse to open the region if encryption configuration is incorrect or
6038     // codec support is missing
6039     checkEncryption();
6040     // Refuse to open the region if a required class cannot be loaded
6041     checkClassLoading();
6042     this.openSeqNum = initialize(reporter);
6043     this.setSequenceId(openSeqNum);
6044     if (wal != null && getRegionServerServices() != null && !writestate.readOnly
6045         && !recovering) {
6046       // Only write the region open event marker to WAL if (1) we are not read-only
6047       // (2) dist log replay is off or we are not recovering. In case region is
6048       // recovering, the open event will be written at setRecovering(false)
6049       writeRegionOpenMarker(wal, openSeqNum);
6050     }
6051     return this;
6052   }
6053 
6054   public static void warmupHRegion(final HRegionInfo info,
6055       final HTableDescriptor htd, final WAL wal, final Configuration conf,
6056       final RegionServerServices rsServices,
6057       final CancelableProgressable reporter)
6058       throws IOException {
6059 
6060     if (info == null) throw new NullPointerException("Passed region info is null");
6061 
6062     if (LOG.isDebugEnabled()) {
6063       LOG.debug("HRegion.Warming up region: " + info);
6064     }
6065 
6066     Path rootDir = FSUtils.getRootDir(conf);
6067     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
6068 
6069     FileSystem fs = null;
6070     if (rsServices != null) {
6071       fs = rsServices.getFileSystem();
6072     }
6073     if (fs == null) {
6074       fs = FileSystem.get(conf);
6075     }
6076 
6077     HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, rsServices);
6078     r.initializeWarmup(reporter);
6079     r.close();
6080   }
6081 
6082 
6083   private void checkCompressionCodecs() throws IOException {
6084     for (HColumnDescriptor fam: this.htableDescriptor.getColumnFamilies()) {
6085       CompressionTest.testCompression(fam.getCompression());
6086       CompressionTest.testCompression(fam.getCompactionCompression());
6087     }
6088   }
6089 
6090   private void checkEncryption() throws IOException {
6091     for (HColumnDescriptor fam: this.htableDescriptor.getColumnFamilies()) {
6092       EncryptionTest.testEncryption(conf, fam.getEncryptionType(), fam.getEncryptionKey());
6093     }
6094   }
6095 
6096   private void checkClassLoading() throws IOException {
6097     RegionSplitPolicy.getSplitPolicyClass(this.htableDescriptor, conf);
6098     RegionCoprocessorHost.testTableCoprocessorAttrs(conf, this.htableDescriptor);
6099   }
6100 
6101   /**
6102    * Create a daughter region from a given temp directory with the region data.
6103    * @param hri Spec. for daughter region to open.
6104    * @throws IOException
6105    */
6106   HRegion createDaughterRegionFromSplits(final HRegionInfo hri) throws IOException {
6107     // Move the files from the temporary .splits to the final /table/region directory
6108     fs.commitDaughterRegion(hri);
6109 
6110     // Create the daughter HRegion instance
6111     HRegion r = HRegion.newHRegion(this.fs.getTableDir(), this.getWAL(), fs.getFileSystem(),
6112         this.getBaseConf(), hri, this.getTableDesc(), rsServices);
6113     r.readRequestsCount.set(this.getReadRequestsCount() / 2);
6114     r.writeRequestsCount.set(this.getWriteRequestsCount() / 2);
6115     return r;
6116   }
6117 
6118   /**
6119    * Create a merged region given a temp directory with the region data.
6120    * @param region_b another merging region
6121    * @return merged HRegion
6122    * @throws IOException
6123    */
6124   HRegion createMergedRegionFromMerges(final HRegionInfo mergedRegionInfo,
6125       final HRegion region_b) throws IOException {
6126     HRegion r = HRegion.newHRegion(this.fs.getTableDir(), this.getWAL(),
6127         fs.getFileSystem(), this.getBaseConf(), mergedRegionInfo,
6128         this.getTableDesc(), this.rsServices);
6129     r.readRequestsCount.set(this.getReadRequestsCount()
6130         + region_b.getReadRequestsCount());
6131     r.writeRequestsCount.set(this.getWriteRequestsCount()
6132         + region_b.getWriteRequestsCount());
6133 
6134     this.fs.commitMergedRegion(mergedRegionInfo);
6135     return r;
6136   }
6137 
6138   /**
6139    * Inserts a new region's meta information into the passed
6140    * <code>meta</code> region. Used by the HMaster bootstrap code adding
6141    * new table to hbase:meta table.
6142    *
6143    * @param meta hbase:meta HRegion to be updated
6144    * @param r HRegion to add to <code>meta</code>
6145    *
6146    * @throws IOException
6147    */
6148   // TODO remove since only test and merge use this
6149   public static void addRegionToMETA(final HRegion meta, final HRegion r) throws IOException {
6150     meta.checkResources();
6151     // The row key is the region name
6152     byte[] row = r.getRegionInfo().getRegionName();
6153     final long now = EnvironmentEdgeManager.currentTime();
6154     final List<Cell> cells = new ArrayList<Cell>(2);
6155     cells.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
6156       HConstants.REGIONINFO_QUALIFIER, now,
6157       r.getRegionInfo().toByteArray()));
6158     // Set into the root table the version of the meta table.
6159     cells.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
6160       HConstants.META_VERSION_QUALIFIER, now,
6161       Bytes.toBytes(HConstants.META_VERSION)));
6162     meta.put(row, HConstants.CATALOG_FAMILY, cells);
6163   }
6164 
6165   /**
6166    * Computes the Path of the HRegion
6167    *
6168    * @param tabledir qualified path for table
6169    * @param name ENCODED region name
6170    * @return Path of HRegion directory
6171    */
6172   @Deprecated
6173   public static Path getRegionDir(final Path tabledir, final String name) {
6174     return new Path(tabledir, name);
6175   }
6176 
6177   /**
6178    * Computes the Path of the HRegion
6179    *
6180    * @param rootdir qualified path of HBase root directory
6181    * @param info HRegionInfo for the region
6182    * @return qualified path of region directory
6183    */
6184   @Deprecated
6185   @VisibleForTesting
6186   public static Path getRegionDir(final Path rootdir, final HRegionInfo info) {
6187     return new Path(
6188       FSUtils.getTableDir(rootdir, info.getTable()), info.getEncodedName());
6189   }
6190 
6191   /**
6192    * Determines if the specified row is within the row range specified by the
6193    * specified HRegionInfo
6194    *
6195    * @param info HRegionInfo that specifies the row range
6196    * @param row row to be checked
6197    * @return true if the row is within the range specified by the HRegionInfo
6198    */
6199   public static boolean rowIsInRange(HRegionInfo info, final byte [] row) {
6200     return ((info.getStartKey().length == 0) ||
6201         (Bytes.compareTo(info.getStartKey(), row) <= 0)) &&
6202         ((info.getEndKey().length == 0) ||
6203             (Bytes.compareTo(info.getEndKey(), row) > 0));
6204   }
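
  // Illustrative sketch (not part of the original source): how rowIsInRange above behaves for a
  // hypothetical region "info" whose key range is ["bbb", "ccc").
  //
  //   HRegion.rowIsInRange(info, Bytes.toBytes("bbq"));  // true, within [startKey, endKey)
  //   HRegion.rowIsInRange(info, Bytes.toBytes("aaa"));  // false, before the start key
  //   HRegion.rowIsInRange(info, Bytes.toBytes("ccc"));  // false, the end key is exclusive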
6205 
6206   /**
6207    * Merge two HRegions.  The regions must be adjacent and must not overlap.
6208    *
6209    * @return new merged HRegion
6210    * @throws IOException
6211    */
6212   public static HRegion mergeAdjacent(final HRegion srcA, final HRegion srcB)
6213   throws IOException {
6214     HRegion a = srcA;
6215     HRegion b = srcB;
6216 
6217     // Make sure that srcA comes first; important for key-ordering during
6218     // write of the merged file.
6219     if (srcA.getRegionInfo().getStartKey() == null) {
6220       if (srcB.getRegionInfo().getStartKey() == null) {
6221         throw new IOException("Cannot merge two regions with null start key");
6222       }
6223       // A's start key is null but B's isn't. Assume A comes before B
6224     } else if ((srcB.getRegionInfo().getStartKey() == null) ||
6225       (Bytes.compareTo(srcA.getRegionInfo().getStartKey(),
6226         srcB.getRegionInfo().getStartKey()) > 0)) {
6227       a = srcB;
6228       b = srcA;
6229     }
6230 
6231     if (!(Bytes.compareTo(a.getRegionInfo().getEndKey(),
6232         b.getRegionInfo().getStartKey()) == 0)) {
6233       throw new IOException("Cannot merge non-adjacent regions");
6234     }
6235     return merge(a, b);
6236   }
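
  // Illustrative sketch (not part of the original source): merging two adjacent regions of the same
  // table with the utility above. "regionA" and "regionB" are hypothetical open HRegions whose key
  // ranges are adjacent and non-overlapping.
  //
  //   HRegion merged = HRegion.mergeAdjacent(regionA, regionB);
  //   LOG.info("Merged into " + merged.getRegionInfo().getRegionNameAsString());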
6237 
6238   /**
6239    * Merge two regions whether they are adjacent or not.
6240    *
6241    * @param a region a
6242    * @param b region b
6243    * @return new merged region
6244    * @throws IOException
6245    */
6246   public static HRegion merge(final HRegion a, final HRegion b) throws IOException {
6247     if (!a.getRegionInfo().getTable().equals(b.getRegionInfo().getTable())) {
6248       throw new IOException("Regions do not belong to the same table");
6249     }
6250 
6251     FileSystem fs = a.getRegionFileSystem().getFileSystem();
6252     // Make sure each region's cache is empty
6253     a.flush(true);
6254     b.flush(true);
6255 
6256     // Compact each region so we only have one store file per family
6257     a.compact(true);
6258     if (LOG.isDebugEnabled()) {
6259       LOG.debug("Files for region: " + a);
6260       a.getRegionFileSystem().logFileSystemState(LOG);
6261     }
6262     b.compact(true);
6263     if (LOG.isDebugEnabled()) {
6264       LOG.debug("Files for region: " + b);
6265       b.getRegionFileSystem().logFileSystemState(LOG);
6266     }
6267 
6268     RegionMergeTransactionImpl rmt = new RegionMergeTransactionImpl(a, b, true);
6269     if (!rmt.prepare(null)) {
6270       throw new IOException("Unable to merge regions " + a + " and " + b);
6271     }
6272     HRegionInfo mergedRegionInfo = rmt.getMergedRegionInfo();
6273     LOG.info("starting merge of regions: " + a + " and " + b
6274         + " into new region " + mergedRegionInfo.getRegionNameAsString()
6275         + " with start key <"
6276         + Bytes.toStringBinary(mergedRegionInfo.getStartKey())
6277         + "> and end key <"
6278         + Bytes.toStringBinary(mergedRegionInfo.getEndKey()) + ">");
6279     HRegion dstRegion;
6280     try {
6281       dstRegion = (HRegion)rmt.execute(null, null);
6282     } catch (IOException ioe) {
6283       rmt.rollback(null, null);
6284       throw new IOException("Failed merging region " + a + " and " + b
6285           + ", and successfully rolled back");
6286     }
6287     dstRegion.compact(true);
6288 
6289     if (LOG.isDebugEnabled()) {
6290       LOG.debug("Files for new region");
6291       dstRegion.getRegionFileSystem().logFileSystemState(LOG);
6292     }
6293 
6294     if (dstRegion.getRegionFileSystem().hasReferences(dstRegion.getTableDesc())) {
6295       throw new IOException("Merged region " + dstRegion
6296           + " still has references after the compaction, is compaction canceled?");
6297     }
6298 
6299     // Archiving the 'A' region
6300     HFileArchiver.archiveRegion(a.getBaseConf(), fs, a.getRegionInfo());
6301     // Archiving the 'B' region
6302     HFileArchiver.archiveRegion(b.getBaseConf(), fs, b.getRegionInfo());
6303 
6304     LOG.info("merge completed. New region is " + dstRegion);
6305     return dstRegion;
6306   }
6307 
6308   @Override
6309   public Result get(final Get get) throws IOException {
6310     checkRow(get.getRow(), "Get");
6311     // Verify families are all valid
6312     if (get.hasFamilies()) {
6313       for (byte [] family: get.familySet()) {
6314         checkFamily(family);
6315       }
6316     } else { // Adding all families to scanner
6317       for (byte[] family: this.htableDescriptor.getFamiliesKeys()) {
6318         get.addFamily(family);
6319       }
6320     }
6321     List<Cell> results = get(get, true);
6322     boolean stale = this.getRegionInfo().getReplicaId() != 0;
6323     return Result.create(results, get.isCheckExistenceOnly() ? !results.isEmpty() : null, stale);
6324   }
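
  // Illustrative sketch (not part of the original source): a point read against the region using
  // the Get path above. Assumes an open HRegion "region" and existing family/qualifier byte arrays
  // "fam" and "qual"; all names are hypothetical.
  //
  //   Get g = new Get(Bytes.toBytes("row1"));
  //   g.addColumn(fam, qual);
  //   Result result = region.get(g);
  //   byte[] value = result.getValue(fam, qual);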
6325 
6326   @Override
6327   public List<Cell> get(Get get, boolean withCoprocessor) throws IOException {
6328 
6329     List<Cell> results = new ArrayList<Cell>();
6330 
6331     // pre-get CP hook
6332     if (withCoprocessor && (coprocessorHost != null)) {
6333        if (coprocessorHost.preGet(get, results)) {
6334          return results;
6335        }
6336     }
6337 
6338     Scan scan = new Scan(get);
6339 
6340     RegionScanner scanner = null;
6341     try {
6342       scanner = getScanner(scan);
6343       scanner.next(results);
6344     } finally {
6345       if (scanner != null)
6346         scanner.close();
6347     }
6348 
6349     // post-get CP hook
6350     if (withCoprocessor && (coprocessorHost != null)) {
6351       coprocessorHost.postGet(get, results);
6352     }
6353 
6354     // do after lock
6355     if (this.metricsRegion != null) {
6356       long totalSize = 0L;
6357       for (Cell cell : results) {
6358         totalSize += CellUtil.estimatedSerializedSizeOf(cell);
6359       }
6360       this.metricsRegion.updateGet(totalSize);
6361     }
6362 
6363     return results;
6364   }
6365 
6366   public void mutateRow(RowMutations rm) throws IOException {
6367     // Don't need nonces here - RowMutations only supports puts and deletes
6368     mutateRowsWithLocks(rm.getMutations(), Collections.singleton(rm.getRow()));
6369   }
6370 
6371   /**
6372    * Perform atomic mutations within the region w/o nonces.
6373    * See {@link #mutateRowsWithLocks(Collection, Collection, long, long)}
6374    */
6375   public void mutateRowsWithLocks(Collection<Mutation> mutations,
6376       Collection<byte[]> rowsToLock) throws IOException {
6377     mutateRowsWithLocks(mutations, rowsToLock, HConstants.NO_NONCE, HConstants.NO_NONCE);
6378   }
6379 
6380   /**
6381    * Perform atomic mutations within the region.
6382    * @param mutations The list of mutations to perform.
6383    * <code>mutations</code> can contain operations for multiple rows.
6384    * Caller has to ensure that all rows are contained in this region.
6385    * @param rowsToLock Rows to lock
6386    * @param nonceGroup Optional nonce group of the operation (client Id)
6387    * @param nonce Optional nonce of the operation (unique random id to ensure "more idempotence")
6388    * If multiple rows are locked care should be taken that
6389    * <code>rowsToLock</code> is sorted in order to avoid deadlocks.
6390    * @throws IOException
6391    */
6392   public void mutateRowsWithLocks(Collection<Mutation> mutations,
6393       Collection<byte[]> rowsToLock, long nonceGroup, long nonce) throws IOException {
6394     MultiRowMutationProcessor proc = new MultiRowMutationProcessor(mutations, rowsToLock);
6395     processRowsWithLocks(proc, -1, nonceGroup, nonce);
6396   }
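  // Illustrative sketch (not part of the original source): atomically applying mutations to
  // two rows of this region, with the row set kept sorted to avoid deadlocks, e.g.
  //   Put put = new Put(rowA);
  //   put.add(family, qualifier, value);
  //   Delete delete = new Delete(rowB);
  //   region.mutateRowsWithLocks(Arrays.<Mutation>asList(put, delete),
  //       Arrays.asList(rowA, rowB));
  // "region", "rowA", "rowB", "family", "qualifier" and "value" are assumed names.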
6397 
6398   /**
6399    * @return the current load statistics for the region
6400    */
6401   public ClientProtos.RegionLoadStats getRegionStats() {
6402     if (!regionStatsEnabled) {
6403       return null;
6404     }
6405     ClientProtos.RegionLoadStats.Builder stats = ClientProtos.RegionLoadStats.newBuilder();
6406     stats.setMemstoreLoad((int) (Math.min(100, (this.memstoreSize.get() * 100) / this
6407         .memstoreFlushSize)));
6408     stats.setHeapOccupancy((int) (rsServices.getHeapMemoryManager().getHeapOccupancyPercent() * 100));
6409     return stats.build();
6410   }
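  // Worked example (illustrative): with a memstore of 64MB and a flush size of 128MB, the
  // memstoreLoad above is min(100, 64 * 100 / 128) = 50; a heap occupancy of 0.42 reported by
  // the HeapMemoryManager is scaled to 42.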
6411 
6412   @Override
6413   public void processRowsWithLocks(RowProcessor<?,?> processor) throws IOException {
6414     processRowsWithLocks(processor, rowProcessorTimeout, HConstants.NO_NONCE,
6415       HConstants.NO_NONCE);
6416   }
6417 
6418   @Override
6419   public void processRowsWithLocks(RowProcessor<?,?> processor, long nonceGroup, long nonce)
6420       throws IOException {
6421     processRowsWithLocks(processor, rowProcessorTimeout, nonceGroup, nonce);
6422   }
6423 
6424   @Override
6425   public void processRowsWithLocks(RowProcessor<?,?> processor, long timeout,
6426       long nonceGroup, long nonce) throws IOException {
6427 
6428     for (byte[] row : processor.getRowsToLock()) {
6429       checkRow(row, "processRowsWithLocks");
6430     }
6431     if (!processor.readOnly()) {
6432       checkReadOnly();
6433     }
6434     checkResources();
6435 
6436     startRegionOperation();
6437     WALEdit walEdit = new WALEdit();
6438 
6439     // 1. Run pre-process hook
6440     try {
6441       processor.preProcess(this, walEdit);
6442     } catch (IOException e) {
6443       closeRegionOperation();
6444       throw e;
6445     }
6446     // Short circuit the read only case
6447     if (processor.readOnly()) {
6448       try {
6449         long now = EnvironmentEdgeManager.currentTime();
6450         doProcessRowWithTimeout(
6451             processor, now, this, null, null, timeout);
6452         processor.postProcess(this, walEdit, true);
6453       } finally {
6454         closeRegionOperation();
6455       }
6456       return;
6457     }
6458 
6459     MultiVersionConsistencyControl.WriteEntry writeEntry = null;
6460     boolean locked;
6461     boolean walSyncSuccessful = false;
6462     List<RowLock> acquiredRowLocks;
6463     long addedSize = 0;
6464     List<Mutation> mutations = new ArrayList<Mutation>();
6465     List<Cell> memstoreCells = new ArrayList<Cell>();
6466     Collection<byte[]> rowsToLock = processor.getRowsToLock();
6467     long mvccNum = 0;
6468     WALKey walKey = null;
6469     try {
6470       // 2. Acquire the row lock(s)
6471       acquiredRowLocks = new ArrayList<RowLock>(rowsToLock.size());
6472       for (byte[] row : rowsToLock) {
6473         // Attempt to lock all involved rows, throw if any lock times out
6474         acquiredRowLocks.add(getRowLock(row));
6475       }
6476       // 3. Region lock
6477       lock(this.updatesLock.readLock(), acquiredRowLocks.size() == 0 ? 1 : acquiredRowLocks.size());
6478       locked = true;
6479       // Get a mvcc write number
6480       mvccNum = MultiVersionConsistencyControl.getPreAssignedWriteNumber(this.sequenceId);
6481 
6482       long now = EnvironmentEdgeManager.currentTime();
6483       try {
6484         // 4. Let the processor scan the rows, generate mutations and add
6485         //    waledits
6486         doProcessRowWithTimeout(
6487             processor, now, this, mutations, walEdit, timeout);
6488 
6489         if (!mutations.isEmpty()) {
6490           // 5. Start mvcc transaction
6491           writeEntry = mvcc.beginMemstoreInsertWithSeqNum(mvccNum);
6492           // 6. Call the preBatchMutate hook
6493           processor.preBatchMutate(this, walEdit);
6494           // 7. Apply to memstore
6495           for (Mutation m : mutations) {
6496             // Handle any tag based cell features
6497             rewriteCellTags(m.getFamilyCellMap(), m);
6498 
6499             for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) {
6500               Cell cell = cellScanner.current();
6501               CellUtil.setSequenceId(cell, mvccNum);
6502               Store store = getStore(cell);
6503               if (store == null) {
6504                 checkFamily(CellUtil.cloneFamily(cell));
6505                 // unreachable
6506               }
6507               Pair<Long, Cell> ret = store.add(cell);
6508               addedSize += ret.getFirst();
6509               memstoreCells.add(ret.getSecond());
6510             }
6511           }
6512 
6513           long txid = 0;
6514           // 8. Append no sync
6515           if (!walEdit.isEmpty()) {
6516             // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
6517             walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
6518               this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now,
6519               processor.getClusterIds(), nonceGroup, nonce);
6520             txid = this.wal.append(this.htableDescriptor, this.getRegionInfo(),
6521               walKey, walEdit, getSequenceId(), true, memstoreCells);
6522           }
6523           if(walKey == null){
6524             // since we use wal sequence Id as mvcc, for SKIP_WAL changes we need a "faked" WALEdit
6525             // to get a sequence id assigned which is done by FSWALEntry#stampRegionSequenceId
6526             walKey = this.appendEmptyEdit(this.wal, memstoreCells);
6527           }
6528           // 9. Release region lock
6529           if (locked) {
6530             this.updatesLock.readLock().unlock();
6531             locked = false;
6532           }
6533 
6534           // 10. Release row lock(s)
6535           releaseRowLocks(acquiredRowLocks);
6536 
6537           // 11. Sync edit log
6538           if (txid != 0) {
6539             syncOrDefer(txid, getEffectiveDurability(processor.useDurability()));
6540           }
6541           walSyncSuccessful = true;
6542           // 12. call postBatchMutate hook
6543           processor.postBatchMutate(this);
6544         }
6545       } finally {
6546         if (!mutations.isEmpty() && !walSyncSuccessful) {
6547           LOG.warn("Wal sync failed. Roll back " + mutations.size() +
6548               " memstore keyvalues for row(s):" + StringUtils.byteToHexString(
6549               processor.getRowsToLock().iterator().next()) + "...");
6550           for (Mutation m : mutations) {
6551             for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) {
6552               Cell cell = cellScanner.current();
6553               getStore(cell).rollback(cell);
6554             }
6555           }
6556         }
6557         // 13. Roll mvcc forward
6558         if (writeEntry != null) {
6559           mvcc.completeMemstoreInsertWithSeqNum(writeEntry, walKey);
6560         }
6561         if (locked) {
6562           this.updatesLock.readLock().unlock();
6563         }
6564         // release locks if some were acquired but another timed out
6565         releaseRowLocks(acquiredRowLocks);
6566       }
6567 
6568       // 14. Run post-process hook
6569       processor.postProcess(this, walEdit, walSyncSuccessful);
6570 
6571     } finally {
6572       closeRegionOperation();
6573       if (!mutations.isEmpty() &&
6574           isFlushSize(this.addAndGetGlobalMemstoreSize(addedSize))) {
6575         requestFlush();
6576       }
6577     }
6578   }
6579 
6580   private void doProcessRowWithTimeout(final RowProcessor<?,?> processor,
6581                                        final long now,
6582                                        final HRegion region,
6583                                        final List<Mutation> mutations,
6584                                        final WALEdit walEdit,
6585                                        final long timeout) throws IOException {
6586     // Short circuit the no time bound case.
6587     if (timeout < 0) {
6588       try {
6589         processor.process(now, region, mutations, walEdit);
6590       } catch (IOException e) {
6591         LOG.warn("RowProcessor:" + processor.getClass().getName() +
6592             " throws Exception on row(s):" +
6593             Bytes.toStringBinary(
6594               processor.getRowsToLock().iterator().next()) + "...", e);
6595         throw e;
6596       }
6597       return;
6598     }
6599 
6600     // Case with time bound
6601     FutureTask<Void> task =
6602       new FutureTask<Void>(new Callable<Void>() {
6603         @Override
6604         public Void call() throws IOException {
6605           try {
6606             processor.process(now, region, mutations, walEdit);
6607             return null;
6608           } catch (IOException e) {
6609             LOG.warn("RowProcessor:" + processor.getClass().getName() +
6610                 " throws Exception on row(s):" +
6611                 Bytes.toStringBinary(
6612                     processor.getRowsToLock().iterator().next()) + "...", e);
6613             throw e;
6614           }
6615         }
6616       });
6617     rowProcessorExecutor.execute(task);
6618     try {
6619       task.get(timeout, TimeUnit.MILLISECONDS);
6620     } catch (TimeoutException te) {
6621       LOG.error("RowProcessor timeout:" + timeout + " ms on row(s):" +
6622           Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) +
6623           "...");
6624       throw new IOException(te);
6625     } catch (Exception e) {
6626       throw new IOException(e);
6627     }
6628   }
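  // Illustrative sketch (not part of the original source): processRowsWithLocks is driven by a
  // RowProcessor implementation that reads the locked rows and emits mutations, e.g. a
  // hypothetical subclass of BaseRowProcessor overriding
  //   public void process(long now, HRegion region, List<Mutation> mutations, WALEdit walEdit) {
  //     // read current state with region.get(...), then add Puts/Deletes to "mutations"
  //   }
  // When the timeout passed above is non-negative, the processor runs on rowProcessorExecutor
  // and is abandoned with an IOException if it does not finish within the timeout.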
6629 
6630   public Result append(Append append) throws IOException {
6631     return append(append, HConstants.NO_NONCE, HConstants.NO_NONCE);
6632   }
6633 
6634   // TODO: There's a lot of boiler plate code identical to increment.
6635   // We should refactor append and increment as local get-mutate-put
6636   // transactions, so all stores only go through one code path for puts.
6637 
6638   @Override
6639   public Result append(Append append, long nonceGroup, long nonce) throws IOException {
6640     byte[] row = append.getRow();
6641     checkRow(row, "append");
6642     boolean flush = false;
6643     Durability durability = getEffectiveDurability(append.getDurability());
6644     boolean writeToWAL = durability != Durability.SKIP_WAL;
6645     WALEdit walEdits = null;
6646     List<Cell> allKVs = new ArrayList<Cell>(append.size());
6647     Map<Store, List<Cell>> tempMemstore = new HashMap<Store, List<Cell>>();
6648     long size = 0;
6649     long txid = 0;
6650 
6651     checkReadOnly();
6652     checkResources();
6653     // Lock row
6654     startRegionOperation(Operation.APPEND);
6655     this.writeRequestsCount.increment();
6656     long mvccNum = 0;
6657     WriteEntry w = null;
6658     WALKey walKey = null;
6659     RowLock rowLock = null;
6660     List<Cell> memstoreCells = new ArrayList<Cell>();
6661     boolean doRollBackMemstore = false;
6662     try {
6663       rowLock = getRowLock(row);
6664       try {
6665         lock(this.updatesLock.readLock());
6666         try {
6667           // wait for all prior MVCC transactions to finish - while we hold the row lock
6668           // (so that we are guaranteed to see the latest state)
6669           mvcc.waitForPreviousTransactionsComplete();
6670           if (this.coprocessorHost != null) {
6671             Result r = this.coprocessorHost.preAppendAfterRowLock(append);
6672             if(r!= null) {
6673               return r;
6674             }
6675           }
6676           // now start my own transaction
6677           mvccNum = MultiVersionConsistencyControl.getPreAssignedWriteNumber(this.sequenceId);
6678           w = mvcc.beginMemstoreInsertWithSeqNum(mvccNum);
6679           long now = EnvironmentEdgeManager.currentTime();
6680           // Process each family
6681           for (Map.Entry<byte[], List<Cell>> family : append.getFamilyCellMap().entrySet()) {
6682 
6683             Store store = stores.get(family.getKey());
6684             List<Cell> kvs = new ArrayList<Cell>(family.getValue().size());
6685 
6686             // Sort the cells so that they match the order that they
6687             // appear in the Get results. Otherwise, we won't be able to
6688             // find the existing values if the cells are not specified
6689             // in order by the client since cells are in an array list.
6690             Collections.sort(family.getValue(), store.getComparator());
6691             // Get previous values for all columns in this family
6692             Get get = new Get(row);
6693             for (Cell cell : family.getValue()) {
6694               get.addColumn(family.getKey(), CellUtil.cloneQualifier(cell));
6695             }
6696             List<Cell> results = get(get, false);
6697             // Iterate the input columns and update existing values if they were
6698             // found, otherwise add new column initialized to the append value
6699 
6700             // Avoid as much copying as possible. We may need to rewrite and
6701             // consolidate tags. Bytes are only copied once.
6702             // Would be nice if KeyValue had scatter/gather logic
6703             int idx = 0;
6704             for (Cell cell : family.getValue()) {
6705               Cell newCell;
6706               Cell oldCell = null;
6707               if (idx < results.size()
6708                   && CellUtil.matchingQualifier(results.get(idx), cell)) {
6709                 oldCell = results.get(idx);
6710                 long ts = Math.max(now, oldCell.getTimestamp());
6711 
6712                 // Process cell tags
6713                 List<Tag> newTags = new ArrayList<Tag>();
6714 
6715                 // Make a union of the set of tags in the old and new KVs
6716 
6717                 if (oldCell.getTagsLength() > 0) {
6718                   Iterator<Tag> i = CellUtil.tagsIterator(oldCell.getTagsArray(),
6719                     oldCell.getTagsOffset(), oldCell.getTagsLength());
6720                   while (i.hasNext()) {
6721                     newTags.add(i.next());
6722                   }
6723                 }
6724                 if (cell.getTagsLength() > 0) {
6725                   Iterator<Tag> i  = CellUtil.tagsIterator(cell.getTagsArray(),
6726                     cell.getTagsOffset(), cell.getTagsLength());
6727                   while (i.hasNext()) {
6728                     newTags.add(i.next());
6729                   }
6730                 }
6731 
6732                 // Cell TTL handling
6733 
6734                 if (append.getTTL() != Long.MAX_VALUE) {
6735                   // Add the new TTL tag
6736                   newTags.add(new Tag(TagType.TTL_TAG_TYPE, Bytes.toBytes(append.getTTL())));
6737                 }
6738 
6739                 // Rebuild tags
6740                 byte[] tagBytes = Tag.fromList(newTags);
6741 
6742                 // allocate a new cell once, sized for the combined value and tags; fields are filled in below
6743                 newCell = new KeyValue(row.length, cell.getFamilyLength(),
6744                     cell.getQualifierLength(), ts, KeyValue.Type.Put,
6745                     oldCell.getValueLength() + cell.getValueLength(),
6746                     tagBytes.length);
6747                 // copy in row, family, and qualifier
6748                 System.arraycopy(cell.getRowArray(), cell.getRowOffset(),
6749                   newCell.getRowArray(), newCell.getRowOffset(), cell.getRowLength());
6750                 System.arraycopy(cell.getFamilyArray(), cell.getFamilyOffset(),
6751                   newCell.getFamilyArray(), newCell.getFamilyOffset(),
6752                   cell.getFamilyLength());
6753                 System.arraycopy(cell.getQualifierArray(), cell.getQualifierOffset(),
6754                   newCell.getQualifierArray(), newCell.getQualifierOffset(),
6755                   cell.getQualifierLength());
6756                 // copy in the value
6757                 System.arraycopy(oldCell.getValueArray(), oldCell.getValueOffset(),
6758                   newCell.getValueArray(), newCell.getValueOffset(),
6759                   oldCell.getValueLength());
6760                 System.arraycopy(cell.getValueArray(), cell.getValueOffset(),
6761                   newCell.getValueArray(),
6762                   newCell.getValueOffset() + oldCell.getValueLength(),
6763                   cell.getValueLength());
6764                 // Copy in tag data
6765                 System.arraycopy(tagBytes, 0, newCell.getTagsArray(), newCell.getTagsOffset(),
6766                   tagBytes.length);
6767                 idx++;
6768               } else {
6769                 // Append's KeyValue.Type==Put and ts==HConstants.LATEST_TIMESTAMP
6770                 CellUtil.updateLatestStamp(cell, now);
6771 
6772                 // Cell TTL handling
6773 
6774                 if (append.getTTL() != Long.MAX_VALUE) {
6775                   List<Tag> newTags = new ArrayList<Tag>(1);
6776                   newTags.add(new Tag(TagType.TTL_TAG_TYPE, Bytes.toBytes(append.getTTL())));
6777                   // Add the new TTL tag
6778                   newCell = new KeyValue(cell.getRowArray(), cell.getRowOffset(),
6779                       cell.getRowLength(),
6780                     cell.getFamilyArray(), cell.getFamilyOffset(),
6781                       cell.getFamilyLength(),
6782                     cell.getQualifierArray(), cell.getQualifierOffset(),
6783                       cell.getQualifierLength(),
6784                     cell.getTimestamp(), KeyValue.Type.codeToType(cell.getTypeByte()),
6785                     cell.getValueArray(), cell.getValueOffset(), cell.getValueLength(),
6786                     newTags);
6787                 } else {
6788                   newCell = cell;
6789                 }
6790               }
6791 
6792               CellUtil.setSequenceId(newCell, mvccNum);
6793               // Give coprocessors a chance to update the new cell
6794               if (coprocessorHost != null) {
6795                 newCell = coprocessorHost.postMutationBeforeWAL(RegionObserver.MutationType.APPEND,
6796                     append, oldCell, newCell);
6797               }
6798               kvs.add(newCell);
6799 
6800               // Append update to WAL
6801               if (writeToWAL) {
6802                 if (walEdits == null) {
6803                   walEdits = new WALEdit();
6804                 }
6805                 walEdits.add(newCell);
6806               }
6807             }
6808 
6809             //store the kvs to the temporary memstore before writing WAL
6810             tempMemstore.put(store, kvs);
6811           }
6812 
6813           //Actually write to Memstore now
6814           for (Map.Entry<Store, List<Cell>> entry : tempMemstore.entrySet()) {
6815             Store store = entry.getKey();
6816             if (store.getFamily().getMaxVersions() == 1) {
6817               // upsert if VERSIONS for this CF == 1
6818               size += store.upsert(entry.getValue(), getSmallestReadPoint());
6819               memstoreCells.addAll(entry.getValue());
6820             } else {
6821               // otherwise keep older versions around
6822               for (Cell cell: entry.getValue()) {
6823                 Pair<Long, Cell> ret = store.add(cell);
6824                 size += ret.getFirst();
6825                 memstoreCells.add(ret.getSecond());
6826                 doRollBackMemstore = true;
6827               }
6828             }
6829             allKVs.addAll(entry.getValue());
6830           }
6831 
6832           // Actually write to WAL now
6833           if (writeToWAL) {
6834             // Using default cluster id, as this can only happen in the originating
6835             // cluster. A slave cluster receives the final value (not the delta)
6836             // as a Put.
6837             // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
6838             walKey = new HLogKey(getRegionInfo().getEncodedNameAsBytes(),
6839               this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, nonceGroup, nonce);
6840             txid = this.wal.append(this.htableDescriptor, getRegionInfo(), walKey, walEdits,
6841               this.sequenceId, true, memstoreCells);
6842           } else {
6843             recordMutationWithoutWal(append.getFamilyCellMap());
6844           }
6845           if (walKey == null) {
6846             // Append a faked WALEdit in order for SKIP_WAL updates to get mvcc assigned
6847             walKey = this.appendEmptyEdit(this.wal, memstoreCells);
6848           }
6849           size = this.addAndGetGlobalMemstoreSize(size);
6850           flush = isFlushSize(size);
6851         } finally {
6852           this.updatesLock.readLock().unlock();
6853         }
6854       } finally {
6855         rowLock.release();
6856         rowLock = null;
6857       }
6858       // sync the transaction log outside the rowlock
6859       if(txid != 0){
6860         syncOrDefer(txid, durability);
6861       }
6862       doRollBackMemstore = false;
6863     } finally {
6864       if (rowLock != null) {
6865         rowLock.release();
6866       }
6867       // if the wal sync was unsuccessful, remove keys from memstore
6868       if (doRollBackMemstore) {
6869         rollbackMemstore(memstoreCells);
6870       }
6871       if (w != null) {
6872         mvcc.completeMemstoreInsertWithSeqNum(w, walKey);
6873       }
6874       closeRegionOperation(Operation.APPEND);
6875     }
6876 
6877     if (this.metricsRegion != null) {
6878       this.metricsRegion.updateAppend();
6879     }
6880 
6881     if (flush) {
6882       // Request a cache flush. Do it outside update lock.
6883       requestFlush();
6884     }
6885 
6886 
6887     return append.isReturnResults() ? Result.create(allKVs) : null;
6888   }
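  // Illustrative sketch (not part of the original source): an Append that concatenates bytes
  // onto an existing column value, e.g.
  //   Append a = new Append(row);
  //   a.add(family, qualifier, Bytes.toBytes("-suffix"));
  //   Result r = region.append(a, HConstants.NO_NONCE, HConstants.NO_NONCE);
  // "region", "row", "family" and "qualifier" are assumed names.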
6889 
6890   public Result increment(Increment increment) throws IOException {
6891     return increment(increment, HConstants.NO_NONCE, HConstants.NO_NONCE);
6892   }
6893 
6894   // TODO: There's a lot of boiler plate code identical to append.
6895   // We should refactor append and increment as local get-mutate-put
6896   // transactions, so all stores only go through one code path for puts.
6897 
6898   @Override
6899   public Result increment(Increment increment, long nonceGroup, long nonce)
6900   throws IOException {
6901     byte [] row = increment.getRow();
6902     checkRow(row, "increment");
6903     TimeRange tr = increment.getTimeRange();
6904     boolean flush = false;
6905     Durability durability = getEffectiveDurability(increment.getDurability());
6906     boolean writeToWAL = durability != Durability.SKIP_WAL;
6907     WALEdit walEdits = null;
6908     List<Cell> allKVs = new ArrayList<Cell>(increment.size());
6909     Map<Store, List<Cell>> tempMemstore = new HashMap<Store, List<Cell>>();
6910 
6911     long size = 0;
6912     long txid = 0;
6913 
6914     checkReadOnly();
6915     checkResources();
6916     // Lock row
6917     startRegionOperation(Operation.INCREMENT);
6918     this.writeRequestsCount.increment();
6919     RowLock rowLock = null;
6920     WriteEntry w = null;
6921     WALKey walKey = null;
6922     long mvccNum = 0;
6923     List<Cell> memstoreCells = new ArrayList<Cell>();
6924     boolean doRollBackMemstore = false;
6925     try {
6926       rowLock = getRowLock(row);
6927       try {
6928         lock(this.updatesLock.readLock());
6929         try {
6930           // wait for all prior MVCC transactions to finish - while we hold the row lock
6931           // (so that we are guaranteed to see the latest state)
6932           mvcc.waitForPreviousTransactionsComplete();
6933           if (this.coprocessorHost != null) {
6934             Result r = this.coprocessorHost.preIncrementAfterRowLock(increment);
6935             if (r != null) {
6936               return r;
6937             }
6938           }
6939           // now start my own transaction
6940           mvccNum = MultiVersionConsistencyControl.getPreAssignedWriteNumber(this.sequenceId);
6941           w = mvcc.beginMemstoreInsertWithSeqNum(mvccNum);
6942           long now = EnvironmentEdgeManager.currentTime();
6943           // Process each family
6944           for (Map.Entry<byte [], List<Cell>> family:
6945               increment.getFamilyCellMap().entrySet()) {
6946 
6947             Store store = stores.get(family.getKey());
6948             List<Cell> kvs = new ArrayList<Cell>(family.getValue().size());
6949 
6950             // Sort the cells so that they match the order that they
6951             // appear in the Get results. Otherwise, we won't be able to
6952             // find the existing values if the cells are not specified
6953             // in order by the client since cells are in an array list.
6954             Collections.sort(family.getValue(), store.getComparator());
6955             // Get previous values for all columns in this family
6956             Get get = new Get(row);
6957             for (Cell cell: family.getValue()) {
6958               get.addColumn(family.getKey(),  CellUtil.cloneQualifier(cell));
6959             }
6960             get.setTimeRange(tr.getMin(), tr.getMax());
6961             List<Cell> results = get(get, false);
6962 
6963             // Iterate the input columns and update existing values if they were
6964             // found, otherwise add new column initialized to the increment amount
6965             int idx = 0;
6966             List<Cell> edits = family.getValue();
6967             for (int i = 0; i < edits.size(); i++) {
6968               Cell cell = edits.get(i);
6969               long amount = Bytes.toLong(CellUtil.cloneValue(cell));
6970               boolean noWriteBack = (amount == 0);
6971               List<Tag> newTags = new ArrayList<Tag>();
6972 
6973               // Carry forward any tags that might have been added by a coprocessor
6974               if (cell.getTagsLength() > 0) {
6975                 Iterator<Tag> itr = CellUtil.tagsIterator(cell.getTagsArray(),
6976                   cell.getTagsOffset(), cell.getTagsLength());
6977                 while (itr.hasNext()) {
6978                   newTags.add(itr.next());
6979                 }
6980               }
6981 
6982               Cell c = null;
6983               long ts = now;
6984               if (idx < results.size() && CellUtil.matchingQualifier(results.get(idx), cell)) {
6985                 c = results.get(idx);
6986                 ts = Math.max(now, c.getTimestamp());
6987                 if(c.getValueLength() == Bytes.SIZEOF_LONG) {
6988                   amount += Bytes.toLong(c.getValueArray(), c.getValueOffset(), Bytes.SIZEOF_LONG);
6989                 } else {
6990                   // throw DoNotRetryIOException instead of IllegalArgumentException
6991                   throw new DoNotRetryIOException(
6992                       "Attempted to increment field that isn't 64 bits wide");
6993                 }
6994                 // Carry tags forward from previous version
6995                 if (c.getTagsLength() > 0) {
6996                   Iterator<Tag> itr = CellUtil.tagsIterator(c.getTagsArray(),
6997                     c.getTagsOffset(), c.getTagsLength());
6998                   while (itr.hasNext()) {
6999                     newTags.add(itr.next());
7000                   }
7001                 }
7002                 if (i < ( edits.size() - 1) && !CellUtil.matchingQualifier(cell, edits.get(i + 1)))
7003                   idx++;
7004               }
7005 
7006               // Append new incremented KeyValue to list
7007               byte[] q = CellUtil.cloneQualifier(cell);
7008               byte[] val = Bytes.toBytes(amount);
7009 
7010               // Add the TTL tag if the mutation carried one
7011               if (increment.getTTL() != Long.MAX_VALUE) {
7012                 newTags.add(new Tag(TagType.TTL_TAG_TYPE, Bytes.toBytes(increment.getTTL())));
7013               }
7014 
7015               Cell newKV = new KeyValue(row, 0, row.length,
7016                 family.getKey(), 0, family.getKey().length,
7017                 q, 0, q.length,
7018                 ts,
7019                 KeyValue.Type.Put,
7020                 val, 0, val.length,
7021                 newTags);
7022 
7023               CellUtil.setSequenceId(newKV, mvccNum);
7024 
7025               // Give coprocessors a chance to update the new cell
7026               if (coprocessorHost != null) {
7027                 newKV = coprocessorHost.postMutationBeforeWAL(
7028                     RegionObserver.MutationType.INCREMENT, increment, c, newKV);
7029               }
7030               allKVs.add(newKV);
7031 
7032               if (!noWriteBack) {
7033                 kvs.add(newKV);
7034 
7035                 // Prepare WAL updates
7036                 if (writeToWAL) {
7037                   if (walEdits == null) {
7038                     walEdits = new WALEdit();
7039                   }
7040                   walEdits.add(newKV);
7041                 }
7042               }
7043             }
7044 
7045             //store the kvs to the temporary memstore before writing WAL
7046             if (!kvs.isEmpty()) {
7047               tempMemstore.put(store, kvs);
7048             }
7049           }
7050 
7051           //Actually write to Memstore now
7052           if (!tempMemstore.isEmpty()) {
7053             for (Map.Entry<Store, List<Cell>> entry : tempMemstore.entrySet()) {
7054               Store store = entry.getKey();
7055               if (store.getFamily().getMaxVersions() == 1) {
7056                 // upsert if VERSIONS for this CF == 1
7057                 size += store.upsert(entry.getValue(), getSmallestReadPoint());
7058                 memstoreCells.addAll(entry.getValue());
7059               } else {
7060                 // otherwise keep older versions around
7061                 for (Cell cell : entry.getValue()) {
7062                   Pair<Long, Cell> ret = store.add(cell);
7063                   size += ret.getFirst();
7064                   memstoreCells.add(ret.getSecond());
7065                   doRollBackMemstore = true;
7066                 }
7067               }
7068             }
7069             size = this.addAndGetGlobalMemstoreSize(size);
7070             flush = isFlushSize(size);
7071           }
7072 
7073           // Actually write to WAL now
7074           if (walEdits != null && !walEdits.isEmpty()) {
7075             if (writeToWAL) {
7076               // Using default cluster id, as this can only happen in the originating
7077               // cluster. A slave cluster receives the final value (not the delta)
7078               // as a Put.
7079               // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
7080               walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
7081                 this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, nonceGroup, nonce);
7082               txid = this.wal.append(this.htableDescriptor, this.getRegionInfo(),
7083                 walKey, walEdits, getSequenceId(), true, memstoreCells);
7084             } else {
7085               recordMutationWithoutWal(increment.getFamilyCellMap());
7086             }
7087           }
7088           if(walKey == null){
7089             // Append a faked WALEdit in order for SKIP_WAL updates to get mvccNum assigned
7090             walKey = this.appendEmptyEdit(this.wal, memstoreCells);
7091           }
7092         } finally {
7093           this.updatesLock.readLock().unlock();
7094         }
7095       } finally {
7096         rowLock.release();
7097         rowLock = null;
7098       }
7099       // sync the transaction log outside the rowlock
7100       if(txid != 0){
7101         syncOrDefer(txid, durability);
7102       }
7103       doRollBackMemstore = false;
7104     } finally {
7105       if (rowLock != null) {
7106         rowLock.release();
7107       }
7108       // if the wal sync was unsuccessful, remove keys from memstore
7109       if (doRollBackMemstore) {
7110         rollbackMemstore(memstoreCells);
7111       }
7112       if (w != null) {
7113         mvcc.completeMemstoreInsertWithSeqNum(w, walKey);
7114       }
7115       closeRegionOperation(Operation.INCREMENT);
7116       if (this.metricsRegion != null) {
7117         this.metricsRegion.updateIncrement();
7118       }
7119     }
7120 
7121     if (flush) {
7122       // Request a cache flush.  Do it outside update lock.
7123       requestFlush();
7124     }
7125     return increment.isReturnResults() ? Result.create(allKVs) : null;
7126   }
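  // Illustrative sketch (not part of the original source): incrementing a 64-bit counter
  // column by one, e.g.
  //   Increment inc = new Increment(row);
  //   inc.addColumn(family, qualifier, 1L);
  //   Result r = region.increment(inc, HConstants.NO_NONCE, HConstants.NO_NONCE);
  // Any existing value that is not exactly 8 bytes wide causes the DoNotRetryIOException above.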
7127 
7128   //
7129   // New HBASE-880 Helpers
7130   //
7131 
7132   private void checkFamily(final byte [] family)
7133   throws NoSuchColumnFamilyException {
7134     if (!this.htableDescriptor.hasFamily(family)) {
7135       throw new NoSuchColumnFamilyException("Column family " +
7136           Bytes.toString(family) + " does not exist in region " + this
7137           + " in table " + this.htableDescriptor);
7138     }
7139   }
7140 
7141   public static final long FIXED_OVERHEAD = ClassSize.align(
7142       ClassSize.OBJECT +
7143       ClassSize.ARRAY +
7144       44 * ClassSize.REFERENCE + 2 * Bytes.SIZEOF_INT +
7145       (14 * Bytes.SIZEOF_LONG) +
7146       5 * Bytes.SIZEOF_BOOLEAN);
7147 
7148   // woefully out of date - currently missing:
7149   // 1 x HashMap - coprocessorServiceHandlers
7150   // 6 x Counter - numMutationsWithoutWAL, dataInMemoryWithoutWAL,
7151   //   checkAndMutateChecksPassed, checkAndMutateChecksFailed, readRequestsCount,
7152   //   writeRequestsCount
7153   // 1 x HRegion$WriteState - writestate
7154   // 1 x RegionCoprocessorHost - coprocessorHost
7155   // 1 x RegionSplitPolicy - splitPolicy
7156   // 1 x MetricsRegion - metricsRegion
7157   // 1 x MetricsRegionWrapperImpl - metricsRegionWrapper
7158   public static final long DEEP_OVERHEAD = FIXED_OVERHEAD +
7159       ClassSize.OBJECT + // closeLock
7160       (2 * ClassSize.ATOMIC_BOOLEAN) + // closed, closing
7161       (3 * ClassSize.ATOMIC_LONG) + // memStoreSize, numPutsWithoutWAL, dataInMemoryWithoutWAL
7162       (2 * ClassSize.CONCURRENT_HASHMAP) +  // lockedRows, scannerReadPoints
7163       WriteState.HEAP_SIZE + // writestate
7164       ClassSize.CONCURRENT_SKIPLISTMAP + ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + // stores
7165       (2 * ClassSize.REENTRANT_LOCK) + // lock, updatesLock
7166       MultiVersionConsistencyControl.FIXED_SIZE // mvcc
7167       + ClassSize.TREEMAP // maxSeqIdInStores
7168       + 2 * ClassSize.ATOMIC_INTEGER // majorInProgress, minorInProgress
7169       ;
7170 
7171   @Override
7172   public long heapSize() {
7173     long heapSize = DEEP_OVERHEAD;
7174     for (Store store : this.stores.values()) {
7175       heapSize += store.heapSize();
7176     }
7177     // this does not take into account row locks, recent flushes, mvcc entries, and more
7178     return heapSize;
7179   }
7180 
7181   /*
7182    * This method calls System.exit.
7183    * @param message Message to print out.  May be null.
7184    */
7185   private static void printUsageAndExit(final String message) {
7186     if (message != null && message.length() > 0) System.out.println(message);
7187     System.out.println("Usage: HRegion CATALOG_TABLE_DIR [major_compact]");
7188     System.out.println("Options:");
7189     System.out.println(" major_compact  Pass this option to major compact " +
7190       "passed region.");
7191     System.out.println("Default outputs scan of passed region.");
7192     System.exit(1);
7193   }
7194 
7195   @Override
7196   public boolean registerService(Service instance) {
7197     /*
7198      * No stacking of instances is allowed for a single service name
7199      */
7200     Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType();
7201     if (coprocessorServiceHandlers.containsKey(serviceDesc.getFullName())) {
7202       LOG.error("Coprocessor service "+serviceDesc.getFullName()+
7203           " already registered, rejecting request from "+instance
7204       );
7205       return false;
7206     }
7207 
7208     coprocessorServiceHandlers.put(serviceDesc.getFullName(), instance);
7209     if (LOG.isDebugEnabled()) {
7210       LOG.debug("Registered coprocessor service: region=" +
7211           Bytes.toStringBinary(getRegionInfo().getRegionName()) +
7212           " service=" + serviceDesc.getFullName());
7213     }
7214     return true;
7215   }
7216 
7217   @Override
7218   public Message execService(RpcController controller, CoprocessorServiceCall call)
7219       throws IOException {
7220     String serviceName = call.getServiceName();
7221     String methodName = call.getMethodName();
7222     if (!coprocessorServiceHandlers.containsKey(serviceName)) {
7223       throw new UnknownProtocolException(null,
7224           "No registered coprocessor service found for name "+serviceName+
7225           " in region "+Bytes.toStringBinary(getRegionInfo().getRegionName()));
7226     }
7227 
7228     Service service = coprocessorServiceHandlers.get(serviceName);
7229     Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType();
7230     Descriptors.MethodDescriptor methodDesc = serviceDesc.findMethodByName(methodName);
7231     if (methodDesc == null) {
7232       throw new UnknownProtocolException(service.getClass(),
7233           "Unknown method "+methodName+" called on service "+serviceName+
7234               " in region "+Bytes.toStringBinary(getRegionInfo().getRegionName()));
7235     }
7236 
7237     Message request = service.getRequestPrototype(methodDesc).newBuilderForType()
7238         .mergeFrom(call.getRequest()).build();
7239 
7240     if (coprocessorHost != null) {
7241       request = coprocessorHost.preEndpointInvocation(service, methodName, request);
7242     }
7243 
7244     final Message.Builder responseBuilder =
7245         service.getResponsePrototype(methodDesc).newBuilderForType();
7246     service.callMethod(methodDesc, controller, request, new RpcCallback<Message>() {
7247       @Override
7248       public void run(Message message) {
7249         if (message != null) {
7250           responseBuilder.mergeFrom(message);
7251         }
7252       }
7253     });
7254 
7255     if (coprocessorHost != null) {
7256       coprocessorHost.postEndpointInvocation(service, methodName, request, responseBuilder);
7257     }
7258 
7259     return responseBuilder.build();
7260   }
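  // Illustrative sketch (not part of the original source): from the client side a registered
  // endpoint is usually reached through a coprocessor RPC channel, e.g.
  //   CoprocessorRpcChannel channel = table.coprocessorService(row);
  //   MyService.BlockingInterface stub = MyService.newBlockingStub(channel);
  //   MyResponse resp = stub.myMethod(null, MyRequest.getDefaultInstance());
  // "table", "row" and the MyService/MyRequest/MyResponse protobuf types are assumed names.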
7261 
7262   /*
7263    * Process table.
7264    * Do major compaction or list content.
7265    * @throws IOException
7266    */
7267   private static void processTable(final FileSystem fs, final Path p,
7268       final WALFactory walFactory, final Configuration c,
7269       final boolean majorCompact)
7270   throws IOException {
7271     HRegion region;
7272     FSTableDescriptors fst = new FSTableDescriptors(c);
7273     // Currently expects tables have one region only.
7274     if (FSUtils.getTableName(p).equals(TableName.META_TABLE_NAME)) {
7275       final WAL wal = walFactory.getMetaWAL(
7276           HRegionInfo.FIRST_META_REGIONINFO.getEncodedNameAsBytes());
7277       region = HRegion.newHRegion(p, wal, fs, c,
7278         HRegionInfo.FIRST_META_REGIONINFO,
7279           fst.get(TableName.META_TABLE_NAME), null);
7280     } else {
7281       throw new IOException("Not a known catalog table: " + p.toString());
7282     }
7283     try {
7284       region.initialize(null);
7285       if (majorCompact) {
7286         region.compact(true);
7287       } else {
7288         // Default behavior
7289         Scan scan = new Scan();
7290         // scan.addFamily(HConstants.CATALOG_FAMILY);
7291         RegionScanner scanner = region.getScanner(scan);
7292         try {
7293           List<Cell> kvs = new ArrayList<Cell>();
7294           boolean done;
7295           do {
7296             kvs.clear();
7297             done = scanner.next(kvs);
7298             if (kvs.size() > 0) LOG.info(kvs);
7299           } while (done);
7300         } finally {
7301           scanner.close();
7302         }
7303       }
7304     } finally {
7305       region.close();
7306     }
7307   }
7308 
7309   boolean shouldForceSplit() {
7310     return this.splitRequest;
7311   }
7312 
7313   byte[] getExplicitSplitPoint() {
7314     return this.explicitSplitPoint;
7315   }
7316 
7317   void forceSplit(byte[] sp) {
7318     // This HRegion will go away after the forced split is successful
7319     // But if a forced split fails, we need to clear forced split.
7320     this.splitRequest = true;
7321     if (sp != null) {
7322       this.explicitSplitPoint = sp;
7323     }
7324   }
7325 
7326   void clearSplit() {
7327     this.splitRequest = false;
7328     this.explicitSplitPoint = null;
7329   }
7330 
7331   /**
7332    * Give the region a chance to prepare before it is split.
7333    */
7334   protected void prepareToSplit() {
7335     // nothing
7336   }
7337 
7338   /**
7339    * Return the split point. A null return value indicates the region isn't splittable.
7340    * If the split point isn't explicitly specified, this method goes over the stores
7341    * to find the best split point. Currently the criterion for the best split point
7342    * is the size of the store.
7343    */
7344   public byte[] checkSplit() {
7345     // Can't split META
7346     if (this.getRegionInfo().isMetaTable() ||
7347         TableName.NAMESPACE_TABLE_NAME.equals(this.getRegionInfo().getTable())) {
7348       if (shouldForceSplit()) {
7349         LOG.warn("Cannot split meta region in HBase 0.20 and above");
7350       }
7351       return null;
7352     }
7353 
7354     // Can't split region which is in recovering state
7355     if (this.isRecovering()) {
7356       LOG.info("Cannot split region " + this.getRegionInfo().getEncodedName() + " in recovery.");
7357       return null;
7358     }
7359 
7360     if (!splitPolicy.shouldSplit()) {
7361       return null;
7362     }
7363 
7364     byte[] ret = splitPolicy.getSplitPoint();
7365 
7366     if (ret != null) {
7367       try {
7368         checkRow(ret, "calculated split");
7369       } catch (IOException e) {
7370         LOG.error("Ignoring invalid split", e);
7371         return null;
7372       }
7373     }
7374     return ret;
7375   }
7376 
7377   /**
7378    * @return The priority that this region should have in the compaction queue
7379    */
7380   public int getCompactPriority() {
7381     int count = Integer.MAX_VALUE;
7382     for (Store store : stores.values()) {
7383       count = Math.min(count, store.getCompactPriority());
7384     }
7385     return count;
7386   }
7387 
7388 
7389   /** @return the coprocessor host */
7390   public RegionCoprocessorHost getCoprocessorHost() {
7391     return coprocessorHost;
7392   }
7393 
7394   /** @param coprocessorHost the new coprocessor host */
7395   public void setCoprocessorHost(final RegionCoprocessorHost coprocessorHost) {
7396     this.coprocessorHost = coprocessorHost;
7397   }
7398 
7399   @Override
7400   public void startRegionOperation() throws IOException {
7401     startRegionOperation(Operation.ANY);
7402   }
7403 
7404   @Override
7405   public void startRegionOperation(Operation op) throws IOException {
7406     switch (op) {
7407     case GET:  // read operations
7408     case SCAN:
7409       checkReadsEnabled();
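      // Intentional fall through: read operations also go through the recovering-state check below.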
7410     case INCREMENT: // write operations
7411     case APPEND:
7412     case SPLIT_REGION:
7413     case MERGE_REGION:
7414     case PUT:
7415     case DELETE:
7416     case BATCH_MUTATE:
7417     case COMPACT_REGION:
7418       // when a region is in recovering state, reads, increments, appends, splits, merges and compactions are not allowed; plain writes are allowed unless disallowWritesInRecovering is set
7419       if (isRecovering() && (this.disallowWritesInRecovering ||
7420               (op != Operation.PUT && op != Operation.DELETE && op != Operation.BATCH_MUTATE))) {
7421         throw new RegionInRecoveryException(getRegionInfo().getRegionNameAsString() +
7422           " is recovering; cannot take reads");
7423       }
7424       break;
7425     default:
7426       break;
7427     }
7428     if (op == Operation.MERGE_REGION || op == Operation.SPLIT_REGION
7429         || op == Operation.COMPACT_REGION) {
7430       // split, merge or compact region doesn't need to check the closing/closed state or lock the
7431       // region
7432       return;
7433     }
7434     if (this.closing.get()) {
7435       throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closing");
7436     }
7437     lock(lock.readLock());
7438     if (this.closed.get()) {
7439       lock.readLock().unlock();
7440       throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closed");
7441     }
7442     try {
7443       if (coprocessorHost != null) {
7444         coprocessorHost.postStartRegionOperation(op);
7445       }
7446     } catch (Exception e) {
7447       lock.readLock().unlock();
7448       throw new IOException(e);
7449     }
7450   }
7451 
7452   @Override
7453   public void closeRegionOperation() throws IOException {
7454     closeRegionOperation(Operation.ANY);
7455   }
7456 
7457   /**
7458    * Closes the lock. This needs to be called in the finally block corresponding
7459    * to the try block of {@link #startRegionOperation(Operation)}
7460    * @throws IOException
7461    */
7462   public void closeRegionOperation(Operation operation) throws IOException {
7463     lock.readLock().unlock();
7464     if (coprocessorHost != null) {
7465       coprocessorHost.postCloseRegionOperation(operation);
7466     }
7467   }
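  // Illustrative sketch (not part of the original source): the expected pairing of the two
  // calls above, e.g.
  //   startRegionOperation(Operation.GET);
  //   try {
  //     // ... perform the read ...
  //   } finally {
  //     closeRegionOperation(Operation.GET);
  //   }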
7468 
7469   /**
7470    * This method needs to be called before any public call that reads or
7471    * modifies stores in bulk. It has to be called just before a try.
7472    * #closeBulkRegionOperation needs to be called in the try's finally block
7473    * Acquires a writelock and checks if the region is closing or closed.
7474    * @throws NotServingRegionException when the region is closing or closed
7475    * @throws RegionTooBusyException if failed to get the lock in time
7476    * @throws InterruptedIOException if interrupted while waiting for a lock
7477    */
7478   private void startBulkRegionOperation(boolean writeLockNeeded)
7479       throws NotServingRegionException, RegionTooBusyException, InterruptedIOException {
7480     if (this.closing.get()) {
7481       throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closing");
7482     }
7483     if (writeLockNeeded) lock(lock.writeLock());
7484     else lock(lock.readLock());
7485     if (this.closed.get()) {
7486       if (writeLockNeeded) lock.writeLock().unlock();
7487       else lock.readLock().unlock();
7488       throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closed");
7489     }
7490   }
7491 
7492   /**
7493    * Closes the lock. This needs to be called in the finally block corresponding
7494    * to the try block of #startRegionOperation
7495    */
7496   private void closeBulkRegionOperation(){
7497     if (lock.writeLock().isHeldByCurrentThread()) lock.writeLock().unlock();
7498     else lock.readLock().unlock();
7499   }
7500 
7501   /**
7502    * Update counters for the number of mutations without WAL and the size of possible data loss.
7503    * This information is exposed by the region server metrics.
7504    */
7505   private void recordMutationWithoutWal(final Map<byte [], List<Cell>> familyMap) {
7506     numMutationsWithoutWAL.increment();
7507     if (numMutationsWithoutWAL.get() <= 1) {
7508       LOG.info("writing data to region " + this +
7509                " with WAL disabled. Data may be lost in the event of a crash.");
7510     }
7511 
7512     long mutationSize = 0;
7513     for (List<Cell> cells: familyMap.values()) {
7514       assert cells instanceof RandomAccess;
7515       int listSize = cells.size();
7516       for (int i=0; i < listSize; i++) {
7517         Cell cell = cells.get(i);
7518         // TODO we need include tags length also here.
7519         mutationSize += KeyValueUtil.keyLength(cell) + cell.getValueLength();
7520       }
7521     }
7522 
7523     dataInMemoryWithoutWAL.add(mutationSize);
7524   }
7525 
7526   private void lock(final Lock lock)
7527       throws RegionTooBusyException, InterruptedIOException {
7528     lock(lock, 1);
7529   }
7530 
7531   /**
7532    * Try to acquire a lock.  Throw RegionTooBusyException
7533    * if failed to get the lock in time. Throw InterruptedIOException
7534    * if interrupted while waiting for the lock.
7535    */
7536   private void lock(final Lock lock, final int multiplier)
7537       throws RegionTooBusyException, InterruptedIOException {
7538     try {
7539       final long waitTime = Math.min(maxBusyWaitDuration,
7540           busyWaitDuration * Math.min(multiplier, maxBusyWaitMultiplier));
7541       if (!lock.tryLock(waitTime, TimeUnit.MILLISECONDS)) {
7542         throw new RegionTooBusyException(
7543             "failed to get a lock in " + waitTime + " ms. " +
7544                 "regionName=" + (this.getRegionInfo() == null ? "unknown" :
7545                 this.getRegionInfo().getRegionNameAsString()) +
7546                 ", server=" + (this.getRegionServerServices() == null ? "unknown" :
7547                 this.getRegionServerServices().getServerName()));
7548       }
7549     } catch (InterruptedException ie) {
7550       LOG.info("Interrupted while waiting for a lock");
7551       InterruptedIOException iie = new InterruptedIOException();
7552       iie.initCause(ie);
7553       throw iie;
7554     }
7555   }
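  // Worked example (illustrative): with busyWaitDuration = 2000 ms, maxBusyWaitMultiplier = 2 and
  // maxBusyWaitDuration = 60000 ms, a call with multiplier 3 waits
  // min(60000, 2000 * min(3, 2)) = 4000 ms before giving up with RegionTooBusyException.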
7556 
7557   /**
7558    * Calls sync with the given transaction ID if the region's table is not
7559    * deferring it.
7560    * @param txid should sync up to which transaction
7561    * @throws IOException If anything goes wrong with DFS
7562    */
7563   private void syncOrDefer(long txid, Durability durability) throws IOException {
7564     if (this.getRegionInfo().isMetaRegion()) {
7565       this.wal.sync(txid);
7566     } else {
7567       switch(durability) {
7568       case USE_DEFAULT:
7569         // do what table defaults to
7570         if (shouldSyncWAL()) {
7571           this.wal.sync(txid);
7572         }
7573         break;
7574       case SKIP_WAL:
7575         // nothing to do
7576         break;
7577       case ASYNC_WAL:
7578         // nothing do to
7579         break;
7580       case SYNC_WAL:
7581       case FSYNC_WAL:
7582         // sync the WAL edit (SYNC and FSYNC treated the same for now)
7583         this.wal.sync(txid);
7584         break;
7585       }
7586     }
7587   }
7588 
7589   /**
7590    * Check whether we should sync the wal from the table's durability settings
7591    */
7592   private boolean shouldSyncWAL() {
7593     return durability.ordinal() >  Durability.ASYNC_WAL.ordinal();
7594   }
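  // Example (illustrative): with USE_DEFAULT the decision in syncOrDefer falls back to the
  // table's durability via shouldSyncWAL(), so a table configured with SYNC_WAL or FSYNC_WAL is
  // synced here, while ASYNC_WAL and SKIP_WAL defer to the background WAL sync.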
7595 
7596   /**
7597    * A mocked list implementation - discards all updates.
7598    */
7599   private static final List<Cell> MOCKED_LIST = new AbstractList<Cell>() {
7600 
7601     @Override
7602     public void add(int index, Cell element) {
7603       // do nothing
7604     }
7605 
7606     @Override
7607     public boolean addAll(int index, Collection<? extends Cell> c) {
7608       return false; // this list is never changed as a result of an update
7609     }
7610 
7611     @Override
7612     public KeyValue get(int index) {
7613       throw new UnsupportedOperationException();
7614     }
7615 
7616     @Override
7617     public int size() {
7618       return 0;
7619     }
7620   };
7621 
7622   /**
7623    * Facility for dumping and compacting catalog tables.
7624    * Only does catalog tables since these are the only tables whose schema we know
7625    * for sure.  For usage run:
7626    * <pre>
7627    *   ./bin/hbase org.apache.hadoop.hbase.regionserver.HRegion
7628    * </pre>
7629    * @throws IOException
7630    */
7631   public static void main(String[] args) throws IOException {
7632     if (args.length < 1) {
7633       printUsageAndExit(null);
7634     }
7635     boolean majorCompact = false;
7636     if (args.length > 1) {
7637       if (!args[1].toLowerCase().startsWith("major")) {
7638         printUsageAndExit("ERROR: Unrecognized option <" + args[1] + ">");
7639       }
7640       majorCompact = true;
7641     }
7642     final Path tableDir = new Path(args[0]);
7643     final Configuration c = HBaseConfiguration.create();
7644     final FileSystem fs = FileSystem.get(c);
7645     final Path logdir = new Path(c.get("hbase.tmp.dir"));
7646     final String logname = "wal" + FSUtils.getTableName(tableDir) + System.currentTimeMillis();
7647 
7648     final Configuration walConf = new Configuration(c);
7649     FSUtils.setRootDir(walConf, logdir);
7650     final WALFactory wals = new WALFactory(walConf, null, logname);
7651     try {
7652       processTable(fs, tableDir, wals, c, majorCompact);
7653     } finally {
7654        wals.close();
7655        // TODO: is this still right?
7656        BlockCache bc = new CacheConfig(c).getBlockCache();
7657        if (bc != null) bc.shutdown();
7658     }
7659   }
7660 
7661   @Override
7662   public long getOpenSeqNum() {
7663     return this.openSeqNum;
7664   }
7665 
7666   @Override
7667   public Map<byte[], Long> getMaxStoreSeqId() {
7668     return this.maxSeqIdInStores;
7669   }
7670 
7671   @Override
7672   public long getOldestSeqIdOfStore(byte[] familyName) {
7673     return wal.getEarliestMemstoreSeqNum(getRegionInfo()
7674         .getEncodedNameAsBytes(), familyName);
7675   }
7676 
7677   @Override
7678   public CompactionState getCompactionState() {
7679     boolean hasMajor = majorInProgress.get() > 0, hasMinor = minorInProgress.get() > 0;
7680     return (hasMajor ? (hasMinor ? CompactionState.MAJOR_AND_MINOR : CompactionState.MAJOR)
7681         : (hasMinor ? CompactionState.MINOR : CompactionState.NONE));
7682   }
7683 
7684   public void reportCompactionRequestStart(boolean isMajor){
7685     (isMajor ? majorInProgress : minorInProgress).incrementAndGet();
7686   }
7687 
7688   public void reportCompactionRequestEnd(boolean isMajor, int numFiles, long filesSizeCompacted) {
7689     int newValue = (isMajor ? majorInProgress : minorInProgress).decrementAndGet();
7690 
7691     // metrics
7692     compactionsFinished.incrementAndGet();
7693     compactionNumFilesCompacted.addAndGet(numFiles);
7694     compactionNumBytesCompacted.addAndGet(filesSizeCompacted);
7695 
7696     assert newValue >= 0;
7697   }
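
  /*
   * Illustrative pairing (not part of the original source): every reportCompactionRequestStart
   * call is expected to be matched by exactly one reportCompactionRequestEnd call, otherwise the
   * in-progress counters drift and the assert above can fail. A typical caller, with hypothetical
   * numFiles and bytesCompacted values, would look like:
   *
   *   region.reportCompactionRequestStart(isMajor);
   *   try {
   *     // run the compaction
   *   } finally {
   *     region.reportCompactionRequestEnd(isMajor, numFiles, bytesCompacted);
   *   }
   */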
7698 
7699   /**
7700    * Do not change this sequence id. See {@link #sequenceId} comment.
7701    * @return sequenceId
7702    */
7703   @VisibleForTesting
7704   public AtomicLong getSequenceId() {
7705     return this.sequenceId;
7706   }
7707 
7708   /**
7709    * Sets this region's sequenceId.
7710    * @param value new value
7711    */
7712   private void setSequenceId(long value) {
7713     this.sequenceId.set(value);
7714   }
7715 
7716   @VisibleForTesting class RowLockContext {
7717     private final HashedBytes row;
7718     private final CountDownLatch latch = new CountDownLatch(1);
7719     private final Thread thread;
7720     private int lockCount = 0;
7721 
7722     RowLockContext(HashedBytes row) {
7723       this.row = row;
7724       this.thread = Thread.currentThread();
7725     }
7726 
7727     boolean ownedByCurrentThread() {
7728       return thread == Thread.currentThread();
7729     }
7730 
7731     RowLock newLock() {
7732       lockCount++;
7733       RowLockImpl rl = new RowLockImpl();
7734       rl.setContext(this);
7735       return rl;
7736     }
7737 
7738     @Override
7739     public String toString() {
7740       Thread t = this.thread;
7741       return "Thread=" + (t == null? "null": t.getName()) + ", row=" + this.row +
7742         ", lockCount=" + this.lockCount;
7743     }
7744 
7745     void releaseLock() {
7746       if (!ownedByCurrentThread()) {
7747         throw new IllegalArgumentException("Lock held by thread: " + thread
7748           + " cannot be released by different thread: " + Thread.currentThread());
7749       }
7750       lockCount--;
7751       if (lockCount == 0) {
7752         // no remaining locks by the thread, unlock and allow other threads to access
7753         RowLockContext existingContext = lockedRows.remove(row);
7754         if (existingContext != this) {
7755           throw new RuntimeException(
7756               "Internal row lock state inconsistent, should not happen, row: " + row);
7757         }
7758         latch.countDown();
7759       }
7760     }
7761   }
7762 
7763   public static class RowLockImpl implements RowLock {
7764     private RowLockContext context;
7765     private boolean released = false;
7766 
7767     @VisibleForTesting
7768     public RowLockContext getContext() {
7769       return context;
7770     }
7771 
7772     @VisibleForTesting
7773     public void setContext(RowLockContext context) {
7774       this.context = context;
7775     }
7776 
7777     @Override
7778     public void release() {
7779       if (!released) {
7780         context.releaseLock();
7781       }
7782       released = true;
7783     }
7784   }
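
  /*
   * Minimal usage sketch for the row lock types above (not part of the original source). It
   * assumes the getRowLock(byte[] row) accessor defined elsewhere in this class and a
   * hypothetical row key. Releasing in a finally block ensures RowLockContext.lockCount is
   * decremented even if the guarded work throws; a second release() is harmless because
   * RowLockImpl remembers that it has already been released.
   *
   *   RowLock lock = region.getRowLock(Bytes.toBytes("example-row"));
   *   try {
   *     // read or mutate the row while holding the lock
   *   } finally {
   *     lock.release();
   *   }
   */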
7785 
7786   /**
7787    * Append a faked WALEdit in order to get a long sequence number; the wal syncer will simply
7788    * ignore this WALEdit when it is appended later.
7789    * @param wal
7790    * @param cells list of Cells inserted into memstore. These Cells are passed in so they can
7791    *        be updated with the right mvcc values (their wal sequence number).
7792    * @return the key used for the append, performed with no sync and no actual edit payload.
7793    * @throws IOException
7794    */
7795   private WALKey appendEmptyEdit(final WAL wal, List<Cell> cells) throws IOException {
7796     // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
7797     WALKey key = new HLogKey(getRegionInfo().getEncodedNameAsBytes(), getRegionInfo().getTable(),
7798       WALKey.NO_SEQUENCE_ID, 0, null, HConstants.NO_NONCE, HConstants.NO_NONCE);
7799     // Call append but with an empty WALEdit.  The returned sequence id will not be associated
7800     // with any edit and we can be sure it went in after all outstanding appends.
7801     wal.append(getTableDesc(), getRegionInfo(), key,
7802       WALEdit.EMPTY_WALEDIT, this.sequenceId, false, cells);
7803     return key;
7804   }
7805 
7806   /**
7807    * {@inheritDoc}
7808    */
7809   @Override
7810   public void onConfigurationChange(Configuration conf) {
7