1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver;
20  
21  import java.io.EOFException;
22  import java.io.FileNotFoundException;
23  import java.io.IOException;
24  import java.io.InterruptedIOException;
25  import java.lang.reflect.Constructor;
26  import java.text.ParseException;
27  import java.util.AbstractList;
28  import java.util.ArrayList;
29  import java.util.Arrays;
30  import java.util.Collection;
31  import java.util.Collections;
32  import java.util.HashMap;
33  import java.util.HashSet;
34  import java.util.Iterator;
35  import java.util.List;
36  import java.util.ListIterator;
37  import java.util.Map;
38  import java.util.Map.Entry;
39  import java.util.NavigableMap;
40  import java.util.NavigableSet;
41  import java.util.RandomAccess;
42  import java.util.Set;
43  import java.util.TreeMap;
44  import java.util.concurrent.Callable;
45  import java.util.concurrent.CompletionService;
46  import java.util.concurrent.ConcurrentHashMap;
47  import java.util.concurrent.ConcurrentMap;
48  import java.util.concurrent.ConcurrentSkipListMap;
49  import java.util.concurrent.ExecutionException;
50  import java.util.concurrent.ExecutorCompletionService;
51  import java.util.concurrent.ExecutorService;
52  import java.util.concurrent.Executors;
53  import java.util.concurrent.Future;
54  import java.util.concurrent.FutureTask;
55  import java.util.concurrent.ThreadFactory;
56  import java.util.concurrent.ThreadPoolExecutor;
57  import java.util.concurrent.TimeUnit;
58  import java.util.concurrent.TimeoutException;
59  import java.util.concurrent.atomic.AtomicBoolean;
60  import java.util.concurrent.atomic.AtomicInteger;
61  import java.util.concurrent.atomic.AtomicLong;
62  import java.util.concurrent.locks.Lock;
63  import java.util.concurrent.locks.ReadWriteLock;
64  import java.util.concurrent.locks.ReentrantReadWriteLock;
65  
66  import org.apache.commons.logging.Log;
67  import org.apache.commons.logging.LogFactory;
68  import org.apache.hadoop.conf.Configuration;
69  import org.apache.hadoop.fs.FileStatus;
70  import org.apache.hadoop.fs.FileSystem;
71  import org.apache.hadoop.fs.Path;
72  import org.apache.hadoop.hbase.ArrayBackedTag;
73  import org.apache.hadoop.hbase.Cell;
74  import org.apache.hadoop.hbase.CellComparator;
75  import org.apache.hadoop.hbase.CellScanner;
76  import org.apache.hadoop.hbase.CellUtil;
77  import org.apache.hadoop.hbase.CompoundConfiguration;
78  import org.apache.hadoop.hbase.DoNotRetryIOException;
79  import org.apache.hadoop.hbase.DroppedSnapshotException;
80  import org.apache.hadoop.hbase.HBaseConfiguration;
81  import org.apache.hadoop.hbase.HColumnDescriptor;
82  import org.apache.hadoop.hbase.HConstants;
83  import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
84  import org.apache.hadoop.hbase.HDFSBlocksDistribution;
85  import org.apache.hadoop.hbase.HRegionInfo;
86  import org.apache.hadoop.hbase.HTableDescriptor;
87  import org.apache.hadoop.hbase.KeyValue;
88  import org.apache.hadoop.hbase.KeyValueUtil;
89  import org.apache.hadoop.hbase.NamespaceDescriptor;
90  import org.apache.hadoop.hbase.NotServingRegionException;
91  import org.apache.hadoop.hbase.RegionTooBusyException;
92  import org.apache.hadoop.hbase.ShareableMemory;
93  import org.apache.hadoop.hbase.TableName;
94  import org.apache.hadoop.hbase.Tag;
95  import org.apache.hadoop.hbase.TagRewriteCell;
96  import org.apache.hadoop.hbase.TagType;
97  import org.apache.hadoop.hbase.TagUtil;
98  import org.apache.hadoop.hbase.UnknownScannerException;
99  import org.apache.hadoop.hbase.backup.HFileArchiver;
100 import org.apache.hadoop.hbase.classification.InterfaceAudience;
101 import org.apache.hadoop.hbase.client.Append;
102 import org.apache.hadoop.hbase.client.Delete;
103 import org.apache.hadoop.hbase.client.Durability;
104 import org.apache.hadoop.hbase.client.Get;
105 import org.apache.hadoop.hbase.client.Increment;
106 import org.apache.hadoop.hbase.client.IsolationLevel;
107 import org.apache.hadoop.hbase.client.Mutation;
108 import org.apache.hadoop.hbase.client.Put;
109 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
110 import org.apache.hadoop.hbase.client.Result;
111 import org.apache.hadoop.hbase.client.RowMutations;
112 import org.apache.hadoop.hbase.client.Scan;
113 import org.apache.hadoop.hbase.conf.ConfigurationManager;
114 import org.apache.hadoop.hbase.conf.PropagatingConfigurationObserver;
115 import org.apache.hadoop.hbase.coprocessor.RegionObserver;
116 import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
117 import org.apache.hadoop.hbase.exceptions.FailedSanityCheckException;
118 import org.apache.hadoop.hbase.exceptions.RegionInRecoveryException;
119 import org.apache.hadoop.hbase.exceptions.UnknownProtocolException;
120 import org.apache.hadoop.hbase.filter.ByteArrayComparable;
121 import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
122 import org.apache.hadoop.hbase.filter.FilterWrapper;
123 import org.apache.hadoop.hbase.filter.IncompatibleFilterException;
124 import org.apache.hadoop.hbase.io.HeapSize;
125 import org.apache.hadoop.hbase.io.TimeRange;
126 import org.apache.hadoop.hbase.io.hfile.BlockCache;
127 import org.apache.hadoop.hbase.io.hfile.CacheConfig;
128 import org.apache.hadoop.hbase.io.hfile.HFile;
129 import org.apache.hadoop.hbase.ipc.CallerDisconnectedException;
130 import org.apache.hadoop.hbase.ipc.RpcCallContext;
131 import org.apache.hadoop.hbase.ipc.RpcServer;
132 import org.apache.hadoop.hbase.mob.MobUtils;
133 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
134 import org.apache.hadoop.hbase.monitoring.TaskMonitor;
135 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
136 import org.apache.hadoop.hbase.protobuf.ResponseConverter;
137 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.GetRegionInfoResponse.CompactionState;
138 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
139 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall;
140 import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.RegionLoad;
141 import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.StoreSequenceId;
142 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
143 import org.apache.hadoop.hbase.protobuf.generated.WALProtos;
144 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.CompactionDescriptor;
145 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FlushDescriptor;
146 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FlushDescriptor.FlushAction;
147 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FlushDescriptor.StoreFlushDescriptor;
148 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.RegionEventDescriptor;
149 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.RegionEventDescriptor.EventType;
150 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.StoreDescriptor;
151 import org.apache.hadoop.hbase.regionserver.ScannerContext.LimitScope;
152 import org.apache.hadoop.hbase.regionserver.ScannerContext.NextState;
153 import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext;
154 import org.apache.hadoop.hbase.regionserver.throttle.CompactionThroughputControllerFactory;
155 import org.apache.hadoop.hbase.regionserver.throttle.NoLimitThroughputController;
156 import org.apache.hadoop.hbase.regionserver.throttle.ThroughputController;
157 import org.apache.hadoop.hbase.regionserver.wal.HLogKey;
158 import org.apache.hadoop.hbase.regionserver.wal.ReplayHLogKey;
159 import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
160 import org.apache.hadoop.hbase.regionserver.wal.WALUtil;
161 import org.apache.hadoop.hbase.security.User;
162 import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
163 import org.apache.hadoop.hbase.snapshot.SnapshotManifest;
164 import org.apache.hadoop.hbase.util.ByteStringer;
165 import org.apache.hadoop.hbase.util.Bytes;
166 import org.apache.hadoop.hbase.util.CancelableProgressable;
167 import org.apache.hadoop.hbase.util.ClassSize;
168 import org.apache.hadoop.hbase.util.CompressionTest;
169 import org.apache.hadoop.hbase.util.Counter;
170 import org.apache.hadoop.hbase.util.EncryptionTest;
171 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
172 import org.apache.hadoop.hbase.util.FSTableDescriptors;
173 import org.apache.hadoop.hbase.util.FSUtils;
174 import org.apache.hadoop.hbase.util.HashedBytes;
175 import org.apache.hadoop.hbase.util.Pair;
176 import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil;
177 import org.apache.hadoop.hbase.util.Threads;
178 import org.apache.hadoop.hbase.wal.WAL;
179 import org.apache.hadoop.hbase.wal.WALFactory;
180 import org.apache.hadoop.hbase.wal.WALKey;
181 import org.apache.hadoop.hbase.wal.WALSplitter;
182 import org.apache.hadoop.hbase.wal.WALSplitter.MutationReplay;
183 import org.apache.hadoop.io.MultipleIOException;
184 import org.apache.hadoop.util.StringUtils;
185 import org.apache.htrace.Trace;
186 import org.apache.htrace.TraceScope;
187 
188 import com.google.common.annotations.VisibleForTesting;
189 import com.google.common.base.Optional;
190 import com.google.common.base.Preconditions;
191 import com.google.common.collect.Lists;
192 import com.google.common.collect.Maps;
193 import com.google.common.io.Closeables;
194 import com.google.protobuf.ByteString;
195 import com.google.protobuf.Descriptors;
196 import com.google.protobuf.Message;
197 import com.google.protobuf.RpcCallback;
198 import com.google.protobuf.RpcController;
199 import com.google.protobuf.Service;
200 import com.google.protobuf.TextFormat;
201 
202 @InterfaceAudience.Private
203 public class HRegion implements HeapSize, PropagatingConfigurationObserver, Region {
204   private static final Log LOG = LogFactory.getLog(HRegion.class);
205 
206   public static final String LOAD_CFS_ON_DEMAND_CONFIG_KEY =
207     "hbase.hregion.scan.loadColumnFamiliesOnDemand";
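
  // Illustrative note, not part of the original source: the default set by this key can be
  // overridden per request on the client side, assuming the standard Scan API's
  // setLoadColumnFamiliesOnDemand setter:
  //
  //   Scan scan = new Scan();
  //   scan.setLoadColumnFamiliesOnDemand(true); // override the region-wide default for this scan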
208 
209   /**
210    * Longest time we'll wait on a sequenceid.
211    * Sequenceids come out of the WAL subsystem. The WAL subsystem can go bad, or a test might use
212    * it without cleaning up previous usage properly; generally, a WAL roll is needed. The timeout
213    * is for a latch in WALKey. There is intentionally no global accounting of outstanding WALKeys,
214    * to avoid contention, but that means that if an abort or other problem occurs, we could be
215    * stuck waiting on the WALKey latch. Revisit.
216    */
217   private final int maxWaitForSeqId;
218   private static final String MAX_WAIT_FOR_SEQ_ID_KEY = "hbase.hregion.max.wait.for.sequenceid.ms";
219   private static final int DEFAULT_MAX_WAIT_FOR_SEQ_ID = 30000;
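
  // A hedged sketch, not in the original source, of how this wait might be raised (e.g. in a
  // test), assuming a plain Hadoop Configuration is used when constructing the region:
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   conf.setInt("hbase.hregion.max.wait.for.sequenceid.ms", 60000); // wait up to 60s for a seqid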
220 
221   /**
222    * This is the global default value for durability. All tables/mutations not
223    * defining a durability or using USE_DEFAULT will default to this value.
224    */
225   private static final Durability DEFAULT_DURABILITY = Durability.SYNC_WAL;
226 
227   final AtomicBoolean closed = new AtomicBoolean(false);
228 
229   /* Closing can take some time; use the closing flag if there is stuff we don't
230    * want to do while in closing state; e.g. offer this region up to the
231    * master as a region to close if the carrying regionserver is overloaded.
232    * Once set, it is never cleared.
233    */
234   final AtomicBoolean closing = new AtomicBoolean(false);
235 
236   /**
237    * The max sequence id of flushed data on this region. There is no edit in memory that is
238    * less than this sequence id.
239    */
240   private volatile long maxFlushedSeqId = HConstants.NO_SEQNUM;
241 
242   /**
243    * Record the sequence id of last flush operation. Can be in advance of
244    * {@link #maxFlushedSeqId} when flushing a single column family. In this case,
245    * {@link #maxFlushedSeqId} will be older than the oldest edit in memory.
246    */
247   private volatile long lastFlushOpSeqId = HConstants.NO_SEQNUM;
248 
249   /**
250    * The sequence id of the last replayed open region event from the primary region. This is used
251    * to skip entries before it, because replayed edits may arrive out of order from
252    * replication.
253    */
254   protected volatile long lastReplayedOpenRegionSeqId = -1L;
255   protected volatile long lastReplayedCompactionSeqId = -1L;
256 
257   //////////////////////////////////////////////////////////////////////////////
258   // Members
259   //////////////////////////////////////////////////////////////////////////////
260 
261   // map from a locked row to the context for that lock including:
262   // - CountDownLatch for threads waiting on that row
263   // - the thread that owns the lock (allow reentrancy)
264   // - reference count of (reentrant) locks held by the thread
265   // - the row itself
266   private final ConcurrentHashMap<HashedBytes, RowLockContext> lockedRows =
267       new ConcurrentHashMap<HashedBytes, RowLockContext>();
268 
269   protected final Map<byte[], Store> stores = new ConcurrentSkipListMap<byte[], Store>(
270       Bytes.BYTES_RAWCOMPARATOR);
271 
272   // TODO: account for each registered handler in HeapSize computation
273   private Map<String, Service> coprocessorServiceHandlers = Maps.newHashMap();
274 
275   private final AtomicLong memstoreSize = new AtomicLong(0);
276 
277   // Debug possible data loss due to WAL off
278   final Counter numMutationsWithoutWAL = new Counter();
279   final Counter dataInMemoryWithoutWAL = new Counter();
280 
281   // Debug why CAS operations are taking a while.
282   final Counter checkAndMutateChecksPassed = new Counter();
283   final Counter checkAndMutateChecksFailed = new Counter();
284 
285   // Number of requests
286   final Counter readRequestsCount = new Counter();
287   final Counter filteredReadRequestsCount = new Counter();
288   final Counter writeRequestsCount = new Counter();
289 
290   // Number of requests blocked by memstore size.
291   private final Counter blockedRequestsCount = new Counter();
292 
293   // Compaction counters
294   final AtomicLong compactionsFinished = new AtomicLong(0L);
295   final AtomicLong compactionNumFilesCompacted = new AtomicLong(0L);
296   final AtomicLong compactionNumBytesCompacted = new AtomicLong(0L);
297 
298   private final WAL wal;
299   private final HRegionFileSystem fs;
300   protected final Configuration conf;
301   private final Configuration baseConf;
302   private final int rowLockWaitDuration;
303   private CompactedHFilesDischarger compactedFileDischarger;
304   static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000;
305 
306   // The internal wait duration to acquire a lock before read/update
307   // from the region. It is not per row. The purpose of this wait time
308   // is to avoid waiting a long time while the region is busy, so that
309   // we can release the IPC handler soon enough to improve the
310   // availability of the region server. It can be adjusted by
311   // tuning configuration "hbase.busy.wait.duration".
312   final long busyWaitDuration;
313   static final long DEFAULT_BUSY_WAIT_DURATION = HConstants.DEFAULT_HBASE_RPC_TIMEOUT;
314 
315   // If updating multiple rows in one call, wait longer,
316   // i.e. waiting for busyWaitDuration * # of rows. However,
317   // we can limit the max multiplier.
318   final int maxBusyWaitMultiplier;
319 
320   // Max busy wait duration. There is no point in waiting longer than the RPC
321   // purge timeout, when an RPC call will be terminated by the RPC engine.
322   final long maxBusyWaitDuration;
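
  // Worked example (illustrative only): the effective wait is roughly
  // min(busyWaitDuration * min(rowCount, maxBusyWaitMultiplier), maxBusyWaitDuration), so with
  // busyWaitDuration = 30000 ms and maxBusyWaitMultiplier = 2, a batch of two or more rows waits
  // at most min(60000, maxBusyWaitDuration) ms before the region reports itself as too busy.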
323 
324   // negative number indicates infinite timeout
325   static final long DEFAULT_ROW_PROCESSOR_TIMEOUT = 60 * 1000L;
326   final ExecutorService rowProcessorExecutor = Executors.newCachedThreadPool();
327 
328   private final ConcurrentHashMap<RegionScanner, Long> scannerReadPoints;
329 
330   /**
331    * The sequence ID that was encountered when this region was opened.
332    */
333   private long openSeqNum = HConstants.NO_SEQNUM;
334 
335   /**
336    * The default setting for whether to enable on-demand CF loading for
337    * scan requests to this region. Requests can override it.
338    */
339   private boolean isLoadingCfsOnDemandDefault = false;
340 
341   private final AtomicInteger majorInProgress = new AtomicInteger(0);
342   private final AtomicInteger minorInProgress = new AtomicInteger(0);
343 
344   //
345   // Context: During replay we want to ensure that we do not lose any data. So, we
346   // have to be conservative in how we replay wals. For each store, we calculate
347   // the maxSeqId up to which the store was flushed. And, skip the edits which
348   // are equal to or lower than maxSeqId for each store.
349   // The following map is populated when opening the region
350   Map<byte[], Long> maxSeqIdInStores = new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
351 
352   /** Saved state from replaying prepare flush cache */
353   private PrepareFlushResult prepareFlushResult = null;
354 
355   /**
356    * Config setting for whether to allow writes while a region is in the recovering state.
357    */
358   private boolean disallowWritesInRecovering = false;
359 
360   // when a region is in recovering state, it can only accept writes, not reads
361   private volatile boolean recovering = false;
362 
363   private volatile Optional<ConfigurationManager> configurationManager;
364 
365   /**
366    * @return The smallest mvcc readPoint across all the scanners in this
367    * region. Writes older than this readPoint are included in every
368    * read operation.
369    */
370   public long getSmallestReadPoint() {
371     long minimumReadPoint;
372     // We need to ensure that while we are calculating the smallestReadPoint
373     // no new RegionScanners can grab a readPoint that we are unaware of.
374     // We achieve this by synchronizing on the scannerReadPoints object.
375     synchronized(scannerReadPoints) {
376       minimumReadPoint = mvcc.getReadPoint();
377 
378       for (Long readPoint: this.scannerReadPoints.values()) {
379         if (readPoint < minimumReadPoint) {
380           minimumReadPoint = readPoint;
381         }
382       }
383     }
384     return minimumReadPoint;
385   }
386 
387   /*
388    * Data structure of write state flags used to coordinate flushes,
389    * compactions and closes.
390    */
391   static class WriteState {
392     // Set while a memstore flush is happening.
393     volatile boolean flushing = false;
394     // Set when a flush has been requested.
395     volatile boolean flushRequested = false;
396     // Number of compactions running.
397     AtomicInteger compacting = new AtomicInteger(0);
398     // Gets set in close. If set, cannot compact or flush again.
399     volatile boolean writesEnabled = true;
400     // Set if region is read-only
401     volatile boolean readOnly = false;
402     // whether the reads are enabled. This is different than readOnly, because readOnly is
403     // static in the lifetime of the region, while readsEnabled is dynamic
404     volatile boolean readsEnabled = true;
405 
406     /**
407      * Set flags that make this region read-only.
408      *
409      * @param onOff flip value for region r/o setting
410      */
411     synchronized void setReadOnly(final boolean onOff) {
412       this.writesEnabled = !onOff;
413       this.readOnly = onOff;
414     }
415 
416     boolean isReadOnly() {
417       return this.readOnly;
418     }
419 
420     boolean isFlushRequested() {
421       return this.flushRequested;
422     }
423 
424     void setReadsEnabled(boolean readsEnabled) {
425       this.readsEnabled = readsEnabled;
426     }
427 
428     static final long HEAP_SIZE = ClassSize.align(
429         ClassSize.OBJECT + 5 * Bytes.SIZEOF_BOOLEAN);
430   }
431 
432   /**
433    * Objects from this class are created when flushing to describe all the different states that
434    * that method ends up in. The Result enum describes those states. The sequence id should only
435    * be specified if the flush was successful, and the failure message should only be specified
436    * if it didn't flush.
437    */
438   public static class FlushResultImpl implements FlushResult {
439     final Result result;
440     final String failureReason;
441     final long flushSequenceId;
442     final boolean wroteFlushWalMarker;
443 
444     /**
445      * Convenience constructor to use when the flush is successful; the failure message is set to
446      * null.
447      * @param result Expecting FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
448      * @param flushSequenceId Generated sequence id that comes right after the edits in the
449      *                        memstores.
450      */
451     FlushResultImpl(Result result, long flushSequenceId) {
452       this(result, flushSequenceId, null, false);
453       assert result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
454           .FLUSHED_COMPACTION_NEEDED;
455     }
456 
457     /**
458      * Convenience constructor to use when we cannot flush.
459      * @param result Expecting CANNOT_FLUSH_MEMSTORE_EMPTY or CANNOT_FLUSH.
460      * @param failureReason Reason why we couldn't flush.
461      */
462     FlushResultImpl(Result result, String failureReason, boolean wroteFlushMarker) {
463       this(result, -1, failureReason, wroteFlushMarker);
464       assert result == Result.CANNOT_FLUSH_MEMSTORE_EMPTY || result == Result.CANNOT_FLUSH;
465     }
466 
467     /**
468      * Constructor with all the parameters.
469      * @param result Any of the Result values.
470      * @param flushSequenceId Generated sequence id if the memstores were flushed, else -1.
471      * @param failureReason Reason why we couldn't flush, or null.
472      */
473     FlushResultImpl(Result result, long flushSequenceId, String failureReason,
474       boolean wroteFlushMarker) {
475       this.result = result;
476       this.flushSequenceId = flushSequenceId;
477       this.failureReason = failureReason;
478       this.wroteFlushWalMarker = wroteFlushMarker;
479     }
480 
481     /**
482      * Convenience method, the equivalent of checking if result is
483      * FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
484      * @return true if the memstores were flushed, else false.
485      */
486     @Override
487     public boolean isFlushSucceeded() {
488       return result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
489           .FLUSHED_COMPACTION_NEEDED;
490     }
491 
492     /**
493      * Convenience method, the equivalent of checking if result is FLUSHED_COMPACTION_NEEDED.
494      * @return True if the flush requested a compaction, else false (false does not even imply that a flush occurred).
495      */
496     @Override
497     public boolean isCompactionNeeded() {
498       return result == Result.FLUSHED_COMPACTION_NEEDED;
499     }
500 
501     @Override
502     public String toString() {
503       return new StringBuilder()
504         .append("flush result:").append(result).append(", ")
505         .append("failureReason:").append(failureReason).append(", ")
506         .append("flush seq id:").append(flushSequenceId).toString();
507     }
508 
509     @Override
510     public Result getResult() {
511       return result;
512     }
513   }
514 
515   /** A result object from prepare flush cache stage */
516   @VisibleForTesting
517   static class PrepareFlushResult {
518     final FlushResult result; // indicating a failure result from prepare
519     final TreeMap<byte[], StoreFlushContext> storeFlushCtxs;
520     final TreeMap<byte[], List<Path>> committedFiles;
521     final TreeMap<byte[], Long> storeFlushableSize;
522     final long startTime;
523     final long flushOpSeqId;
524     final long flushedSeqId;
525     final long totalFlushableSize;
526 
527     /** Constructs an early exit case */
528     PrepareFlushResult(FlushResult result, long flushSeqId) {
529       this(result, null, null, null, Math.max(0, flushSeqId), 0, 0, 0);
530     }
531 
532     /** Constructs a successful prepare flush result */
533     PrepareFlushResult(
534       TreeMap<byte[], StoreFlushContext> storeFlushCtxs,
535       TreeMap<byte[], List<Path>> committedFiles,
536       TreeMap<byte[], Long> storeFlushableSize, long startTime, long flushSeqId,
537       long flushedSeqId, long totalFlushableSize) {
538       this(null, storeFlushCtxs, committedFiles, storeFlushableSize, startTime,
539         flushSeqId, flushedSeqId, totalFlushableSize);
540     }
541 
542     private PrepareFlushResult(
543       FlushResult result,
544       TreeMap<byte[], StoreFlushContext> storeFlushCtxs,
545       TreeMap<byte[], List<Path>> committedFiles,
546       TreeMap<byte[], Long> storeFlushableSize, long startTime, long flushSeqId,
547       long flushedSeqId, long totalFlushableSize) {
548       this.result = result;
549       this.storeFlushCtxs = storeFlushCtxs;
550       this.committedFiles = committedFiles;
551       this.storeFlushableSize = storeFlushableSize;
552       this.startTime = startTime;
553       this.flushOpSeqId = flushSeqId;
554       this.flushedSeqId = flushedSeqId;
555       this.totalFlushableSize = totalFlushableSize;
556     }
557 
558     public FlushResult getResult() {
559       return this.result;
560     }
561   }
562 
563   final WriteState writestate = new WriteState();
564 
565   long memstoreFlushSize;
566   final long timestampSlop;
567   final long rowProcessorTimeout;
568 
569   // Last flush time for each Store. Useful when we are flushing per column family.
570   private final ConcurrentMap<Store, Long> lastStoreFlushTimeMap =
571       new ConcurrentHashMap<Store, Long>();
572 
573   final RegionServerServices rsServices;
574   private RegionServerAccounting rsAccounting;
575   private long flushCheckInterval;
576   // flushPerChanges is to prevent too many changes in memstore
577   private long flushPerChanges;
578   private long blockingMemStoreSize;
579   final long threadWakeFrequency;
580   // Used to guard closes
581   final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
582 
583   // Stop updates lock
584   private final ReentrantReadWriteLock updatesLock = new ReentrantReadWriteLock();
585   private boolean splitRequest;
586   private byte[] explicitSplitPoint = null;
587 
588   private final MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl();
589 
590   // Coprocessor host
591   private RegionCoprocessorHost coprocessorHost;
592 
593   private HTableDescriptor htableDescriptor = null;
594   private RegionSplitPolicy splitPolicy;
595   private FlushPolicy flushPolicy;
596 
597   private final MetricsRegion metricsRegion;
598   private final MetricsRegionWrapperImpl metricsRegionWrapper;
599   private final Durability durability;
600   private final boolean regionStatsEnabled;
601 
602   /**
603    * HRegion constructor. This constructor should only be used for testing and
604    * extensions.  Instances of HRegion should be instantiated with the
605    * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
606    *
607    * @param tableDir qualified path of directory where region should be located,
608    * usually the table directory.
609    * @param wal The WAL is the outbound log for any updates to the HRegion.
610    * The wal file is a logfile from the previous execution that's
611    * custom-computed for this HRegion. The HRegionServer computes and sorts the
612    * appropriate wal info for this HRegion. If there is a previous wal file
613    * (implying that the HRegion has been written-to before), then read it from
614    * the supplied path.
615    * @param fs is the filesystem.
616    * @param confParam is global configuration settings.
617    * @param regionInfo - HRegionInfo that describes the region. If there are store files from a
618    * previous execution (implying the region is not new), then read them from the supplied path.
619    * @param htd the table descriptor
620    * @param rsServices reference to {@link RegionServerServices} or null
621    * @deprecated Use other constructors.
622    */
623   @Deprecated
624   @VisibleForTesting
625   public HRegion(final Path tableDir, final WAL wal, final FileSystem fs,
626       final Configuration confParam, final HRegionInfo regionInfo,
627       final HTableDescriptor htd, final RegionServerServices rsServices) {
628     this(new HRegionFileSystem(confParam, fs, tableDir, regionInfo),
629       wal, confParam, htd, rsServices);
630   }
631 
632   /**
633    * HRegion constructor. This constructor should only be used for testing and
634    * extensions.  Instances of HRegion should be instantiated with the
635    * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
636    *
637    * @param fs is the filesystem.
638    * @param wal The WAL is the outbound log for any updates to the HRegion.
639    * The wal file is a logfile from the previous execution that's
640    * custom-computed for this HRegion. The HRegionServer computes and sorts the
641    * appropriate wal info for this HRegion. If there is a previous wal file
642    * (implying that the HRegion has been written-to before), then read it from
643    * the supplied path.
644    * @param confParam is global configuration settings.
645    * @param htd the table descriptor
646    * @param rsServices reference to {@link RegionServerServices} or null
647    */
648   public HRegion(final HRegionFileSystem fs, final WAL wal, final Configuration confParam,
649       final HTableDescriptor htd, final RegionServerServices rsServices) {
650     if (htd == null) {
651       throw new IllegalArgumentException("Need table descriptor");
652     }
653 
654     if (confParam instanceof CompoundConfiguration) {
655       throw new IllegalArgumentException("Need original base configuration");
656     }
657 
658     this.wal = wal;
659     this.fs = fs;
660 
661     // 'conf' renamed to 'confParam' b/c we use this.conf in the constructor
662     this.baseConf = confParam;
663     this.conf = new CompoundConfiguration()
664       .add(confParam)
665       .addStringMap(htd.getConfiguration())
666       .addBytesMap(htd.getValues());
667     this.flushCheckInterval = conf.getInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL,
668         DEFAULT_CACHE_FLUSH_INTERVAL);
669     this.flushPerChanges = conf.getLong(MEMSTORE_FLUSH_PER_CHANGES, DEFAULT_FLUSH_PER_CHANGES);
670     if (this.flushPerChanges > MAX_FLUSH_PER_CHANGES) {
671       throw new IllegalArgumentException(MEMSTORE_FLUSH_PER_CHANGES + " can not exceed "
672           + MAX_FLUSH_PER_CHANGES);
673     }
674     this.rowLockWaitDuration = conf.getInt("hbase.rowlock.wait.duration",
675                     DEFAULT_ROWLOCK_WAIT_DURATION);
676 
677     this.maxWaitForSeqId = conf.getInt(MAX_WAIT_FOR_SEQ_ID_KEY, DEFAULT_MAX_WAIT_FOR_SEQ_ID);
678     this.isLoadingCfsOnDemandDefault = conf.getBoolean(LOAD_CFS_ON_DEMAND_CONFIG_KEY, true);
679     this.htableDescriptor = htd;
680     this.rsServices = rsServices;
681     this.threadWakeFrequency = conf.getLong(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000);
682     setHTableSpecificConf();
683     this.scannerReadPoints = new ConcurrentHashMap<RegionScanner, Long>();
684 
685     this.busyWaitDuration = conf.getLong(
686       "hbase.busy.wait.duration", DEFAULT_BUSY_WAIT_DURATION);
687     this.maxBusyWaitMultiplier = conf.getInt("hbase.busy.wait.multiplier.max", 2);
688     if (busyWaitDuration * maxBusyWaitMultiplier <= 0L) {
689       throw new IllegalArgumentException("Invalid hbase.busy.wait.duration ("
690         + busyWaitDuration + ") or hbase.busy.wait.multiplier.max ("
691         + maxBusyWaitMultiplier + "). Their product should be positive");
692     }
693     this.maxBusyWaitDuration = conf.getLong("hbase.ipc.client.call.purge.timeout",
694       2 * HConstants.DEFAULT_HBASE_RPC_TIMEOUT);
695 
696     /*
697      * timestamp.slop provides a server-side constraint on the timestamp. This
698      * assumes that you base your TS around currentTimeMillis(). In this case,
699      * throw an error to the user if the user-specified TS is newer than now +
700      * slop. LATEST_TIMESTAMP == don't use this functionality
701      */
702     this.timestampSlop = conf.getLong(
703         "hbase.hregion.keyvalue.timestamp.slop.millisecs",
704         HConstants.LATEST_TIMESTAMP);
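
    // Illustrative example, not in the original source: setting
    // "hbase.hregion.keyvalue.timestamp.slop.millisecs" to 2000 would make the region reject a
    // mutation whose user-specified timestamp is more than two seconds ahead of the server clock,
    // while the default (LATEST_TIMESTAMP) leaves the check disabled.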
705 
706     /**
707      * Timeout for the process time in processRowsWithLocks().
708      * Use -1 to switch off time bound.
709      */
710     this.rowProcessorTimeout = conf.getLong(
711         "hbase.hregion.row.processor.timeout", DEFAULT_ROW_PROCESSOR_TIMEOUT);
712     this.durability = htd.getDurability() == Durability.USE_DEFAULT
713         ? DEFAULT_DURABILITY
714         : htd.getDurability();
715     if (rsServices != null) {
716       this.rsAccounting = this.rsServices.getRegionServerAccounting();
717       // don't initialize coprocessors if not running within a regionserver
718       // TODO: revisit if coprocessors should load in other cases
719       this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf);
720       this.metricsRegionWrapper = new MetricsRegionWrapperImpl(this);
721       this.metricsRegion = new MetricsRegion(this.metricsRegionWrapper);
722 
723       Map<String, Region> recoveringRegions = rsServices.getRecoveringRegions();
724       String encodedName = getRegionInfo().getEncodedName();
725       if (recoveringRegions != null && recoveringRegions.containsKey(encodedName)) {
726         this.recovering = true;
727         recoveringRegions.put(encodedName, this);
728       }
729     } else {
730       this.metricsRegionWrapper = null;
731       this.metricsRegion = null;
732     }
733     if (LOG.isDebugEnabled()) {
734       // Write out region name as string and its encoded name.
735       LOG.debug("Instantiated " + this);
736     }
737 
738     // by default, we allow writes against a region when it's in recovering
739     this.disallowWritesInRecovering =
740         conf.getBoolean(HConstants.DISALLOW_WRITES_IN_RECOVERING,
741           HConstants.DEFAULT_DISALLOW_WRITES_IN_RECOVERING_CONFIG);
742     configurationManager = Optional.absent();
743 
744     // disable stats tracking for system tables, but check the config for everything else
745     this.regionStatsEnabled = htd.getTableName().getNamespaceAsString().equals(
746         NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR) ?
747           false :
748           conf.getBoolean(HConstants.ENABLE_CLIENT_BACKPRESSURE,
749               HConstants.DEFAULT_ENABLE_CLIENT_BACKPRESSURE);
750   }
751 
752   void setHTableSpecificConf() {
753     if (this.htableDescriptor == null) return;
754     long flushSize = this.htableDescriptor.getMemStoreFlushSize();
755 
756     if (flushSize <= 0) {
757       flushSize = conf.getLong(HConstants.HREGION_MEMSTORE_FLUSH_SIZE,
758         HTableDescriptor.DEFAULT_MEMSTORE_FLUSH_SIZE);
759     }
760     this.memstoreFlushSize = flushSize;
761     this.blockingMemStoreSize = this.memstoreFlushSize *
762         conf.getLong(HConstants.HREGION_MEMSTORE_BLOCK_MULTIPLIER,
763                 HConstants.DEFAULT_HREGION_MEMSTORE_BLOCK_MULTIPLIER);
764   }
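
  // Worked example (illustrative, using commonly cited defaults rather than values read from this
  // file): with a memstore flush size of 128 MB and a block multiplier of 4, blockingMemStoreSize
  // becomes 512 MB, i.e. updates to the region start being rejected as "too busy" once its
  // memstore grows to four times the flush threshold.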
765 
766   /**
767    * Initialize this region.
768    * Used only by tests and SplitTransaction to reopen the region.
769    * You should use createHRegion() or openHRegion()
770    * @return What the next sequence (edit) id should be.
771    * @throws IOException e
772    * @deprecated use HRegion.createHRegion() or HRegion.openHRegion()
773    */
774   @Deprecated
775   public long initialize() throws IOException {
776     return initialize(null);
777   }
778 
779   /**
780    * Initialize this region.
781    *
782    * @param reporter Tickle every so often if initialize is taking a while.
783    * @return What the next sequence (edit) id should be.
784    * @throws IOException e
785    */
786   private long initialize(final CancelableProgressable reporter) throws IOException {
787     MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this);
788     long nextSeqId = -1;
789     try {
790       nextSeqId = initializeRegionInternals(reporter, status);
791       return nextSeqId;
792     } finally {
793       // nextSeqId will be -1 if the initialization fails.
794       // Otherwise, it will be at least 0.
795       if (nextSeqId == -1) {
796         status.abort("Exception during region " + getRegionInfo().getRegionNameAsString() +
797           " initialization.");
798       }
799     }
800   }
801 
802   private long initializeRegionInternals(final CancelableProgressable reporter,
803       final MonitoredTask status) throws IOException {
804     if (coprocessorHost != null) {
805       status.setStatus("Running coprocessor pre-open hook");
806       coprocessorHost.preOpen();
807     }
808 
809     // Write HRI to a file in case we need to recover hbase:meta
810     status.setStatus("Writing region info on filesystem");
811     fs.checkRegionInfoOnFilesystem();
812 
813     // Initialize all the HStores
814     status.setStatus("Initializing all the Stores");
815     long maxSeqId = initializeStores(reporter, status);
816     this.mvcc.advanceTo(maxSeqId);
817     if (ServerRegionReplicaUtil.shouldReplayRecoveredEdits(this)) {
818       // Recover any edits if available.
819       maxSeqId = Math.max(maxSeqId,
820         replayRecoveredEditsIfAny(this.fs.getRegionDir(), maxSeqIdInStores, reporter, status));
821       // Make sure mvcc is up to max.
822       this.mvcc.advanceTo(maxSeqId);
823     }
824     this.lastReplayedOpenRegionSeqId = maxSeqId;
825 
826     this.writestate.setReadOnly(ServerRegionReplicaUtil.isReadOnly(this));
827     this.writestate.flushRequested = false;
828     this.writestate.compacting.set(0);
829 
830     if (this.writestate.writesEnabled) {
831       // Remove temporary data left over from old regions
832       status.setStatus("Cleaning up temporary data from old regions");
833       fs.cleanupTempDir();
834     }
835 
836     if (this.writestate.writesEnabled) {
837       status.setStatus("Cleaning up detritus from prior splits");
838       // Get rid of any splits or merges that were lost in-progress.  Clean out
839       // these directories here on open.  We may be opening a region that was
840       // being split but we crashed in the middle of it all.
841       fs.cleanupAnySplitDetritus();
842       fs.cleanupMergesDir();
843     }
844 
845     // Initialize split policy
846     this.splitPolicy = RegionSplitPolicy.create(this, conf);
847 
848     // Initialize flush policy
849     this.flushPolicy = FlushPolicyFactory.create(this, conf);
850 
851     long lastFlushTime = EnvironmentEdgeManager.currentTime();
852     for (Store store: stores.values()) {
853       this.lastStoreFlushTimeMap.put(store, lastFlushTime);
854     }
855 
856     // Use maximum of log sequenceid or that which was found in stores
857     // (particularly if no recovered edits, seqid will be -1).
858     long nextSeqid = maxSeqId;
859 
860     // In distributedLogReplay mode, we don't know the last change sequence number because region
861   // is opened before recovery completes. So we add a safety bumper so that the new sequence
862   // numbers do not overlap sequence numbers that were already used.
863     if (this.writestate.writesEnabled) {
864       nextSeqid = WALSplitter.writeRegionSequenceIdFile(this.fs.getFileSystem(), this.fs
865           .getRegionDir(), nextSeqid, (this.recovering ? (this.flushPerChanges + 10000000) : 1));
866     } else {
867       nextSeqid++;
868     }
869 
870     LOG.info("Onlined " + this.getRegionInfo().getShortNameToLog() +
871       "; next sequenceid=" + nextSeqid);
872 
873     // A region can be reopened if it failed a split; reset flags
874     this.closing.set(false);
875     this.closed.set(false);
876 
877     if (coprocessorHost != null) {
878       status.setStatus("Running coprocessor post-open hooks");
879       coprocessorHost.postOpen();
880     }
881 
882     status.markComplete("Region opened successfully");
883     return nextSeqid;
884   }
885 
886   /**
887    * Open all Stores.
888    * @param reporter
889    * @param status
890    * @return Highest sequenceId found in any of the Stores.
891    * @throws IOException
892    */
893   private long initializeStores(final CancelableProgressable reporter, MonitoredTask status)
894   throws IOException {
895     // Load in all the HStores.
896 
897     long maxSeqId = -1;
898     // initialized to -1 so that we pick up MemstoreTS from column families
899     long maxMemstoreTS = -1;
900 
901     if (!htableDescriptor.getFamilies().isEmpty()) {
902       // initialize the thread pool for opening stores in parallel.
903       ThreadPoolExecutor storeOpenerThreadPool =
904         getStoreOpenAndCloseThreadPool("StoreOpener-" + this.getRegionInfo().getShortNameToLog());
905       CompletionService<HStore> completionService =
906         new ExecutorCompletionService<HStore>(storeOpenerThreadPool);
907 
908       // initialize each store in parallel
909       for (final HColumnDescriptor family : htableDescriptor.getFamilies()) {
910         status.setStatus("Instantiating store for column family " + family);
911         completionService.submit(new Callable<HStore>() {
912           @Override
913           public HStore call() throws IOException {
914             return instantiateHStore(family);
915           }
916         });
917       }
918       boolean allStoresOpened = false;
919       try {
920         for (int i = 0; i < htableDescriptor.getFamilies().size(); i++) {
921           Future<HStore> future = completionService.take();
922           HStore store = future.get();
923           this.stores.put(store.getFamily().getName(), store);
924 
925           long storeMaxSequenceId = store.getMaxSequenceId();
926           maxSeqIdInStores.put(store.getColumnFamilyName().getBytes(),
927               storeMaxSequenceId);
928           if (maxSeqId == -1 || storeMaxSequenceId > maxSeqId) {
929             maxSeqId = storeMaxSequenceId;
930           }
931           long maxStoreMemstoreTS = store.getMaxMemstoreTS();
932           if (maxStoreMemstoreTS > maxMemstoreTS) {
933             maxMemstoreTS = maxStoreMemstoreTS;
934           }
935         }
936         allStoresOpened = true;
937       } catch (InterruptedException e) {
938         throw (InterruptedIOException)new InterruptedIOException().initCause(e);
939       } catch (ExecutionException e) {
940         throw new IOException(e.getCause());
941       } finally {
942         storeOpenerThreadPool.shutdownNow();
943         if (!allStoresOpened) {
944           // something went wrong, close all opened stores
945           LOG.error("Could not initialize all stores for the region=" + this);
946           for (Store store : this.stores.values()) {
947             try {
948               store.close();
949             } catch (IOException e) {
950               LOG.warn(e.getMessage());
951             }
952           }
953         }
954       }
955     }
956     return Math.max(maxSeqId, maxMemstoreTS + 1);
957   }
958 
959   private void initializeWarmup(final CancelableProgressable reporter) throws IOException {
960     MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this);
961 
962     // Initialize all the HStores
963     status.setStatus("Warming up all the Stores");
964     initializeStores(reporter, status);
965   }
966 
967   /**
968    * @return Map of StoreFiles by column family
969    */
970   private NavigableMap<byte[], List<Path>> getStoreFiles() {
971     NavigableMap<byte[], List<Path>> allStoreFiles =
972       new TreeMap<byte[], List<Path>>(Bytes.BYTES_COMPARATOR);
973     for (Store store: getStores()) {
974       Collection<StoreFile> storeFiles = store.getStorefiles();
975       if (storeFiles == null) continue;
976       List<Path> storeFileNames = new ArrayList<Path>();
977       for (StoreFile storeFile: storeFiles) {
978         storeFileNames.add(storeFile.getPath());
979       }
980       allStoreFiles.put(store.getFamily().getName(), storeFileNames);
981     }
982     return allStoreFiles;
983   }
984 
985   private void writeRegionOpenMarker(WAL wal, long openSeqId) throws IOException {
986     Map<byte[], List<Path>> storeFiles = getStoreFiles();
987     RegionEventDescriptor regionOpenDesc = ProtobufUtil.toRegionEventDescriptor(
988       RegionEventDescriptor.EventType.REGION_OPEN, getRegionInfo(), openSeqId,
989       getRegionServerServices().getServerName(), storeFiles);
990     WALUtil.writeRegionEventMarker(wal, getTableDesc(), getRegionInfo(), regionOpenDesc, mvcc);
991   }
992 
993   private void writeRegionCloseMarker(WAL wal) throws IOException {
994     Map<byte[], List<Path>> storeFiles = getStoreFiles();
995     RegionEventDescriptor regionEventDesc = ProtobufUtil.toRegionEventDescriptor(
996       RegionEventDescriptor.EventType.REGION_CLOSE, getRegionInfo(), mvcc.getReadPoint(),
997       getRegionServerServices().getServerName(), storeFiles);
998     WALUtil.writeRegionEventMarker(wal, getTableDesc(), getRegionInfo(), regionEventDesc, mvcc);
999 
1000     // Store SeqId in HDFS when a region closes
1001     // We check that the region folder exists because many tests delete the table folder
1002     // while the table is still online.
1003     if (this.fs.getFileSystem().exists(this.fs.getRegionDir())) {
1004       WALSplitter.writeRegionSequenceIdFile(this.fs.getFileSystem(), this.fs.getRegionDir(),
1005         mvcc.getReadPoint(), 0);
1006     }
1007   }
1008 
1009   /**
1010    * @return True if this region has references.
1011    */
1012   public boolean hasReferences() {
1013     for (Store store : this.stores.values()) {
1014       if (store.hasReferences()) return true;
1015     }
1016     return false;
1017   }
1018 
1019   @Override
1020   public HDFSBlocksDistribution getHDFSBlocksDistribution() {
1021     HDFSBlocksDistribution hdfsBlocksDistribution =
1022       new HDFSBlocksDistribution();
1023     synchronized (this.stores) {
1024       for (Store store : this.stores.values()) {
1025         Collection<StoreFile> storeFiles = store.getStorefiles();
1026         if (storeFiles == null) continue;
1027         for (StoreFile sf : storeFiles) {
1028           HDFSBlocksDistribution storeFileBlocksDistribution =
1029             sf.getHDFSBlockDistribution();
1030           hdfsBlocksDistribution.add(storeFileBlocksDistribution);
1031         }
1032       }
1033     }
1034     return hdfsBlocksDistribution;
1035   }
1036 
1037   /**
1038    * This is a helper function to compute HDFS block distribution on demand
1039    * @param conf configuration
1040    * @param tableDescriptor HTableDescriptor of the table
1041    * @param regionInfo the HRegionInfo for the region
1042    * @return The HDFS blocks distribution for the given region.
1043    * @throws IOException
1044    */
1045   public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf,
1046       final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo) throws IOException {
1047     Path tablePath = FSUtils.getTableDir(FSUtils.getRootDir(conf), tableDescriptor.getTableName());
1048     return computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo, tablePath);
1049   }
1050 
1051   /**
1052    * This is a helper function to compute HDFS block distribution on demand
1053    * @param conf configuration
1054    * @param tableDescriptor HTableDescriptor of the table
1055    * @param regionInfo the HRegionInfo for the region
1056    * @param tablePath the table directory
1057    * @return The HDFS blocks distribution for the given region.
1058    * @throws IOException
1059    */
1060   public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf,
1061       final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo,  Path tablePath)
1062       throws IOException {
1063     HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution();
1064     FileSystem fs = tablePath.getFileSystem(conf);
1065 
1066     HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tablePath, regionInfo);
1067     for (HColumnDescriptor family: tableDescriptor.getFamilies()) {
1068       Collection<StoreFileInfo> storeFiles = regionFs.getStoreFiles(family.getNameAsString());
1069       if (storeFiles == null) continue;
1070       for (StoreFileInfo storeFileInfo : storeFiles) {
1071         try {
1072           hdfsBlocksDistribution.add(storeFileInfo.computeHDFSBlocksDistribution(fs));
1073         } catch (IOException ioe) {
1074           LOG.warn("Error getting hdfs block distribution for " + storeFileInfo);
1075         }
1076       }
1077     }
1078     return hdfsBlocksDistribution;
1079   }
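
  // Usage sketch, not part of the original source; 'conf', 'htd' and 'regionInfo' are assumed to
  // be available in the caller:
  //
  //   HDFSBlocksDistribution dist =
  //       HRegion.computeHDFSBlocksDistribution(conf, htd, regionInfo);
  //   float locality = dist.getBlockLocalityIndex(localHostName); // 1.0f means fully local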
1080 
1081   /**
1082    * Increase the size of the memstore in this region and the size of the global
1083    * memstore.
1084    * @return the size of memstore in this region
1085    */
1086   public long addAndGetGlobalMemstoreSize(long memStoreSize) {
1087     if (this.rsAccounting != null) {
1088       rsAccounting.addAndGetGlobalMemstoreSize(memStoreSize);
1089     }
1090     return this.memstoreSize.addAndGet(memStoreSize);
1091   }
1092 
1093   @Override
1094   public HRegionInfo getRegionInfo() {
1095     return this.fs.getRegionInfo();
1096   }
1097 
1098   /**
1099    * @return Instance of {@link RegionServerServices} used by this HRegion.
1100    * Can be null.
1101    */
1102   RegionServerServices getRegionServerServices() {
1103     return this.rsServices;
1104   }
1105 
1106   @Override
1107   public long getReadRequestsCount() {
1108     return readRequestsCount.get();
1109   }
1110 
1111   @Override
1112   public void updateReadRequestsCount(long i) {
1113     readRequestsCount.add(i);
1114   }
1115 
1116   @Override
1117   public long getFilteredReadRequestsCount() {
1118     return filteredReadRequestsCount.get();
1119   }
1120 
1121   @Override
1122   public long getWriteRequestsCount() {
1123     return writeRequestsCount.get();
1124   }
1125 
1126   @Override
1127   public void updateWriteRequestsCount(long i) {
1128     writeRequestsCount.add(i);
1129   }
1130 
1131   @Override
1132   public long getMemstoreSize() {
1133     return memstoreSize.get();
1134   }
1135 
1136   @Override
1137   public long getNumMutationsWithoutWAL() {
1138     return numMutationsWithoutWAL.get();
1139   }
1140 
1141   @Override
1142   public long getDataInMemoryWithoutWAL() {
1143     return dataInMemoryWithoutWAL.get();
1144   }
1145 
1146   @Override
1147   public long getBlockedRequestsCount() {
1148     return blockedRequestsCount.get();
1149   }
1150 
1151   @Override
1152   public long getCheckAndMutateChecksPassed() {
1153     return checkAndMutateChecksPassed.get();
1154   }
1155 
1156   @Override
1157   public long getCheckAndMutateChecksFailed() {
1158     return checkAndMutateChecksFailed.get();
1159   }
1160 
1161   @Override
1162   public MetricsRegion getMetrics() {
1163     return metricsRegion;
1164   }
1165 
1166   @Override
1167   public boolean isClosed() {
1168     return this.closed.get();
1169   }
1170 
1171   @Override
1172   public boolean isClosing() {
1173     return this.closing.get();
1174   }
1175 
1176   @Override
1177   public boolean isReadOnly() {
1178     return this.writestate.isReadOnly();
1179   }
1180 
1181   /**
1182    * Set the recovering state of the current region.
1183    */
1184   public void setRecovering(boolean newState) {
1185     boolean wasRecovering = this.recovering;
1186     // before we flip the recovering switch (enabling reads) we should write the region open
1187     // event to WAL if needed
1188     if (wal != null && getRegionServerServices() != null && !writestate.readOnly
1189         && wasRecovering && !newState) {
1190 
1191       // force a flush only if region replication is set up for this region. Otherwise no need.
1192       boolean forceFlush = getTableDesc().getRegionReplication() > 1;
1193 
1194       // force a flush first
1195       MonitoredTask status = TaskMonitor.get().createStatus(
1196         "Flushing region " + this + " because recovery is finished");
1197       try {
1198         if (forceFlush) {
1199           internalFlushcache(status);
1200         }
1201 
1202         status.setStatus("Writing region open event marker to WAL because recovery is finished");
1203         try {
1204           long seqId = openSeqNum;
1205           // obtain a new seqId because we possibly have writes and flushes on top of openSeqNum
1206           if (wal != null) {
1207             seqId = getNextSequenceId(wal);
1208           }
1209           writeRegionOpenMarker(wal, seqId);
1210         } catch (IOException e) {
1211           // We cannot rethrow this exception since we are being called from the zk thread. The
1212           // region has already opened. In this case we log the error, but continue
1213           LOG.warn(getRegionInfo().getEncodedName() + " : was not able to write region opening "
1214               + "event to WAL, continuing", e);
1215         }
1216       } catch (IOException ioe) {
1217         // Distributed log replay semantics does not necessarily require a flush, since the replayed
1218         // data is already written again in the WAL. So failed flush should be fine.
1219         LOG.warn(getRegionInfo().getEncodedName() + " : was not able to flush "
1220             + "event to WAL, continuing", ioe);
1221       } finally {
1222         status.cleanup();
1223       }
1224     }
1225 
1226     this.recovering = newState;
1227     if (wasRecovering && !recovering) {
1228       // Call only when wal replay is over.
1229       coprocessorHost.postLogReplay();
1230     }
1231   }
1232 
1233   @Override
1234   public boolean isRecovering() {
1235     return this.recovering;
1236   }
1237 
1238   @Override
1239   public boolean isAvailable() {
1240     return !isClosed() && !isClosing();
1241   }
1242 
1243   /** @return true if region is splittable */
1244   public boolean isSplittable() {
1245     return isAvailable() && !hasReferences();
1246   }
1247 
1248   /**
1249    * @return true if region is mergeable
1250    */
1251   public boolean isMergeable() {
1252     if (!isAvailable()) {
1253       LOG.debug("Region " + getRegionInfo().getRegionNameAsString()
1254           + " is not mergeable because it is closing or closed");
1255       return false;
1256     }
1257     if (hasReferences()) {
1258       LOG.debug("Region " + getRegionInfo().getRegionNameAsString()
1259           + " is not mergeable because it has references");
1260       return false;
1261     }
1262 
1263     return true;
1264   }
1265 
1266   public boolean areWritesEnabled() {
1267     synchronized(this.writestate) {
1268       return this.writestate.writesEnabled;
1269     }
1270   }
1271 
1272   @VisibleForTesting
1273   public MultiVersionConcurrencyControl getMVCC() {
1274     return mvcc;
1275   }
1276 
1277   @Override
1278   public long getMaxFlushedSeqId() {
1279     return maxFlushedSeqId;
1280   }
1281 
1282   @Override
1283   public long getReadPoint(IsolationLevel isolationLevel) {
1284     if (isolationLevel != null && isolationLevel == IsolationLevel.READ_UNCOMMITTED) {
1285       // This scan can read even uncommitted transactions
1286       return Long.MAX_VALUE;
1287     }
1288     return mvcc.getReadPoint();
1289   }
1290 
1291   @Override
1292   public long getReadpoint(IsolationLevel isolationLevel) {
1293     return getReadPoint(isolationLevel);
1294   }
1295 
1296   @Override
1297   public boolean isLoadingCfsOnDemandDefault() {
1298     return this.isLoadingCfsOnDemandDefault;
1299   }
1300 
1301   /**
1302    * Close down this HRegion.  Flush the cache, shut down each HStore, don't
1303    * service any more calls.
1304    *
1305    * <p>This method could take some time to execute, so don't call it from a
1306    * time-sensitive thread.
1307    *
1308    * @return Map of all the storage files that the HRegion's component
1309    * HStores make use of, keyed by column family name.  Can be null if we are
1310    * not to close at this time or we are already closed.
1311    *
1312    * @throws IOException e
1313    * @throws DroppedSnapshotException Thrown when replay of wal is required
1314    * because a Snapshot was not properly persisted. The region is put in closing mode, and the
1315    * caller MUST abort after this.
1316    */
1317   public Map<byte[], List<StoreFile>> close() throws IOException {
1318     return close(false);
1319   }
1320 
1321   private final Object closeLock = new Object();
1322 
1323   /** Conf key for the periodic flush interval */
1324   public static final String MEMSTORE_PERIODIC_FLUSH_INTERVAL =
1325       "hbase.regionserver.optionalcacheflushinterval";
1326   /** Default interval for the memstore flush */
1327   public static final int DEFAULT_CACHE_FLUSH_INTERVAL = 3600000;
1328   /** Default interval for System tables memstore flush */
1329   public static final int SYSTEM_CACHE_FLUSH_INTERVAL = 300000; // 5 minutes
1330 
1331   /** Conf key to force a flush if there are already enough changes for one region in memstore */
1332   public static final String MEMSTORE_FLUSH_PER_CHANGES =
1333       "hbase.regionserver.flush.per.changes";
1334   public static final long DEFAULT_FLUSH_PER_CHANGES = 30000000; // 30 million
1335   /**
1336    * The following MAX_FLUSH_PER_CHANGES is large enough because each KeyValue has 20+ bytes of
1337    * overhead. Therefore, even 1G (one billion) empty KVs occupy at least 20GB of memstore for a single region.
1338    */
1339   public static final long MAX_FLUSH_PER_CHANGES = 1000000000; // 1G
1340 
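  /*
   * Illustrative sketch (added for clarity; not part of the original source): how the flush
   * settings above might be tuned. The Configuration instance "conf" is a hypothetical example.
   *
   *   Configuration conf = HBaseConfiguration.create();
   *   // flush a store whose oldest edit is older than 30 minutes (default is one hour)
   *   conf.setInt(HRegion.MEMSTORE_PERIODIC_FLUSH_INTERVAL, 30 * 60 * 1000);
   *   // force a flush once roughly 10 million edits have accumulated since the last flush
   *   conf.setLong(HRegion.MEMSTORE_FLUSH_PER_CHANGES, 10 * 1000 * 1000L);
   */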
1341   /**
1342    * Close down this HRegion.  Flush the cache unless the abort parameter is true,
1343    * shut down each HStore, and don't service any more calls.
1344    *
1345    * This method could take some time to execute, so don't call it from a
1346    * time-sensitive thread.
1347    *
1348    * @param abort true if server is aborting (only during testing)
1349    * @return Map of all the storage files that the HRegion's component
1350    * HStores make use of, keyed by column family name.  Can be null if
1351    * we are not to close at this time or we are already closed.
1352    *
1353    * @throws IOException e
1354    * @throws DroppedSnapshotException Thrown when replay of wal is required
1355    * because a Snapshot was not properly persisted. The region is put in closing mode, and the
1356    * caller MUST abort after this.
1357    */
1358   public Map<byte[], List<StoreFile>> close(final boolean abort) throws IOException {
1359     // Only allow one thread to close at a time. Serialize closes so that
1360     // concurrent attempts to close run up against each other.
1361     MonitoredTask status = TaskMonitor.get().createStatus(
1362         "Closing region " + this +
1363         (abort ? " due to abort" : ""));
1364 
1365     status.setStatus("Waiting for close lock");
1366     try {
1367       synchronized (closeLock) {
1368         return doClose(abort, status);
1369       }
1370     } finally {
1371       status.cleanup();
1372     }
1373   }
1374 
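  /*
   * Illustrative sketch (added for clarity; not part of the original source): how a caller might
   * close a region while honoring the contract documented above. The "region" and "rsServices"
   * references are hypothetical.
   *
   *   try {
   *     Map<byte[], List<StoreFile>> closedFiles = region.close(false);
   *     if (closedFiles == null) {
   *       // already closed, or the region judged that it should not close right now
   *     }
   *   } catch (DroppedSnapshotException dse) {
   *     // the memstore snapshot was not persisted; WAL replay is required, so the caller must abort
   *     rsServices.abort("Replay of WAL required. Forcing server shutdown", dse);
   *   }
   */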
1375   /**
1376    * Exposed for some very specific unit tests.
1377    */
1378   @VisibleForTesting
1379   public void setClosing(boolean closing) {
1380     this.closing.set(closing);
1381   }
1382 
1383   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UL_UNRELEASED_LOCK_EXCEPTION_PATH",
1384       justification="I think FindBugs is confused")
1385   private Map<byte[], List<StoreFile>> doClose(final boolean abort, MonitoredTask status)
1386       throws IOException {
1387     if (isClosed()) {
1388       LOG.warn("Region " + this + " already closed");
1389       return null;
1390     }
1391 
1392     if (coprocessorHost != null) {
1393       status.setStatus("Running coprocessor pre-close hooks");
1394       this.coprocessorHost.preClose(abort);
1395     }
1396 
1397     status.setStatus("Disabling compacts and flushes for region");
1398     boolean canFlush = true;
1399     synchronized (writestate) {
1400       // Disable compacting and flushing by background threads for this
1401       // region.
1402       canFlush = !writestate.readOnly;
1403       writestate.writesEnabled = false;
1404       LOG.debug("Closing " + this + ": disabling compactions & flushes");
1405       waitForFlushesAndCompactions();
1406     }
1407     // If we were not just flushing, is it worth doing a preflush...one
1408     // that will clear out the bulk of the memstore before we put up
1409     // the close flag?
1410     if (!abort && worthPreFlushing() && canFlush) {
1411       status.setStatus("Pre-flushing region before close");
1412       LOG.info("Running close preflush of " + getRegionInfo().getRegionNameAsString());
1413       try {
1414         internalFlushcache(status);
1415       } catch (IOException ioe) {
1416         // Failed to flush the region. Keep going.
1417         status.setStatus("Failed pre-flush " + this + "; " + ioe.getMessage());
1418       }
1419     }
1420 
1421     // block waiting for the lock for closing
1422     lock.writeLock().lock(); // FindBugs: Complains UL_UNRELEASED_LOCK_EXCEPTION_PATH but seems fine
1423     this.closing.set(true);
1424     status.setStatus("Disabling writes for close");
1425     try {
1426       if (this.isClosed()) {
1427         status.abort("Already got closed by another process");
1428         // SplitTransaction handles the null
1429         return null;
1430       }
1431       LOG.debug("Updates disabled for region " + this);
1432       // Don't flush the cache if we are aborting
1433       if (!abort && canFlush) {
1434         int flushCount = 0;
1435         while (this.memstoreSize.get() > 0) {
1436           try {
1437             if (flushCount++ > 0) {
1438               int actualFlushes = flushCount - 1;
1439               if (actualFlushes > 5) {
1440                 // If we tried 5 times and are unable to clear memory, abort
1441                 // so we do not lose data
1442                 throw new DroppedSnapshotException("Failed clearing memory after " +
1443                   actualFlushes + " attempts on region: " +
1444                     Bytes.toStringBinary(getRegionInfo().getRegionName()));
1445               }
1446               LOG.info("Running extra flush, " + actualFlushes +
1447                 " (carrying snapshot?) " + this);
1448             }
1449             internalFlushcache(status);
1450           } catch (IOException ioe) {
1451             status.setStatus("Failed flush " + this + ", putting online again");
1452             synchronized (writestate) {
1453               writestate.writesEnabled = true;
1454             }
1455             // Have to throw to upper layers.  I can't abort server from here.
1456             throw ioe;
1457           }
1458         }
1459       }
1460 
1461       Map<byte[], List<StoreFile>> result =
1462         new TreeMap<byte[], List<StoreFile>>(Bytes.BYTES_COMPARATOR);
1463       if (!stores.isEmpty()) {
1464         // initialize the thread pool for closing stores in parallel.
1465         ThreadPoolExecutor storeCloserThreadPool =
1466           getStoreOpenAndCloseThreadPool("StoreCloserThread-" +
1467             getRegionInfo().getRegionNameAsString());
1468         CompletionService<Pair<byte[], Collection<StoreFile>>> completionService =
1469           new ExecutorCompletionService<Pair<byte[], Collection<StoreFile>>>(storeCloserThreadPool);
1470 
1471         // close each store in parallel
1472         for (final Store store : stores.values()) {
1473           long flushableSize = store.getFlushableSize();
1474           if (!(abort || flushableSize == 0 || writestate.readOnly)) {
1475             if (getRegionServerServices() != null) {
1476               getRegionServerServices().abort("Assertion failed while closing store "
1477                 + getRegionInfo().getRegionNameAsString() + " " + store
1478                 + ". flushableSize expected=0, actual= " + flushableSize
1479                 + ". Current memstoreSize=" + getMemstoreSize() + ". Maybe a coprocessor "
1480                 + "operation failed and left the memstore in a partially updated state.", null);
1481             }
1482           }
1483           completionService
1484               .submit(new Callable<Pair<byte[], Collection<StoreFile>>>() {
1485                 @Override
1486                 public Pair<byte[], Collection<StoreFile>> call() throws IOException {
1487                   return new Pair<byte[], Collection<StoreFile>>(
1488                     store.getFamily().getName(), store.close());
1489                 }
1490               });
1491         }
1492         try {
1493           for (int i = 0; i < stores.size(); i++) {
1494             Future<Pair<byte[], Collection<StoreFile>>> future = completionService.take();
1495             Pair<byte[], Collection<StoreFile>> storeFiles = future.get();
1496             List<StoreFile> familyFiles = result.get(storeFiles.getFirst());
1497             if (familyFiles == null) {
1498               familyFiles = new ArrayList<StoreFile>();
1499               result.put(storeFiles.getFirst(), familyFiles);
1500             }
1501             familyFiles.addAll(storeFiles.getSecond());
1502           }
1503         } catch (InterruptedException e) {
1504           throw (InterruptedIOException)new InterruptedIOException().initCause(e);
1505         } catch (ExecutionException e) {
1506           throw new IOException(e.getCause());
1507         } finally {
1508           storeCloserThreadPool.shutdownNow();
1509         }
1510       }
1511 
1512       status.setStatus("Writing region close event to WAL");
1513       if (!abort && wal != null && getRegionServerServices() != null && !writestate.readOnly) {
1514         writeRegionCloseMarker(wal);
1515       }
1516 
1517       this.closed.set(true);
1518       if (!canFlush) {
1519         addAndGetGlobalMemstoreSize(-memstoreSize.get());
1520       } else if (memstoreSize.get() != 0) {
1521         LOG.error("Memstore size is " + memstoreSize.get());
1522       }
1523       if (coprocessorHost != null) {
1524         status.setStatus("Running coprocessor post-close hooks");
1525         this.coprocessorHost.postClose(abort);
1526       }
1527       if (this.metricsRegion != null) {
1528         this.metricsRegion.close();
1529       }
1530       if (this.metricsRegionWrapper != null) {
1531         Closeables.closeQuietly(this.metricsRegionWrapper);
1532       }
1533       // stop the Compacted hfile discharger
1534       if (this.compactedFileDischarger != null) this.compactedFileDischarger.cancel(true);
1535 
1536       status.markComplete("Closed");
1537       LOG.info("Closed " + this);
1538       return result;
1539     } finally {
1540       lock.writeLock().unlock();
1541     }
1542   }
1543 
1544   @Override
1545   public void waitForFlushesAndCompactions() {
1546     synchronized (writestate) {
1547       if (this.writestate.readOnly) {
1548         // we should not wait for replayed flushes if we are read only (for example in case the
1549         // region is a secondary replica).
1550         return;
1551       }
1552       boolean interrupted = false;
1553       try {
1554         while (writestate.compacting.get() > 0 || writestate.flushing) {
1555           LOG.debug("waiting for " + writestate.compacting + " compactions"
1556             + (writestate.flushing ? " & cache flush" : "") + " to complete for region " + this);
1557           try {
1558             writestate.wait();
1559           } catch (InterruptedException iex) {
1560             // essentially ignore and propagate the interrupt back up
1561             LOG.warn("Interrupted while waiting");
1562             interrupted = true;
1563           }
1564         }
1565       } finally {
1566         if (interrupted) {
1567           Thread.currentThread().interrupt();
1568         }
1569       }
1570     }
1571   }
1572 
1573   protected ThreadPoolExecutor getStoreOpenAndCloseThreadPool(
1574       final String threadNamePrefix) {
1575     int numStores = Math.max(1, this.htableDescriptor.getFamilies().size());
1576     int maxThreads = Math.min(numStores,
1577         conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1578             HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX));
1579     return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1580   }
1581 
1582   protected ThreadPoolExecutor getStoreFileOpenAndCloseThreadPool(
1583       final String threadNamePrefix) {
1584     int numStores = Math.max(1, this.htableDescriptor.getFamilies().size());
1585     int maxThreads = Math.max(1,
1586         conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1587             HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX)
1588             / numStores);
1589     return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1590   }
1591 
1592   static ThreadPoolExecutor getOpenAndCloseThreadPool(int maxThreads,
1593       final String threadNamePrefix) {
1594     return Threads.getBoundedCachedThreadPool(maxThreads, 30L, TimeUnit.SECONDS,
1595       new ThreadFactory() {
1596         private int count = 1;
1597 
1598         @Override
1599         public Thread newThread(Runnable r) {
1600           return new Thread(r, threadNamePrefix + "-" + count++);
1601         }
1602       });
1603   }
1604 
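  /*
   * Worked example (added for clarity; not part of the original source). With a table of 4 column
   * families and HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX set to 8 (a hypothetical value):
   *
   *   getStoreOpenAndCloseThreadPool(...)     -> maxThreads = min(4, 8)     = 4 (one thread per store)
   *   getStoreFileOpenAndCloseThreadPool(...) -> maxThreads = max(1, 8 / 4) = 2 (per-store file pool)
   *
   * Both are bounded cached pools whose idle threads expire after 30 seconds.
   */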
1605   /**
1606    * @return True if it's worth doing a flush before we put up the close flag.
1607    */
1608   private boolean worthPreFlushing() {
1609     return this.memstoreSize.get() >
1610       this.conf.getLong("hbase.hregion.preclose.flush.size", 1024 * 1024 * 5);
1611   }
1612 
1613   //////////////////////////////////////////////////////////////////////////////
1614   // HRegion accessors
1615   //////////////////////////////////////////////////////////////////////////////
1616 
1617   @Override
1618   public HTableDescriptor getTableDesc() {
1619     return this.htableDescriptor;
1620   }
1621 
1622   /** @return WAL in use for this region */
1623   public WAL getWAL() {
1624     return this.wal;
1625   }
1626 
1627   /**
1628    * @return split policy for this region.
1629    */
1630   public RegionSplitPolicy getSplitPolicy() {
1631     return this.splitPolicy;
1632   }
1633 
1634   /**
1635    * A split takes the config from the parent region & passes it to the daughter
1636    * region's constructor. If 'conf' was passed, you would end up using the HTD
1637    * of the parent region in addition to the new daughter HTD. Pass 'baseConf'
1638    * to the daughter regions to avoid this tricky dedupe problem.
1639    * @return Configuration object
1640    */
1641   Configuration getBaseConf() {
1642     return this.baseConf;
1643   }
1644 
1645   /** @return {@link FileSystem} being used by this region */
1646   public FileSystem getFilesystem() {
1647     return fs.getFileSystem();
1648   }
1649 
1650   /** @return the {@link HRegionFileSystem} used by this region */
1651   public HRegionFileSystem getRegionFileSystem() {
1652     return this.fs;
1653   }
1654 
1655   @Override
1656   public long getEarliestFlushTimeForAllStores() {
1657     return lastStoreFlushTimeMap.isEmpty() ? Long.MAX_VALUE : Collections.min(lastStoreFlushTimeMap
1658         .values());
1659   }
1660 
1661   @Override
1662   public long getOldestHfileTs(boolean majorCompactionOnly) throws IOException {
1663     long result = Long.MAX_VALUE;
1664     for (Store store : getStores()) {
1665       Collection<StoreFile> storeFiles = store.getStorefiles();
1666       if (storeFiles == null) continue;
1667       for (StoreFile file : storeFiles) {
1668         StoreFile.Reader sfReader = file.getReader();
1669         if (sfReader == null) continue;
1670         HFile.Reader reader = sfReader.getHFileReader();
1671         if (reader == null) continue;
1672         if (majorCompactionOnly) {
1673           byte[] val = reader.loadFileInfo().get(StoreFile.MAJOR_COMPACTION_KEY);
1674           // skip files that did not result from a major compaction
1675           if (val == null || !Bytes.toBoolean(val)) {
1676             continue;
1677           }
1678         }
1679         result = Math.min(result, reader.getFileContext().getFileCreateTime());
1680       }
1681     }
1682     return result == Long.MAX_VALUE ? 0 : result;
1683   }
1684 
1685   RegionLoad.Builder setCompleteSequenceId(RegionLoad.Builder regionLoadBldr) {
1686     long lastFlushOpSeqIdLocal = this.lastFlushOpSeqId;
1687     byte[] encodedRegionName = this.getRegionInfo().getEncodedNameAsBytes();
1688     regionLoadBldr.clearStoreCompleteSequenceId();
1689     for (byte[] familyName : this.stores.keySet()) {
1690       long earliest = this.wal.getEarliestMemstoreSeqNum(encodedRegionName, familyName);
1691       // Subtract 1 to go earlier than the current oldest, unflushed edit in memstore; this will
1692       // give us a sequence id that is for sure flushed. We want edit replay to start after this
1693       // sequence id in this region. If NO_SEQNUM, use the region's maximum flush id.
1694       long csid = (earliest == HConstants.NO_SEQNUM)? lastFlushOpSeqIdLocal: earliest - 1;
1695       regionLoadBldr.addStoreCompleteSequenceId(StoreSequenceId.
1696         newBuilder().setFamilyName(ByteString.copyFrom(familyName)).setSequenceId(csid).build());
1697     }
1698     return regionLoadBldr.setCompleteSequenceId(getMaxFlushedSeqId());
1699   }
1700 
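  /*
   * Worked example (added for clarity; not part of the original source). Suppose the WAL reports
   * the earliest unflushed memstore sequence id for a family as 150 and lastFlushOpSeqId is 120:
   *
   *   earliest == 150 (not NO_SEQNUM) -> csid = 150 - 1 = 149  // everything up to 149 is surely flushed
   *   earliest == NO_SEQNUM           -> csid = lastFlushOpSeqId = 120
   *
   * Edit replay for that family can then safely start after the reported csid.
   */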
1701   //////////////////////////////////////////////////////////////////////////////
1702   // HRegion maintenance.
1703   //
1704   // These methods are meant to be called periodically by the HRegionServer for
1705   // upkeep.
1706   //////////////////////////////////////////////////////////////////////////////
1707 
1708   /** @return returns size of largest HStore. */
1709   public long getLargestHStoreSize() {
1710     long size = 0;
1711     for (Store h : stores.values()) {
1712       long storeSize = h.getSize();
1713       if (storeSize > size) {
1714         size = storeSize;
1715       }
1716     }
1717     return size;
1718   }
1719 
1720   /*
1721    * Do preparation for pending compaction.
1722    * @throws IOException
1723    */
1724   protected void doRegionCompactionPrep() throws IOException {
1725   }
1726 
1727   @Override
1728   public void triggerMajorCompaction() throws IOException {
1729     for (Store s : getStores()) {
1730       s.triggerMajorCompaction();
1731     }
1732   }
1733 
1734   @Override
1735   public void compact(final boolean majorCompaction) throws IOException {
1736     if (majorCompaction) {
1737       triggerMajorCompaction();
1738     }
1739     for (Store s : getStores()) {
1740       CompactionContext compaction = s.requestCompaction();
1741       if (compaction != null) {
1742         ThroughputController controller = null;
1743         if (rsServices != null) {
1744           controller = CompactionThroughputControllerFactory.create(rsServices, conf);
1745         }
1746         if (controller == null) {
1747           controller = NoLimitThroughputController.INSTANCE;
1748         }
1749         compact(compaction, s, controller, null);
1750       }
1751     }
1752   }
1753 
1754   /**
1755    * This is a helper function that compacts all the stores synchronously.
1756    * It is used by utilities and testing.
1757    *
1758    * @throws IOException e
1759    */
1760   public void compactStores() throws IOException {
1761     for (Store s : getStores()) {
1762       CompactionContext compaction = s.requestCompaction();
1763       if (compaction != null) {
1764         compact(compaction, s, NoLimitThroughputController.INSTANCE, null);
1765       }
1766     }
1767   }
1768 
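  /*
   * Illustrative sketch (added for clarity; not part of the original source): how a test or
   * utility might drive synchronous compactions. The "region" reference is hypothetical.
   *
   *   region.compactStores();   // compact whatever each store selects
   *   region.compact(true);     // or: request a major compaction selection, then compact each store
   */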
1769   /**
1770    * This is a helper function that compacts the given store.
1771    * It is used by utilities and testing.
1772    *
1773    * @throws IOException e
1774    */
1775   @VisibleForTesting
1776   void compactStore(byte[] family, ThroughputController throughputController)
1777       throws IOException {
1778     Store s = getStore(family);
1779     CompactionContext compaction = s.requestCompaction();
1780     if (compaction != null) {
1781       compact(compaction, s, throughputController, null);
1782     }
1783   }
1784 
1785   /*
1786    * Called by compaction thread and after region is opened to compact the
1787    * HStores if necessary.
1788    *
1789    * <p>This operation could block for a long time, so don't call it from a
1790    * time-sensitive thread.
1791    *
1792    * Note that no locking is necessary at this level because compaction only
1793    * conflicts with a region split, and that cannot happen because the region
1794    * server does them sequentially and not in parallel.
1795    *
1796    * @param compaction Compaction details, obtained by requestCompaction()
1797    * @param throughputController
1798    * @return whether the compaction completed
1799    */
1800   public boolean compact(CompactionContext compaction, Store store,
1801       ThroughputController throughputController) throws IOException {
1802     return compact(compaction, store, throughputController, null);
1803   }
1804 
1805   public boolean compact(CompactionContext compaction, Store store,
1806       ThroughputController throughputController, User user) throws IOException {
1807     assert compaction != null && compaction.hasSelection();
1808     assert !compaction.getRequest().getFiles().isEmpty();
1809     if (this.closing.get() || this.closed.get()) {
1810       LOG.debug("Skipping compaction on " + this + " because closing/closed");
1811       store.cancelRequestedCompaction(compaction);
1812       return false;
1813     }
1814     MonitoredTask status = null;
1815     boolean requestNeedsCancellation = true;
1816     /*
1817      * We are trying to remove / relax the region read lock for compaction.
1818      * Let's look at the potential race conditions among the operations (user scan,
1819      * region split, region close and region bulk load).
1820      *
1821      *  user scan ---> region read lock
1822      *  region split --> region close first --> region write lock
1823      *  region close --> region write lock
1824      *  region bulk load --> region write lock
1825      *
1826      * read lock is compatible with read lock. ---> no problem with user scan/read
1827      * region bulk load does not cause a problem for compaction (no consistency problem; the store
1828      *  lock will help with the store file accounting).
1829      * They can run almost concurrently at the region level.
1830      *
1831      * The only remaining race condition is between the region close and compaction.
1832      * So we will evaluate, below, how region close intervenes with compaction if compaction does
1833      * not acquire region read lock.
1834      *
1835      * Here are the steps for compaction:
1836      * 1. obtain list of StoreFile's
1837      * 2. create StoreFileScanner's based on list from #1
1838      * 3. perform compaction and save resulting files under tmp dir
1839      * 4. swap in compacted files
1840      *
1841      * #1 is guarded by store lock. This patch does not change this --> no worse or better
1842      * For #2, we obtain smallest read point (for region) across all the Scanners (for both default
1843      * compactor and stripe compactor).
1844      * The read points are for user scans. Region keeps the read points for all currently open
1845      * user scanners.
1846      * Compaction needs to know the smallest read point so that during re-write of the hfiles,
1847      * it can remove the mvcc points for the cells if their mvccs are older than the smallest
1848      * since they are not needed anymore.
1849      * This will not conflict with compaction.
1850      * For #3, it can be performed in parallel to other operations.
1851      * For #4 bulk load and compaction don't conflict with each other on the region level
1852      *   (for multi-family atomicity).
1853      * Region close and compaction are guarded pretty well by the 'writestate'.
1854      * In HRegion#doClose(), we have :
1855      * synchronized (writestate) {
1856      *   // Disable compacting and flushing by background threads for this
1857      *   // region.
1858      *   canFlush = !writestate.readOnly;
1859      *   writestate.writesEnabled = false;
1860      *   LOG.debug("Closing " + this + ": disabling compactions & flushes");
1861      *   waitForFlushesAndCompactions();
1862      * }
1863      * waitForFlushesAndCompactions() would wait for writestate.compacting to come down to 0.
1864      * and in HRegion.compact()
1865      *  try {
1866      *    synchronized (writestate) {
1867      *    if (writestate.writesEnabled) {
1868      *      wasStateSet = true;
1869      *      ++writestate.compacting;
1870      *    } else {
1871      *      String msg = "NOT compacting region " + this + ". Writes disabled.";
1872      *      LOG.info(msg);
1873      *      status.abort(msg);
1874      *      return false;
1875      *    }
1876      *  }
1877      * Also in compactor.performCompaction():
1878      * check periodically to see if a system stop is requested
1879      * if (closeCheckInterval > 0) {
1880      *   bytesWritten += len;
1881      *   if (bytesWritten > closeCheckInterval) {
1882      *     bytesWritten = 0;
1883      *     if (!store.areWritesEnabled()) {
1884      *       progress.cancel();
1885      *       return false;
1886      *     }
1887      *   }
1888      * }
1889      */
1890     try {
1891       byte[] cf = Bytes.toBytes(store.getColumnFamilyName());
1892       if (stores.get(cf) != store) {
1893         LOG.warn("Store " + store.getColumnFamilyName() + " on region " + this
1894             + " has been re-instantiated, cancel this compaction request. "
1895             + " It may be caused by the rollback of a split transaction");
1896         return false;
1897       }
1898 
1899       status = TaskMonitor.get().createStatus("Compacting " + store + " in " + this);
1900       if (this.closed.get()) {
1901         String msg = "Skipping compaction on " + this + " because closed";
1902         LOG.debug(msg);
1903         status.abort(msg);
1904         return false;
1905       }
1906       boolean wasStateSet = false;
1907       try {
1908         synchronized (writestate) {
1909           if (writestate.writesEnabled) {
1910             wasStateSet = true;
1911             writestate.compacting.incrementAndGet();
1912           } else {
1913             String msg = "NOT compacting region " + this + ". Writes disabled.";
1914             LOG.info(msg);
1915             status.abort(msg);
1916             return false;
1917           }
1918         }
1919         LOG.info("Starting compaction on " + store + " in region " + this
1920             + (compaction.getRequest().isOffPeak()?" as an off-peak compaction":""));
1921         doRegionCompactionPrep();
1922         try {
1923           status.setStatus("Compacting store " + store);
1924           // We no longer need to cancel the request on the way out of this
1925           // method because Store#compact will clean up unconditionally
1926           requestNeedsCancellation = false;
1927           store.compact(compaction, throughputController, user);
1928         } catch (InterruptedIOException iioe) {
1929           String msg = "compaction interrupted";
1930           LOG.info(msg, iioe);
1931           status.abort(msg);
1932           return false;
1933         }
1934       } finally {
1935         if (wasStateSet) {
1936           synchronized (writestate) {
1937             writestate.compacting.decrementAndGet();
1938             if (writestate.compacting.get() <= 0) {
1939               writestate.notifyAll();
1940             }
1941           }
1942         }
1943       }
1944       status.markComplete("Compaction complete");
1945       return true;
1946     } finally {
1947       if (requestNeedsCancellation) store.cancelRequestedCompaction(compaction);
1948       if (status != null) status.cleanup();
1949     }
1950   }
1951 
1952   @Override
1953   public FlushResult flush(boolean force) throws IOException {
1954     return flushcache(force, false);
1955   }
1956 
1957   /**
1958    * Flush the cache.
1959    *
1960    * When this method is called the cache will be flushed unless:
1961    * <ol>
1962    *   <li>the cache is empty</li>
1963    *   <li>the region is closed.</li>
1964    *   <li>a flush is already in progress</li>
1965    *   <li>writes are disabled</li>
1966    * </ol>
1967    *
1968    * <p>This method may block for some time, so it should not be called from a
1969    * time-sensitive thread.
1970    * @param forceFlushAllStores whether we want to flush all stores
1971    * @param writeFlushRequestWalMarker whether to write the flush request marker to WAL
1972    * @return whether the flush succeeded and whether the region needs compacting
1973    *
1974    * @throws IOException general io exceptions
1975    * @throws DroppedSnapshotException Thrown when replay of wal is required
1976    * because a Snapshot was not properly persisted. The region is put in closing mode, and the
1977    * caller MUST abort after this.
1978    */
1979   public FlushResult flushcache(boolean forceFlushAllStores, boolean writeFlushRequestWalMarker)
1980       throws IOException {
1981     // fail-fast instead of waiting on the lock
1982     if (this.closing.get()) {
1983       String msg = "Skipping flush on " + this + " because closing";
1984       LOG.debug(msg);
1985       return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
1986     }
1987     MonitoredTask status = TaskMonitor.get().createStatus("Flushing " + this);
1988     status.setStatus("Acquiring readlock on region");
1989     // block waiting for the lock for flushing cache
1990     lock.readLock().lock();
1991     try {
1992       if (this.closed.get()) {
1993         String msg = "Skipping flush on " + this + " because closed";
1994         LOG.debug(msg);
1995         status.abort(msg);
1996         return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
1997       }
1998       if (coprocessorHost != null) {
1999         status.setStatus("Running coprocessor pre-flush hooks");
2000         coprocessorHost.preFlush();
2001       }
2002       // TODO: this should be managed within memstore with the snapshot, updated only after flush
2003       // successful
2004       if (numMutationsWithoutWAL.get() > 0) {
2005         numMutationsWithoutWAL.set(0);
2006         dataInMemoryWithoutWAL.set(0);
2007       }
2008       synchronized (writestate) {
2009         if (!writestate.flushing && writestate.writesEnabled) {
2010           this.writestate.flushing = true;
2011         } else {
2012           if (LOG.isDebugEnabled()) {
2013             LOG.debug("NOT flushing memstore for region " + this
2014                 + ", flushing=" + writestate.flushing + ", writesEnabled="
2015                 + writestate.writesEnabled);
2016           }
2017           String msg = "Not flushing since "
2018               + (writestate.flushing ? "already flushing"
2019               : "writes not enabled");
2020           status.abort(msg);
2021           return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
2022         }
2023       }
2024 
2025       try {
2026         Collection<Store> specificStoresToFlush =
2027             forceFlushAllStores ? stores.values() : flushPolicy.selectStoresToFlush();
2028         FlushResult fs = internalFlushcache(specificStoresToFlush,
2029           status, writeFlushRequestWalMarker);
2030 
2031         if (coprocessorHost != null) {
2032           status.setStatus("Running post-flush coprocessor hooks");
2033           coprocessorHost.postFlush();
2034         }
2035 
2036         status.markComplete("Flush successful");
2037         return fs;
2038       } finally {
2039         synchronized (writestate) {
2040           writestate.flushing = false;
2041           this.writestate.flushRequested = false;
2042           writestate.notifyAll();
2043         }
2044       }
2045     } finally {
2046       lock.readLock().unlock();
2047       status.cleanup();
2048     }
2049   }
2050 
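  /*
   * Illustrative sketch (added for clarity; not part of the original source): invoking a flush and
   * inspecting the result. The "region" reference is hypothetical, and the FlushResult accessors
   * shown are assumed from the Region.FlushResult interface.
   *
   *   FlushResult fr = region.flushcache(true, false);  // flush all stores, no request marker
   *   if (fr.isFlushSucceeded() && fr.isCompactionNeeded()) {
   *     // the flush produced enough store files that a compaction should be requested
   *   }
   */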
2051   /**
2052    * Should the store be flushed because it is old enough.
2053    * <p>
2054    * Every FlushPolicy should call this to determine whether a store is old enough to flush
2055    * (except when you always flush all stores). Otherwise the {@link #shouldFlush()} method will
2056    * always return true, which will generate a lot of flush requests.
2057    */
2058   boolean shouldFlushStore(Store store) {
2059     long earliest = this.wal.getEarliestMemstoreSeqNum(getRegionInfo().getEncodedNameAsBytes(),
2060       store.getFamily().getName()) - 1;
2061     if (earliest > 0 && earliest + flushPerChanges < mvcc.getReadPoint()) {
2062       if (LOG.isDebugEnabled()) {
2063         LOG.debug("Flush column family " + store.getColumnFamilyName() + " of " +
2064           getRegionInfo().getEncodedName() + " because unflushed sequenceid=" + earliest +
2065           " is > " + this.flushPerChanges + " from current=" + mvcc.getReadPoint());
2066       }
2067       return true;
2068     }
2069     if (this.flushCheckInterval <= 0) {
2070       return false;
2071     }
2072     long now = EnvironmentEdgeManager.currentTime();
2073     if (store.timeOfOldestEdit() < now - this.flushCheckInterval) {
2074       if (LOG.isDebugEnabled()) {
2075         LOG.debug("Flush column family: " + store.getColumnFamilyName() + " of " +
2076           getRegionInfo().getEncodedName() + " because time of oldest edit=" +
2077             store.timeOfOldestEdit() + " is > " + this.flushCheckInterval + " from now =" + now);
2078       }
2079       return true;
2080     }
2081     return false;
2082   }
2083 
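  /*
   * Worked example (added for clarity; not part of the original source). With flushPerChanges at
   * its default of 30,000,000 and the current mvcc read point at 50,000,000:
   *
   *   earliest unflushed sequence id for the store (minus 1) = 10,000,000
   *     -> 10,000,000 + 30,000,000 < 50,000,000, so the store is flushed to let the WAL trim old files
   *
   * If that check does not fire, the store is still flushed once its oldest edit is older than
   * flushCheckInterval (hbase.regionserver.optionalcacheflushinterval) milliseconds.
   */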
2084   /**
2085    * Should the memstore be flushed now
2086    */
2087   boolean shouldFlush(final StringBuffer whyFlush) {
2088     whyFlush.setLength(0);
2089     // This is a rough measure.
2090     if (this.maxFlushedSeqId > 0
2091           && (this.maxFlushedSeqId + this.flushPerChanges < this.mvcc.getReadPoint())) {
2092       whyFlush.append("more than max edits, " + this.flushPerChanges + ", since last flush");
2093       return true;
2094     }
2095     long modifiedFlushCheckInterval = flushCheckInterval;
2096     if (getRegionInfo().isSystemTable() &&
2097         getRegionInfo().getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
2098       modifiedFlushCheckInterval = SYSTEM_CACHE_FLUSH_INTERVAL;
2099     }
2100     if (modifiedFlushCheckInterval <= 0) { //disabled
2101       return false;
2102     }
2103     long now = EnvironmentEdgeManager.currentTime();
2104     //if we flushed in the recent past, we don't need to do again now
2105     if ((now - getEarliestFlushTimeForAllStores() < modifiedFlushCheckInterval)) {
2106       return false;
2107     }
2108     //since we didn't flush in the recent past, flush now if certain conditions
2109     //are met. Return true on first such memstore hit.
2110     for (Store s : getStores()) {
2111       if (s.timeOfOldestEdit() < now - modifiedFlushCheckInterval) {
2112         // we have an old enough edit in the memstore, flush
2113         whyFlush.append(s.toString() + " has an old edit so flush to free WALs");
2114         return true;
2115       }
2116     }
2117     return false;
2118   }
2119 
2120   /**
2121    * Flushing all stores.
2122    *
2123    * @see #internalFlushcache(Collection, MonitoredTask, boolean)
2124    */
2125   private FlushResult internalFlushcache(MonitoredTask status)
2126       throws IOException {
2127     return internalFlushcache(stores.values(), status, false);
2128   }
2129 
2130   /**
2131    * Flushing given stores.
2132    *
2133    * @see #internalFlushcache(WAL, long, Collection, MonitoredTask, boolean)
2134    */
2135   private FlushResult internalFlushcache(final Collection<Store> storesToFlush,
2136       MonitoredTask status, boolean writeFlushWalMarker) throws IOException {
2137     return internalFlushcache(this.wal, HConstants.NO_SEQNUM, storesToFlush,
2138         status, writeFlushWalMarker);
2139   }
2140 
2141   /**
2142    * Flush the memstore. Flushing the memstore is a little tricky. We have a lot
2143    * of updates in the memstore, all of which have also been written to the wal.
2144    * We need to write those updates in the memstore out to disk, while being
2145    * able to process reads/writes as much as possible during the flush
2146    * operation.
2147    * <p>
2148    * This method may block for some time. Every time you call it, we up the
2149    * region's sequence id even if we don't flush; i.e. the returned sequence id
2150    * will be at least one larger than that of the last edit applied to this region.
2151    * The returned id does not refer to an actual edit. The returned id can be used
2152    * for, say, installing a bulk loaded file just ahead of the last hfile that was
2153    * the result of this flush, etc.
2154    *
2155    * @param wal
2156    *          Null if we're NOT to go via wal.
2157    * @param myseqid
2158    *          The seqid to use if <code>wal</code> is null writing out flush
2159    *          file.
2160    * @param storesToFlush
2161    *          The list of stores to flush.
2162    * @return object describing the flush's state
2163    * @throws IOException
2164    *           general io exceptions
2165    * @throws DroppedSnapshotException
2166    *           Thrown when replay of wal is required because a Snapshot was not
2167    *           properly persisted.
2168    */
2169   protected FlushResult internalFlushcache(final WAL wal, final long myseqid,
2170       final Collection<Store> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker)
2171           throws IOException {
2172     PrepareFlushResult result
2173       = internalPrepareFlushCache(wal, myseqid, storesToFlush, status, writeFlushWalMarker);
2174     if (result.result == null) {
2175       return internalFlushCacheAndCommit(wal, status, result, storesToFlush);
2176     } else {
2177       return result.result; // early exit due to failure from prepare stage
2178     }
2179   }
2180 
2181   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="DLS_DEAD_LOCAL_STORE",
2182       justification="FindBugs seems confused about trxId")
2183   protected PrepareFlushResult internalPrepareFlushCache(final WAL wal, final long myseqid,
2184       final Collection<Store> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker)
2185   throws IOException {
2186     if (this.rsServices != null && this.rsServices.isAborted()) {
2187       // Don't flush when server aborting, it's unsafe
2188       throw new IOException("Aborting flush because server is aborted...");
2189     }
2190     final long startTime = EnvironmentEdgeManager.currentTime();
2191     // If nothing to flush, return, but we need to safely update the region sequence id
2192     if (this.memstoreSize.get() <= 0) {
2193       // Take an update lock because we are about to change the sequence id and we want the
2194       // sequence id to be at the boundary of the empty memstore.
2195       MultiVersionConcurrencyControl.WriteEntry writeEntry = null;
2196       this.updatesLock.writeLock().lock();
2197       try {
2198         if (this.memstoreSize.get() <= 0) {
2199           // Presume that if there are still no edits in the memstore, then there are no edits for
2200           // this region out in the WAL subsystem so no need to do any trickery clearing out
2201           // edits in the WAL system. Up the sequence number so the resulting flush id is for
2202           // sure just beyond the last appended region edit (useful as a marker when bulk loading,
2203           // etc.). NOTE: The writeEntry write number is NOT in the WAL.. there is no WAL writing
2204           // here.
2205           if (wal != null) {
2206             writeEntry = mvcc.begin();
2207             long flushOpSeqId = writeEntry.getWriteNumber();
2208             FlushResult flushResult = new FlushResultImpl(
2209                 FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY,
2210                 flushOpSeqId,
2211                 "Nothing to flush",
2212                 writeFlushRequestMarkerToWAL(wal, writeFlushWalMarker));
2213             // TODO: Lets see if we hang here, if there is a scenario where an outstanding reader
2214             // with a read point is in advance of this write point.
2215             mvcc.completeAndWait(writeEntry);
2216             writeEntry = null;
2217             return new PrepareFlushResult(flushResult, myseqid);
2218           } else {
2219             return new PrepareFlushResult(
2220               new FlushResultImpl(
2221                   FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY,
2222                   "Nothing to flush",
2223                   false),
2224               myseqid);
2225           }
2226         }
2227       } finally {
2228         this.updatesLock.writeLock().unlock();
2229         if (writeEntry != null) {
2230           mvcc.complete(writeEntry);
2231         }
2232       }
2233     }
2234 
2235     if (LOG.isInfoEnabled()) {
2236       // Log a fat line detailing what is being flushed.
2237       StringBuilder perCfExtras = null;
2238       if (!isAllFamilies(storesToFlush)) {
2239         perCfExtras = new StringBuilder();
2240         for (Store store: storesToFlush) {
2241           perCfExtras.append("; ").append(store.getColumnFamilyName());
2242           perCfExtras.append("=").append(StringUtils.byteDesc(store.getMemStoreSize()));
2243         }
2244       }
2245       LOG.info("Flushing " + storesToFlush.size() + "/" + stores.size() +
2246         " column families, memstore=" + StringUtils.byteDesc(this.memstoreSize.get()) +
2247         ((perCfExtras != null && perCfExtras.length() > 0)? perCfExtras.toString(): "") +
2248         ((wal != null) ? "" : "; WAL is null, using passed sequenceid=" + myseqid));
2249     }
2250     // Stop updates while we snapshot the memstores of this region's stores. We only have
2251     // to do this for a moment.  It is quick. We also set the memstore size to zero here before we
2252     // allow updates again so its value will represent the size of the updates received
2253     // during flush
2254 
2255     // We have to take an update lock during the snapshot, or else a write could end up in both the
2256     // snapshot and the memstore (which would make atomic rows difficult).
2257     status.setStatus("Obtaining lock to block concurrent updates");
2258     // block waiting for the lock for internal flush
2259     this.updatesLock.writeLock().lock();
2260     status.setStatus("Preparing to flush by snapshotting stores in " +
2261       getRegionInfo().getEncodedName());
2262     long totalFlushableSizeOfFlushableStores = 0;
2263 
2264     Set<byte[]> flushedFamilyNames = new HashSet<byte[]>();
2265     for (Store store: storesToFlush) {
2266       flushedFamilyNames.add(store.getFamily().getName());
2267     }
2268 
2269     TreeMap<byte[], StoreFlushContext> storeFlushCtxs
2270       = new TreeMap<byte[], StoreFlushContext>(Bytes.BYTES_COMPARATOR);
2271     TreeMap<byte[], List<Path>> committedFiles = new TreeMap<byte[], List<Path>>(
2272         Bytes.BYTES_COMPARATOR);
2273     TreeMap<byte[], Long> storeFlushableSize
2274         = new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
2275     // The sequence id of this flush operation which is used to log FlushMarker and pass to
2276     // createFlushContext to use as the store file's sequence id. It can be in advance of edits
2277     // still in the memstore, edits that are in other column families yet to be flushed.
2278     long flushOpSeqId = HConstants.NO_SEQNUM;
2279     // The max flushed sequence id after this flush operation completes. All edits in memstore
2280     // will be in advance of this sequence id.
2281     long flushedSeqId = HConstants.NO_SEQNUM;
2282     byte[] encodedRegionName = getRegionInfo().getEncodedNameAsBytes();
2283 
2284     long trxId = 0;
2285     MultiVersionConcurrencyControl.WriteEntry writeEntry = mvcc.begin();
2286     try {
2287       try {
2288         if (wal != null) {
2289           Long earliestUnflushedSequenceIdForTheRegion =
2290             wal.startCacheFlush(encodedRegionName, flushedFamilyNames);
2291           if (earliestUnflushedSequenceIdForTheRegion == null) {
2292             // This should never happen. This is how startCacheFlush signals flush cannot proceed.
2293             String msg = this.getRegionInfo().getEncodedName() + " flush aborted; WAL closing.";
2294             status.setStatus(msg);
2295             return new PrepareFlushResult(
2296               new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false),
2297               myseqid);
2298           }
2299           flushOpSeqId = getNextSequenceId(wal);
2300           // Back up 1 (subtract 1 from the oldest sequence id in the memstore) to get the last 'flushed' edit
2301           flushedSeqId =
2302             earliestUnflushedSequenceIdForTheRegion.longValue() == HConstants.NO_SEQNUM?
2303               flushOpSeqId: earliestUnflushedSequenceIdForTheRegion.longValue() - 1;
2304         } else {
2305           // use the provided sequence Id as WAL is not being used for this flush.
2306           flushedSeqId = flushOpSeqId = myseqid;
2307         }
2308 
2309         for (Store s : storesToFlush) {
2310           totalFlushableSizeOfFlushableStores += s.getFlushableSize();
2311           storeFlushCtxs.put(s.getFamily().getName(), s.createFlushContext(flushOpSeqId));
2312           committedFiles.put(s.getFamily().getName(), null); // for writing stores to WAL
2313           storeFlushableSize.put(s.getFamily().getName(), s.getFlushableSize());
2314         }
2315 
2316         // write the snapshot start to WAL
2317         if (wal != null && !writestate.readOnly) {
2318           FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.START_FLUSH,
2319             getRegionInfo(), flushOpSeqId, committedFiles);
2320           // no sync. Sync is below where we do not hold the updates lock
2321           trxId = WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
2322             desc, false, mvcc);
2323         }
2324 
2325         // Prepare flush (take a snapshot)
2326         for (StoreFlushContext flush : storeFlushCtxs.values()) {
2327           flush.prepare();
2328         }
2329       } catch (IOException ex) {
2330         if (wal != null) {
2331           if (trxId > 0) { // check whether we have already written START_FLUSH to WAL
2332             try {
2333               FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH,
2334                 getRegionInfo(), flushOpSeqId, committedFiles);
2335               WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
2336                 desc, false, mvcc);
2337             } catch (Throwable t) {
2338               LOG.warn("Received unexpected exception trying to write ABORT_FLUSH marker to WAL:" +
2339                   StringUtils.stringifyException(t));
2340               // ignore this since we will be aborting the RS with DSE.
2341             }
2342           }
2343           // we have called wal.startCacheFlush(), now we have to abort it
2344           wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2345           throw ex; // let upper layers deal with it.
2346         }
2347       } finally {
2348         this.updatesLock.writeLock().unlock();
2349       }
2350       String s = "Finished memstore snapshotting " + this +
2351         ", syncing WAL and waiting on mvcc, flushsize=" + totalFlushableSizeOfFlushableStores;
2352       status.setStatus(s);
2353       if (LOG.isTraceEnabled()) LOG.trace(s);
2354       // sync unflushed WAL changes
2355       // see HBASE-8208 for details
2356       if (wal != null) {
2357         try {
2358           wal.sync(); // ensure that flush marker is sync'ed
2359         } catch (IOException ioe) {
2360           wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2361           throw ioe;
2362         }
2363       }
2364 
2365       // wait for all in-progress transactions to commit to WAL before
2366       // we can start the flush. This prevents
2367       // uncommitted transactions from being written into HFiles.
2368       // We have to block before we start the flush, otherwise keys that
2369       // were removed via a rollbackMemstore could be written to Hfiles.
2370       mvcc.completeAndWait(writeEntry);
2371       // set writeEntry to null to prevent mvcc.complete from being called again inside finally
2372       // block
2373       writeEntry = null;
2374     } finally {
2375       if (writeEntry != null) {
2376         // In case of failure just mark current writeEntry as complete.
2377         mvcc.complete(writeEntry);
2378       }
2379     }
2380     return new PrepareFlushResult(storeFlushCtxs, committedFiles, storeFlushableSize, startTime,
2381         flushOpSeqId, flushedSeqId, totalFlushableSizeOfFlushableStores);
2382   }
2383 
2384   /**
2385    * @param families the stores being flushed
2386    * @return True if the passed Collection contains all families in the region.
2387    */
2388   private boolean isAllFamilies(final Collection<Store> families) {
2389     return families == null || this.stores.size() == families.size();
2390   }
2391 
2392   /**
2393    * Writes a marker to the WAL indicating that a flush was requested but could not be completed,
2394    * for various reasons. Ignores exceptions from the WAL. Returns whether the write succeeded.
2395    * @param wal
2396    * @return whether WAL write was successful
2397    */
2398   private boolean writeFlushRequestMarkerToWAL(WAL wal, boolean writeFlushWalMarker) {
2399     if (writeFlushWalMarker && wal != null && !writestate.readOnly) {
2400       FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.CANNOT_FLUSH,
2401         getRegionInfo(), -1, new TreeMap<byte[], List<Path>>(Bytes.BYTES_COMPARATOR));
2402       try {
2403         WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
2404           desc, true, mvcc);
2405         return true;
2406       } catch (IOException e) {
2407         LOG.warn(getRegionInfo().getEncodedName() + " : "
2408             + "Received exception while trying to write the flush request to wal", e);
2409       }
2410     }
2411     return false;
2412   }
2413 
2414   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY",
2415       justification="Intentional; notify is about completed flush")
2416   protected FlushResult internalFlushCacheAndCommit(
2417         final WAL wal, MonitoredTask status, final PrepareFlushResult prepareResult,
2418         final Collection<Store> storesToFlush)
2419     throws IOException {
2420 
2421     // prepare flush context is carried via PrepareFlushResult
2422     TreeMap<byte[], StoreFlushContext> storeFlushCtxs = prepareResult.storeFlushCtxs;
2423     TreeMap<byte[], List<Path>> committedFiles = prepareResult.committedFiles;
2424     long startTime = prepareResult.startTime;
2425     long flushOpSeqId = prepareResult.flushOpSeqId;
2426     long flushedSeqId = prepareResult.flushedSeqId;
2427     long totalFlushableSizeOfFlushableStores = prepareResult.totalFlushableSize;
2428 
2429     String s = "Flushing stores of " + this;
2430     status.setStatus(s);
2431     if (LOG.isTraceEnabled()) LOG.trace(s);
2432 
2433     // Any failure from here on out will be catastrophic requiring server
2434     // restart so wal content can be replayed and put back into the memstore.
2435     // Otherwise, the snapshot content, while backed up in the wal, will not
2436     // be part of the currently running server's state.
2437     boolean compactionRequested = false;
2438     try {
2439       // A.  Flush memstore to all the HStores.
2440       // Keep running vector of all store files that includes both old and the
2441       // just-made new flush store file. The new flushed file is still in the
2442       // tmp directory.
2443 
2444       for (StoreFlushContext flush : storeFlushCtxs.values()) {
2445         flush.flushCache(status);
2446       }
2447 
2448       // Switch snapshot (in memstore) -> new hfile (thus causing
2449       // all the store scanners to reset/reseek).
2450       Iterator<Store> it = storesToFlush.iterator();
2451       // stores.values() and storeFlushCtxs have same order
2452       for (StoreFlushContext flush : storeFlushCtxs.values()) {
2453         boolean needsCompaction = flush.commit(status);
2454         if (needsCompaction) {
2455           compactionRequested = true;
2456         }
2457         byte[] storeName = it.next().getFamily().getName();
2458         List<Path> storeCommittedFiles = flush.getCommittedFiles();
2459         committedFiles.put(storeName, storeCommittedFiles);
2460         // The flush committed no files, indicating the flush was empty or was canceled
2461         if (storeCommittedFiles == null || storeCommittedFiles.isEmpty()) {
2462           totalFlushableSizeOfFlushableStores -= prepareResult.storeFlushableSize.get(storeName);
2463         }
2464       }
2465       storeFlushCtxs.clear();
2466 
2467       // Set down the memstore size by amount of flush.
2468       this.addAndGetGlobalMemstoreSize(-totalFlushableSizeOfFlushableStores);
2469 
2470       if (wal != null) {
2471         // write flush marker to WAL. If fail, we should throw DroppedSnapshotException
2472         FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.COMMIT_FLUSH,
2473           getRegionInfo(), flushOpSeqId, committedFiles);
2474         WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
2475           desc, true, mvcc);
2476       }
2477     } catch (Throwable t) {
2478       // An exception here means that the snapshot was not persisted.
2479       // The wal needs to be replayed so its content is restored to memstore.
2480       // Currently, only a server restart will do this.
2481       // We used to only catch IOEs but its possible that we'd get other
2482       // exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch
2483       // all and sundry.
2484       if (wal != null) {
2485         try {
2486           FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH,
2487             getRegionInfo(), flushOpSeqId, committedFiles);
2488           WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
2489             desc, false, mvcc);
2490         } catch (Throwable ex) {
2491           LOG.warn(getRegionInfo().getEncodedName() + " : "
2492               + "failed writing ABORT_FLUSH marker to WAL", ex);
2493           // ignore this since we will be aborting the RS with DSE.
2494         }
2495         wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2496       }
2497       DroppedSnapshotException dse = new DroppedSnapshotException("region: " +
2498           Bytes.toStringBinary(getRegionInfo().getRegionName()));
2499       dse.initCause(t);
2500       status.abort("Flush failed: " + StringUtils.stringifyException(t));
2501 
2502       // Callers for flushcache() should catch DroppedSnapshotException and abort the region server.
2503       // However, since we may have the region read lock, we cannot call close(true) here since
2504       // we cannot promote to a write lock. Instead we are setting closing so that all other region
2505       // operations except for close will be rejected.
2506       this.closing.set(true);
2507 
2508       if (rsServices != null) {
2509         // This is a safeguard against the case where the caller fails to explicitly handle aborting
2510         rsServices.abort("Replay of WAL required. Forcing server shutdown", dse);
2511       }
2512 
2513       throw dse;
2514     }
2515 
2516     // If we get to here, the HStores have been written.
2517     if (wal != null) {
2518       wal.completeCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2519     }
2520 
2521     // Record latest flush time
2522     for (Store store: storesToFlush) {
2523       this.lastStoreFlushTimeMap.put(store, startTime);
2524     }
2525 
2526     this.maxFlushedSeqId = flushedSeqId;
2527     this.lastFlushOpSeqId = flushOpSeqId;
2528 
2529     // C. Finally notify anyone waiting on memstore to clear:
2530     // e.g. checkResources().
2531     synchronized (this) {
2532       notifyAll(); // FindBugs NN_NAKED_NOTIFY
2533     }
2534 
2535     long time = EnvironmentEdgeManager.currentTime() - startTime;
2536     long memstoresize = this.memstoreSize.get();
2537     String msg = "Finished memstore flush of ~"
2538         + StringUtils.byteDesc(totalFlushableSizeOfFlushableStores) + "/"
2539         + totalFlushableSizeOfFlushableStores + ", currentsize="
2540         + StringUtils.byteDesc(memstoresize) + "/" + memstoresize
2541         + " for region " + this + " in " + time + "ms, sequenceid="
2542         + flushOpSeqId +  ", compaction requested=" + compactionRequested
2543         + ((wal == null) ? "; wal=null" : "");
2544     LOG.info(msg);
2545     status.setStatus(msg);
2546 
2547     return new FlushResultImpl(compactionRequested ?
2548         FlushResult.Result.FLUSHED_COMPACTION_NEEDED :
2549           FlushResult.Result.FLUSHED_NO_COMPACTION_NEEDED,
2550         flushOpSeqId);
2551   }
2552 
2553   /**
2554    * Method to safely get the next sequence number.
2555    * @return Next sequence number unassociated with any actual edit.
2556    * @throws IOException
2557    */
2558   @VisibleForTesting
2559   protected long getNextSequenceId(final WAL wal) throws IOException {
2560     // TODO: For review. Putting an empty edit in to get a sequenceid out will not work if the
2561     // WAL is banjaxed... if it has gotten an exception and the WAL has not yet been rolled or
2562     // aborted. In this case, we'll just get stuck here. For now, until HBASE-12751, just have
2563     // a timeout. May happen in tests after we tightened the semantic via HBASE-14317.
2564     // Also, the getSequenceId blocks on a latch. There is no global list of outstanding latches
2565     // so if an abort or stop happens, there is no way to complete them.
2566     WALKey key = this.appendEmptyEdit(wal);
2567     mvcc.complete(key.getWriteEntry());
2568     return key.getSequenceId(this.maxWaitForSeqId);
2569   }
2570 
2571   //////////////////////////////////////////////////////////////////////////////
2572   // get() methods for client use.
2573   //////////////////////////////////////////////////////////////////////////////
2574 
2575   @Override
2576   public RegionScanner getScanner(Scan scan) throws IOException {
2577    return getScanner(scan, true);
2578   }
2579 
2580   @Override
2581   public RegionScanner getScanner(Scan scan, List<KeyValueScanner> additionalScanners)
2582       throws IOException {
2583     return getScanner(scan, additionalScanners, true);
2584   }
2585 
2586   public RegionScanner getScanner(Scan scan, boolean copyCellsFromSharedMem) throws IOException {
2587     RegionScanner scanner = getScanner(scan, null, copyCellsFromSharedMem);
2588     return scanner;
2589   }
2590 
2591   protected RegionScanner getScanner(Scan scan, List<KeyValueScanner> additionalScanners,
2592       boolean copyCellsFromSharedMem) throws IOException {
2593     startRegionOperation(Operation.SCAN);
2594     try {
2595       // Verify families are all valid
2596       if (!scan.hasFamilies()) {
2597         // Adding all families to scanner
2598         for (byte[] family : this.htableDescriptor.getFamiliesKeys()) {
2599           scan.addFamily(family);
2600         }
2601       } else {
2602         for (byte[] family : scan.getFamilyMap().keySet()) {
2603           checkFamily(family);
2604         }
2605       }
2606       return instantiateRegionScanner(scan, additionalScanners, copyCellsFromSharedMem);
2607     } finally {
2608       closeRegionOperation(Operation.SCAN);
2609     }
2610   }
2611 
2612   protected RegionScanner instantiateRegionScanner(Scan scan,
2613       List<KeyValueScanner> additionalScanners, boolean copyCellsFromSharedMem) throws IOException {
2614     if (scan.isReversed()) {
2615       if (scan.getFilter() != null) {
2616         scan.getFilter().setReversed(true);
2617       }
2618       return new ReversedRegionScannerImpl(scan, additionalScanners, this, copyCellsFromSharedMem);
2619     }
2620     return new RegionScannerImpl(scan, additionalScanners, this, copyCellsFromSharedMem);
2621   }
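
The getScanner() overloads above are the server-side entry point for scans (for example from a coprocessor or a test that already holds a Region reference). The sketch below is a minimal, hedged illustration of driving a RegionScanner directly; the region variable and the "cf" family name are placeholders, not part of this class.

    Scan scan = new Scan();
    scan.addFamily(Bytes.toBytes("cf"));            // otherwise every family is added for us
    RegionScanner scanner = region.getScanner(scan);
    try {
      List<Cell> cells = new ArrayList<Cell>();
      boolean moreRows;
      do {
        cells.clear();
        moreRows = scanner.next(cells);             // fills the next batch of cells (typically one row)
        for (Cell cell : cells) {
          // process the cell ...
        }
      } while (moreRows);
    } finally {
      scanner.close();
    }
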
2622 
2623   @Override
2624   public void prepareDelete(Delete delete) throws IOException {
2625     // Check to see if this is a whole-row delete (no families specified)
2626     if(delete.getFamilyCellMap().isEmpty()){
2627       for(byte [] family : this.htableDescriptor.getFamiliesKeys()){
2628         // Don't eat the timestamp
2629         delete.addFamily(family, delete.getTimeStamp());
2630       }
2631     } else {
2632       for(byte [] family : delete.getFamilyCellMap().keySet()) {
2633         if(family == null) {
2634           throw new NoSuchColumnFamilyException("Empty family is invalid");
2635         }
2636         checkFamily(family);
2637       }
2638     }
2639   }
2640 
2641   @Override
2642   public void delete(Delete delete) throws IOException {
2643     checkReadOnly();
2644     checkResources();
2645     startRegionOperation(Operation.DELETE);
2646     try {
2647       delete.getRow();
2648       // All edits for the given row (across all column families) must happen atomically.
2649       doBatchMutate(delete);
2650     } finally {
2651       closeRegionOperation(Operation.DELETE);
2652     }
2653   }
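
As prepareDelete() above shows, a Delete that names no column families is expanded to cover every family in the table, i.e. a whole-row delete. A short hedged sketch, where region and the row/family names are placeholders:

    // Whole-row delete: no families added, prepareDelete() fills them in.
    Delete wholeRow = new Delete(Bytes.toBytes("r1"));
    region.delete(wholeRow);

    // Delete limited to a single family.
    Delete oneFamily = new Delete(Bytes.toBytes("r1"));
    oneFamily.addFamily(Bytes.toBytes("cf"));
    region.delete(oneFamily);
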
2654 
2655   /**
2656    * Row needed by below method.
2657    */
2658   private static final byte [] FOR_UNIT_TESTS_ONLY = Bytes.toBytes("ForUnitTestsOnly");
2659 
2660   /**
2661    * This is used only by unit tests. Not required to be a public API.
2662    * @param familyMap map of family to edits for the given family.
2663    * @throws IOException
2664    */
2665   void delete(NavigableMap<byte[], List<Cell>> familyMap,
2666       Durability durability) throws IOException {
2667     Delete delete = new Delete(FOR_UNIT_TESTS_ONLY);
2668     delete.setFamilyCellMap(familyMap);
2669     delete.setDurability(durability);
2670     doBatchMutate(delete);
2671   }
2672 
2673   @Override
2674   public void prepareDeleteTimestamps(Mutation mutation, Map<byte[], List<Cell>> familyMap,
2675       byte[] byteNow) throws IOException {
2676     for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
2677 
2678       byte[] family = e.getKey();
2679       List<Cell> cells = e.getValue();
2680       assert cells instanceof RandomAccess;
2681 
2682       Map<byte[], Integer> kvCount = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
2683       int listSize = cells.size();
2684       for (int i=0; i < listSize; i++) {
2685         Cell cell = cells.get(i);
2686         //  Check if time is LATEST, change to time of most recent addition if so
2687         //  This is expensive.
2688         if (cell.getTimestamp() == HConstants.LATEST_TIMESTAMP && CellUtil.isDeleteType(cell)) {
2689           byte[] qual = CellUtil.cloneQualifier(cell);
2690           if (qual == null) qual = HConstants.EMPTY_BYTE_ARRAY;
2691 
2692           Integer count = kvCount.get(qual);
2693           if (count == null) {
2694             kvCount.put(qual, 1);
2695           } else {
2696             kvCount.put(qual, count + 1);
2697           }
2698           count = kvCount.get(qual);
2699 
2700           Get get = new Get(CellUtil.cloneRow(cell));
2701           get.setMaxVersions(count);
2702           get.addColumn(family, qual);
2703           if (coprocessorHost != null) {
2704             if (!coprocessorHost.prePrepareTimeStampForDeleteVersion(mutation, cell,
2705                 byteNow, get)) {
2706               updateDeleteLatestVersionTimeStamp(cell, get, count, byteNow);
2707             }
2708           } else {
2709             updateDeleteLatestVersionTimeStamp(cell, get, count, byteNow);
2710           }
2711         } else {
2712           CellUtil.updateLatestStamp(cell, byteNow, 0);
2713         }
2714       }
2715     }
2716   }
2717 
2718   void updateDeleteLatestVersionTimeStamp(Cell cell, Get get, int count, byte[] byteNow)
2719       throws IOException {
2720     List<Cell> result = get(get, false);
2721 
2722     if (result.size() < count) {
2723       // Nothing to delete
2724       CellUtil.updateLatestStamp(cell, byteNow, 0);
2725       return;
2726     }
2727     if (result.size() > count) {
2728       throw new RuntimeException("Unexpected size: " + result.size());
2729     }
2730     Cell getCell = result.get(count - 1);
2731     CellUtil.setTimestamp(cell, getCell.getTimestamp());
2732   }
2733 
2734   @Override
2735   public void put(Put put) throws IOException {
2736     checkReadOnly();
2737 
2738     // Do a rough check that we have resources to accept a write.  The check is
2739     // 'rough' in that between the resource check and the call to obtain a
2740     // read lock, resources may run out.  For now, the thought is that this
2741     // will be extremely rare; we'll deal with it when it happens.
2742     checkResources();
2743     startRegionOperation(Operation.PUT);
2744     try {
2745       // All edits for the given row (across all column families) must happen atomically.
2746       doBatchMutate(put);
2747     } finally {
2748       closeRegionOperation(Operation.PUT);
2749     }
2750   }
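
A minimal usage sketch for put(); internally it is routed through doBatchMutate()/batchMutate(). The region variable and the row/family/qualifier names are placeholders:

    Put put = new Put(Bytes.toBytes("r1"));
    put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("value"));
    region.put(put);   // atomic across all column families of the row
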
2751 
2752   /**
2753    * Struct-like class that tracks the progress of a batch operation,
2754    * accumulating status codes and tracking the index at which processing
2755    * is proceeding.
2756    */
2757   private abstract static class BatchOperationInProgress<T> {
2758     T[] operations;
2759     int nextIndexToProcess = 0;
2760     OperationStatus[] retCodeDetails;
2761     WALEdit[] walEditsFromCoprocessors;
2762 
2763     public BatchOperationInProgress(T[] operations) {
2764       this.operations = operations;
2765       this.retCodeDetails = new OperationStatus[operations.length];
2766       this.walEditsFromCoprocessors = new WALEdit[operations.length];
2767       Arrays.fill(this.retCodeDetails, OperationStatus.NOT_RUN);
2768     }
2769 
2770     public abstract Mutation getMutation(int index);
2771     public abstract long getNonceGroup(int index);
2772     public abstract long getNonce(int index);
2773     /** This method is potentially expensive and should only be used for non-replay CP path. */
2774     public abstract Mutation[] getMutationsForCoprocs();
2775     public abstract boolean isInReplay();
2776     public abstract long getReplaySequenceId();
2777 
2778     public boolean isDone() {
2779       return nextIndexToProcess == operations.length;
2780     }
2781   }
2782 
2783   private static class MutationBatch extends BatchOperationInProgress<Mutation> {
2784     private long nonceGroup;
2785     private long nonce;
2786     public MutationBatch(Mutation[] operations, long nonceGroup, long nonce) {
2787       super(operations);
2788       this.nonceGroup = nonceGroup;
2789       this.nonce = nonce;
2790     }
2791 
2792     @Override
2793     public Mutation getMutation(int index) {
2794       return this.operations[index];
2795     }
2796 
2797     @Override
2798     public long getNonceGroup(int index) {
2799       return nonceGroup;
2800     }
2801 
2802     @Override
2803     public long getNonce(int index) {
2804       return nonce;
2805     }
2806 
2807     @Override
2808     public Mutation[] getMutationsForCoprocs() {
2809       return this.operations;
2810     }
2811 
2812     @Override
2813     public boolean isInReplay() {
2814       return false;
2815     }
2816 
2817     @Override
2818     public long getReplaySequenceId() {
2819       return 0;
2820     }
2821   }
2822 
2823   private static class ReplayBatch extends BatchOperationInProgress<MutationReplay> {
2824     private long replaySeqId = 0;
2825     public ReplayBatch(MutationReplay[] operations, long seqId) {
2826       super(operations);
2827       this.replaySeqId = seqId;
2828     }
2829 
2830     @Override
2831     public Mutation getMutation(int index) {
2832       return this.operations[index].mutation;
2833     }
2834 
2835     @Override
2836     public long getNonceGroup(int index) {
2837       return this.operations[index].nonceGroup;
2838     }
2839 
2840     @Override
2841     public long getNonce(int index) {
2842       return this.operations[index].nonce;
2843     }
2844 
2845     @Override
2846     public Mutation[] getMutationsForCoprocs() {
2847       assert false;
2848       throw new RuntimeException("Should not be called for replay batch");
2849     }
2850 
2851     @Override
2852     public boolean isInReplay() {
2853       return true;
2854     }
2855 
2856     @Override
2857     public long getReplaySequenceId() {
2858       return this.replaySeqId;
2859     }
2860   }
2861 
2862   @Override
2863   public OperationStatus[] batchMutate(Mutation[] mutations, long nonceGroup, long nonce)
2864       throws IOException {
2865     // As it stands, this is used for two things:
2866     //  * batchMutate with single mutation - put/delete, separate or from checkAndMutate.
2867     //  * coprocessor calls (see ex. BulkDeleteEndpoint).
2868     // So nonces are not really ever used by HBase. They could be by coprocs, and checkAnd...
2869     return batchMutate(new MutationBatch(mutations, nonceGroup, nonce));
2870   }
2871 
2872   public OperationStatus[] batchMutate(Mutation[] mutations) throws IOException {
2873     return batchMutate(mutations, HConstants.NO_NONCE, HConstants.NO_NONCE);
2874   }
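
A hedged sketch of calling the nonce-less batchMutate() overload with a mixed Put/Delete batch and checking the per-operation status codes; all names are placeholders:

    Put p = new Put(Bytes.toBytes("r1"));
    p.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("v"));
    Delete d = new Delete(Bytes.toBytes("r2"));

    OperationStatus[] statuses = region.batchMutate(new Mutation[] { p, d });
    for (OperationStatus status : statuses) {
      if (status.getOperationStatusCode() != OperationStatusCode.SUCCESS) {
        // e.g. BAD_FAMILY or SANITY_CHECK_FAILURE; log or surface the failure
      }
    }
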
2875 
2876   @Override
2877   public OperationStatus[] batchReplay(MutationReplay[] mutations, long replaySeqId)
2878       throws IOException {
2879     if (!RegionReplicaUtil.isDefaultReplica(getRegionInfo())
2880         && replaySeqId < lastReplayedOpenRegionSeqId) {
2881       // if it is a secondary replica we should ignore these entries silently
2882       // since they are coming out of order
2883       if (LOG.isTraceEnabled()) {
2884         LOG.trace(getRegionInfo().getEncodedName() + " : "
2885           + "Skipping " + mutations.length + " mutations with replaySeqId=" + replaySeqId
2886           + " which is smaller than lastReplayedOpenRegionSeqId=" + lastReplayedOpenRegionSeqId);
2887         for (MutationReplay mut : mutations) {
2888           LOG.trace(getRegionInfo().getEncodedName() + " : Skipping : " + mut.mutation);
2889         }
2890       }
2891 
2892       OperationStatus[] statuses = new OperationStatus[mutations.length];
2893       for (int i = 0; i < statuses.length; i++) {
2894         statuses[i] = OperationStatus.SUCCESS;
2895       }
2896       return statuses;
2897     }
2898     return batchMutate(new ReplayBatch(mutations, replaySeqId));
2899   }
2900 
2901   /**
2902    * Perform a batch of mutations.
2903    * It supports only Put and Delete mutations; any other mutation type is rejected.
2904    * @param batchOp contains the list of mutations
2905    * @return an array of OperationStatus which internally contains the
2906    *         OperationStatusCode and the exceptionMessage if any.
2907    * @throws IOException
2908    */
2909   OperationStatus[] batchMutate(BatchOperationInProgress<?> batchOp) throws IOException {
2910     boolean initialized = false;
2911     Operation op = batchOp.isInReplay() ? Operation.REPLAY_BATCH_MUTATE : Operation.BATCH_MUTATE;
2912     startRegionOperation(op);
2913     try {
2914       while (!batchOp.isDone()) {
2915         if (!batchOp.isInReplay()) {
2916           checkReadOnly();
2917         }
2918         checkResources();
2919 
2920         if (!initialized) {
2921           this.writeRequestsCount.add(batchOp.operations.length);
2922           if (!batchOp.isInReplay()) {
2923             doPreMutationHook(batchOp);
2924           }
2925           initialized = true;
2926         }
2927         long addedSize = doMiniBatchMutation(batchOp);
2928         long newSize = this.addAndGetGlobalMemstoreSize(addedSize);
2929         if (isFlushSize(newSize)) {
2930           requestFlush();
2931         }
2932       }
2933     } finally {
2934       closeRegionOperation(op);
2935     }
2936     return batchOp.retCodeDetails;
2937   }
2938 
2939 
2940   private void doPreMutationHook(BatchOperationInProgress<?> batchOp)
2941       throws IOException {
2942     /* Run coprocessor pre hook outside of locks to avoid deadlock */
2943     WALEdit walEdit = new WALEdit();
2944     if (coprocessorHost != null) {
2945       for (int i = 0 ; i < batchOp.operations.length; i++) {
2946         Mutation m = batchOp.getMutation(i);
2947         if (m instanceof Put) {
2948           if (coprocessorHost.prePut((Put) m, walEdit, m.getDurability())) {
2949             // pre hook says skip this Put
2950             // mark as success and skip in doMiniBatchMutation
2951             batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
2952           }
2953         } else if (m instanceof Delete) {
2954           Delete curDel = (Delete) m;
2955           if (curDel.getFamilyCellMap().isEmpty()) {
2956             // handle deleting a row case
2957             prepareDelete(curDel);
2958           }
2959           if (coprocessorHost.preDelete(curDel, walEdit, m.getDurability())) {
2960             // pre hook says skip this Delete
2961             // mark as success and skip in doMiniBatchMutation
2962             batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
2963           }
2964         } else {
2965           // In case of passing Append mutations along with the Puts and Deletes in batchMutate
2966           // mark the operation return code as failure so that it will not be considered in
2967           // the doMiniBatchMutation
2968           batchOp.retCodeDetails[i] = new OperationStatus(OperationStatusCode.FAILURE,
2969               "Put/Delete mutations only supported in batchMutate() now");
2970         }
2971         if (!walEdit.isEmpty()) {
2972           batchOp.walEditsFromCoprocessors[i] = walEdit;
2973           walEdit = new WALEdit();
2974         }
2975       }
2976     }
2977   }
2978 
2979   @SuppressWarnings("unchecked")
2980   private long doMiniBatchMutation(BatchOperationInProgress<?> batchOp) throws IOException {
2981     boolean isInReplay = batchOp.isInReplay();
2982     // variable to note if all Put items are for the same CF -- metrics related
2983     boolean putsCfSetConsistent = true;
2984     //The set of columnFamilies first seen for Put.
2985     Set<byte[]> putsCfSet = null;
2986     // variable to note if all Delete items are for the same CF -- metrics related
2987     boolean deletesCfSetConsistent = true;
2988     //The set of columnFamilies first seen for Delete.
2989     Set<byte[]> deletesCfSet = null;
2990 
2991     long currentNonceGroup = HConstants.NO_NONCE, currentNonce = HConstants.NO_NONCE;
2992     WALEdit walEdit = new WALEdit(isInReplay);
2993     MultiVersionConcurrencyControl.WriteEntry writeEntry = null;
2994     long txid = 0;
2995     boolean doRollBackMemstore = false;
2996     boolean locked = false;
2997 
2998     /** Keep track of the locks we hold so we can release them in finally clause */
2999     List<RowLock> acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.operations.length);
3000     // reference family maps directly so coprocessors can mutate them if desired
3001     Map<byte[], List<Cell>>[] familyMaps = new Map[batchOp.operations.length];
3002     // We try to set up a batch in the range [firstIndex,lastIndexExclusive)
3003     int firstIndex = batchOp.nextIndexToProcess;
3004     int lastIndexExclusive = firstIndex;
3005     boolean success = false;
3006     int noOfPuts = 0, noOfDeletes = 0;
3007     WALKey walKey = null;
3008     long mvccNum = 0;
3009     try {
3010       // ------------------------------------
3011       // STEP 1. Try to acquire as many locks as we can, and ensure
3012       // we acquire at least one.
3013       // ----------------------------------
3014       int numReadyToWrite = 0;
3015       long now = EnvironmentEdgeManager.currentTime();
3016       while (lastIndexExclusive < batchOp.operations.length) {
3017         Mutation mutation = batchOp.getMutation(lastIndexExclusive);
3018         boolean isPutMutation = mutation instanceof Put;
3019 
3020         Map<byte[], List<Cell>> familyMap = mutation.getFamilyCellMap();
3021         // store the family map reference to allow for mutations
3022         familyMaps[lastIndexExclusive] = familyMap;
3023 
3024         // skip anything that "ran" already
3025         if (batchOp.retCodeDetails[lastIndexExclusive].getOperationStatusCode()
3026             != OperationStatusCode.NOT_RUN) {
3027           lastIndexExclusive++;
3028           continue;
3029         }
3030 
3031         try {
3032           if (isPutMutation) {
3033             // Check the families in the put. If bad, skip this one.
3034             if (isInReplay) {
3035               removeNonExistentColumnFamilyForReplay(familyMap);
3036             } else {
3037               checkFamilies(familyMap.keySet());
3038             }
3039             checkTimestamps(mutation.getFamilyCellMap(), now);
3040           } else {
3041             prepareDelete((Delete) mutation);
3042           }
3043           checkRow(mutation.getRow(), "doMiniBatchMutation");
3044         } catch (NoSuchColumnFamilyException nscf) {
3045           LOG.warn("No such column family in batch mutation", nscf);
3046           batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
3047               OperationStatusCode.BAD_FAMILY, nscf.getMessage());
3048           lastIndexExclusive++;
3049           continue;
3050         } catch (FailedSanityCheckException fsce) {
3051           LOG.warn("Batch Mutation did not pass sanity check", fsce);
3052           batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
3053               OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage());
3054           lastIndexExclusive++;
3055           continue;
3056         } catch (WrongRegionException we) {
3057           LOG.warn("Batch mutation had a row that does not belong to this region", we);
3058           batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
3059               OperationStatusCode.SANITY_CHECK_FAILURE, we.getMessage());
3060           lastIndexExclusive++;
3061           continue;
3062         }
3063 
3064         // If we haven't got any rows in our batch, we should block to
3065         // get the next one.
3066         RowLock rowLock = null;
3067         try {
3068           rowLock = getRowLock(mutation.getRow(), true);
3069         } catch (IOException ioe) {
3070           LOG.warn("Failed getting lock in batch put, row="
3071             + Bytes.toStringBinary(mutation.getRow()), ioe);
3072         }
3073         if (rowLock == null) {
3074           // We failed to grab another lock
3075           break; // stop acquiring more rows for this batch
3076         } else {
3077           acquiredRowLocks.add(rowLock);
3078         }
3079 
3080         lastIndexExclusive++;
3081         numReadyToWrite++;
3082 
3083         if (isPutMutation) {
3084           // If Column Families stay consistent through out all of the
3085           // If column families stay consistent throughout all of the
3086           // individual puts then metrics can be reported as a multiput across
3087           // column families in the first put.
3088             putsCfSet = mutation.getFamilyCellMap().keySet();
3089           } else {
3090             putsCfSetConsistent = putsCfSetConsistent
3091                 && mutation.getFamilyCellMap().keySet().equals(putsCfSet);
3092           }
3093         } else {
3094           if (deletesCfSet == null) {
3095             deletesCfSet = mutation.getFamilyCellMap().keySet();
3096           } else {
3097             deletesCfSetConsistent = deletesCfSetConsistent
3098                 && mutation.getFamilyCellMap().keySet().equals(deletesCfSet);
3099           }
3100         }
3101       }
3102 
3103       // we should record the timestamp only after we have acquired the rowLock,
3104       // otherwise, newer puts/deletes are not guaranteed to have a newer timestamp
3105       now = EnvironmentEdgeManager.currentTime();
3106       byte[] byteNow = Bytes.toBytes(now);
3107 
3108       // Nothing to put/delete -- an exception in the above such as NoSuchColumnFamily?
3109       if (numReadyToWrite <= 0) return 0L;
3110 
3111       // We've now grabbed as many mutations off the list as we can
3112 
3113       // ------------------------------------
3114       // STEP 2. Update any LATEST_TIMESTAMP timestamps
3115       // ----------------------------------
3116       for (int i = firstIndex; !isInReplay && i < lastIndexExclusive; i++) {
3117         // skip invalid
3118         if (batchOp.retCodeDetails[i].getOperationStatusCode()
3119             != OperationStatusCode.NOT_RUN) continue;
3120 
3121         Mutation mutation = batchOp.getMutation(i);
3122         if (mutation instanceof Put) {
3123           updateCellTimestamps(familyMaps[i].values(), byteNow);
3124           noOfPuts++;
3125         } else {
3126           prepareDeleteTimestamps(mutation, familyMaps[i], byteNow);
3127           noOfDeletes++;
3128         }
3129         rewriteCellTags(familyMaps[i], mutation);
3130       }
3131 
3132       lock(this.updatesLock.readLock(), numReadyToWrite);
3133       locked = true;
3134 
3135       // calling the pre CP hook for batch mutation
3136       if (!isInReplay && coprocessorHost != null) {
3137         MiniBatchOperationInProgress<Mutation> miniBatchOp =
3138           new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
3139           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
3140         if (coprocessorHost.preBatchMutate(miniBatchOp)) return 0L;
3141       }
3142 
3143       // ------------------------------------
3144       // STEP 3. Build WAL edit
3145       // ----------------------------------
3146       Durability durability = Durability.USE_DEFAULT;
3147       for (int i = firstIndex; i < lastIndexExclusive; i++) {
3148         // Skip puts that were determined to be invalid during preprocessing
3149         if (batchOp.retCodeDetails[i].getOperationStatusCode() != OperationStatusCode.NOT_RUN) {
3150           continue;
3151         }
3152 
3153         Mutation m = batchOp.getMutation(i);
3154         Durability tmpDur = getEffectiveDurability(m.getDurability());
3155         if (tmpDur.ordinal() > durability.ordinal()) {
3156           durability = tmpDur;
3157         }
3158         if (tmpDur == Durability.SKIP_WAL) {
3159           recordMutationWithoutWal(m.getFamilyCellMap());
3160           continue;
3161         }
3162 
3163         long nonceGroup = batchOp.getNonceGroup(i), nonce = batchOp.getNonce(i);
3164         // In replay, the batch may contain multiple nonces. If so, write WALEdit for each.
3165         // Given how nonces are originally written, these should be contiguous.
3166         // They don't have to be; it will still work, just writing more WALEdits than needed.
3167         if (nonceGroup != currentNonceGroup || nonce != currentNonce) {
3168           if (walEdit.size() > 0) {
3169             assert isInReplay;
3170             if (!isInReplay) {
3171               throw new IOException("Multiple nonces per batch and not in replay");
3172             }
3173             // txid should always increase, so having the one from the last call is ok.
3174             // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
3175             walKey = new ReplayHLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
3176               this.htableDescriptor.getTableName(), now, m.getClusterIds(),
3177               currentNonceGroup, currentNonce, mvcc);
3178             txid = this.wal.append(this.htableDescriptor,  this.getRegionInfo(),  walKey,
3179               walEdit, true);
3180             walEdit = new WALEdit(isInReplay);
3181             walKey = null;
3182           }
3183           currentNonceGroup = nonceGroup;
3184           currentNonce = nonce;
3185         }
3186 
3187         // Add WAL edits by CP
3188         WALEdit fromCP = batchOp.walEditsFromCoprocessors[i];
3189         if (fromCP != null) {
3190           for (Cell cell : fromCP.getCells()) {
3191             walEdit.add(cell);
3192           }
3193         }
3194         addFamilyMapToWALEdit(familyMaps[i], walEdit);
3195       }
3196 
3197       // -------------------------
3198       // STEP 4. Append the final edit to WAL. Do not sync wal.
3199       // -------------------------
3200       Mutation mutation = batchOp.getMutation(firstIndex);
3201       if (isInReplay) {
3202         // use wal key from the original
3203         walKey = new ReplayHLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
3204           this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now,
3205           mutation.getClusterIds(), currentNonceGroup, currentNonce, mvcc);
3206         long replaySeqId = batchOp.getReplaySequenceId();
3207         walKey.setOrigLogSeqNum(replaySeqId);
3208       }
3209       if (walEdit.size() > 0) {
3210         if (!isInReplay) {
3211         // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
3212         walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
3213             this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now,
3214             mutation.getClusterIds(), currentNonceGroup, currentNonce, mvcc);
3215         }
3216         txid = this.wal.append(this.htableDescriptor, this.getRegionInfo(), walKey, walEdit, true);
3217       }
3218       // ------------------------------------
3219       // Acquire the latest mvcc number
3220       // ----------------------------------
3221       if (walKey == null) {
3222         // If this is a skip wal operation just get the read point from mvcc
3223         walKey = this.appendEmptyEdit(this.wal);
3224       }
3225       if (!isInReplay) {
3226         writeEntry = walKey.getWriteEntry();
3227         mvccNum = writeEntry.getWriteNumber();
3228       } else {
3229         mvccNum = batchOp.getReplaySequenceId();
3230       }
3231 
3232       // ------------------------------------
3233       // STEP 5. Write back to memstore
3234       // Write to memstore. It is ok to write to memstore
3235       // first without syncing the WAL because we do not roll
3236       // forward the memstore MVCC. The MVCC will be moved up when
3237       // the complete operation is done. These changes are not yet
3238       // visible to scanners till we update the MVCC. The MVCC is
3239       // moved only when the sync is complete.
3240       // ----------------------------------
3241       long addedSize = 0;
3242       for (int i = firstIndex; i < lastIndexExclusive; i++) {
3243         if (batchOp.retCodeDetails[i].getOperationStatusCode()
3244             != OperationStatusCode.NOT_RUN) {
3245           continue;
3246         }
3247         doRollBackMemstore = true; // If we have a failure, we need to clean what we wrote
3248         addedSize += applyFamilyMapToMemstore(familyMaps[i], mvccNum, isInReplay);
3249       }
3250 
3251       // -------------------------------
3252       // STEP 6. Release row locks, etc.
3253       // -------------------------------
3254       if (locked) {
3255         this.updatesLock.readLock().unlock();
3256         locked = false;
3257       }
3258       releaseRowLocks(acquiredRowLocks);
3259 
3260       // -------------------------
3261       // STEP 7. Sync wal.
3262       // -------------------------
3263       if (txid != 0) {
3264         syncOrDefer(txid, durability);
3265       }
3266 
3267       doRollBackMemstore = false;
3268       // calling the post CP hook for batch mutation
3269       if (!isInReplay && coprocessorHost != null) {
3270         MiniBatchOperationInProgress<Mutation> miniBatchOp =
3271           new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
3272           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
3273         coprocessorHost.postBatchMutate(miniBatchOp);
3274       }
3275 
3276       // ------------------------------------------------------------------
3277       // STEP 8. Advance mvcc. This will make this put visible to scanners and getters.
3278       // ------------------------------------------------------------------
3279       if (writeEntry != null) {
3280         mvcc.completeAndWait(writeEntry);
3281         writeEntry = null;
3282       } else if (isInReplay) {
3283         // ensure that the sequence id of the region is at least as big as orig log seq id
3284         mvcc.advanceTo(mvccNum);
3285       }
3286 
3287       for (int i = firstIndex; i < lastIndexExclusive; i ++) {
3288         if (batchOp.retCodeDetails[i] == OperationStatus.NOT_RUN) {
3289           batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
3290         }
3291       }
3292 
3293       // ------------------------------------
3294       // STEP 9. Run coprocessor post hooks. This should be done after the wal is
3295       // synced so that the coprocessor contract is adhered to.
3296       // ------------------------------------
3297       if (!isInReplay && coprocessorHost != null) {
3298         for (int i = firstIndex; i < lastIndexExclusive; i++) {
3299           // only for successful puts
3300           if (batchOp.retCodeDetails[i].getOperationStatusCode()
3301               != OperationStatusCode.SUCCESS) {
3302             continue;
3303           }
3304           Mutation m = batchOp.getMutation(i);
3305           if (m instanceof Put) {
3306             coprocessorHost.postPut((Put) m, walEdit, m.getDurability());
3307           } else {
3308             coprocessorHost.postDelete((Delete) m, walEdit, m.getDurability());
3309           }
3310         }
3311       }
3312 
3313       success = true;
3314       return addedSize;
3315     } finally {
3316       // if the wal sync was unsuccessful, remove keys from memstore
3317       if (doRollBackMemstore) {
3318         for (int j = 0; j < familyMaps.length; j++) {
3319           for(List<Cell> cells:familyMaps[j].values()) {
3320             rollbackMemstore(cells);
3321           }
3322         }
3323         if (writeEntry != null) mvcc.complete(writeEntry);
3324       } else if (writeEntry != null) {
3325         mvcc.completeAndWait(writeEntry);
3326       }
3327 
3328       if (locked) {
3329         this.updatesLock.readLock().unlock();
3330       }
3331       releaseRowLocks(acquiredRowLocks);
3332 
3333       // See if the column families were consistent through the whole thing.
3334       // If they were, then keep them. If they were not, then pass a null.
3335       // Null will be treated as unknown.
3336       // Total time taken might be involving Puts and Deletes.
3337       // Split the time for puts and deletes based on the total number of Puts and Deletes.
3338 
3339       if (noOfPuts > 0) {
3340         // There were some Puts in the batch.
3341         if (this.metricsRegion != null) {
3342           this.metricsRegion.updatePut();
3343         }
3344       }
3345       if (noOfDeletes > 0) {
3346         // There were some Deletes in the batch.
3347         if (this.metricsRegion != null) {
3348           this.metricsRegion.updateDelete();
3349         }
3350       }
3351       if (!success) {
3352         for (int i = firstIndex; i < lastIndexExclusive; i++) {
3353           if (batchOp.retCodeDetails[i].getOperationStatusCode() == OperationStatusCode.NOT_RUN) {
3354             batchOp.retCodeDetails[i] = OperationStatus.FAILURE;
3355           }
3356         }
3357       }
3358       if (coprocessorHost != null && !batchOp.isInReplay()) {
3359         // call the coprocessor hook to do any finalization steps
3360         // after the put is done
3361         MiniBatchOperationInProgress<Mutation> miniBatchOp =
3362             new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
3363                 batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex,
3364                 lastIndexExclusive);
3365         coprocessorHost.postBatchMutateIndispensably(miniBatchOp, success);
3366       }
3367 
3368       batchOp.nextIndexToProcess = lastIndexExclusive;
3369     }
3370   }
3371 
3372   /**
3373    * Returns effective durability from the passed durability and
3374    * the table descriptor.
3375    */
3376   protected Durability getEffectiveDurability(Durability d) {
3377     return d == Durability.USE_DEFAULT ? this.durability : d;
3378   }
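
Per-mutation durability feeds into getEffectiveDurability(): USE_DEFAULT falls back to the table's configured durability, anything else wins for that mutation. A hedged client-side sketch (names are placeholders):

    Put asyncPut = new Put(Bytes.toBytes("r1"));
    asyncPut.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("v"));
    asyncPut.setDurability(Durability.ASYNC_WAL);  // overrides the table default for this Put
    region.put(asyncPut);
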
3379 
3380   //TODO, Think that gets/puts and deletes should be refactored a bit so that
3381   //the getting of the lock happens before, so that you would just pass it into
3382   //the methods. So in the case of checkAndMutate you could just do lockRow,
3383   //get, put, unlockRow or something
3384 
3385   @Override
3386   public boolean checkAndMutate(byte [] row, byte [] family, byte [] qualifier,
3387       CompareOp compareOp, ByteArrayComparable comparator, Mutation w,
3388       boolean writeToWAL)
3389   throws IOException{
3390     checkReadOnly();
3391     //TODO, add check for value length or maybe even better move this to the
3392     //client if this becomes a global setting
3393     checkResources();
3394     boolean isPut = w instanceof Put;
3395     if (!isPut && !(w instanceof Delete))
3396       throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action must " +
3397           "be Put or Delete");
3398     if (!Bytes.equals(row, w.getRow())) {
3399       throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action's " +
3400           "getRow must match the passed row");
3401     }
3402 
3403     startRegionOperation();
3404     try {
3405       Get get = new Get(row);
3406       checkFamily(family);
3407       get.addColumn(family, qualifier);
3408 
3409       // Lock row - note that doBatchMutate will relock this row if called
3410       RowLock rowLock = getRowLock(get.getRow());
3411       // wait for all previous transactions to complete (with lock held)
3412       mvcc.await();
3413       try {
3414         if (this.getCoprocessorHost() != null) {
3415           Boolean processed = null;
3416           if (w instanceof Put) {
3417             processed = this.getCoprocessorHost().preCheckAndPutAfterRowLock(row, family,
3418                 qualifier, compareOp, comparator, (Put) w);
3419           } else if (w instanceof Delete) {
3420             processed = this.getCoprocessorHost().preCheckAndDeleteAfterRowLock(row, family,
3421                 qualifier, compareOp, comparator, (Delete) w);
3422           }
3423           if (processed != null) {
3424             return processed;
3425           }
3426         }
3427         List<Cell> result = get(get, false);
3428 
3429         boolean valueIsNull = comparator.getValue() == null ||
3430           comparator.getValue().length == 0;
3431         boolean matches = false;
3432         long cellTs = 0;
3433         if (result.size() == 0 && valueIsNull) {
3434           matches = true;
3435         } else if (result.size() > 0 && result.get(0).getValueLength() == 0 &&
3436             valueIsNull) {
3437           matches = true;
3438           cellTs = result.get(0).getTimestamp();
3439         } else if (result.size() == 1 && !valueIsNull) {
3440           Cell kv = result.get(0);
3441           cellTs = kv.getTimestamp();
3442           int compareResult = CellComparator.compareValue(kv, comparator);
3443           switch (compareOp) {
3444           case LESS:
3445             matches = compareResult < 0;
3446             break;
3447           case LESS_OR_EQUAL:
3448             matches = compareResult <= 0;
3449             break;
3450           case EQUAL:
3451             matches = compareResult == 0;
3452             break;
3453           case NOT_EQUAL:
3454             matches = compareResult != 0;
3455             break;
3456           case GREATER_OR_EQUAL:
3457             matches = compareResult >= 0;
3458             break;
3459           case GREATER:
3460             matches = compareResult > 0;
3461             break;
3462           default:
3463             throw new RuntimeException("Unknown Compare op " + compareOp.name());
3464           }
3465         }
3466         //If matches put the new put or delete the new delete
3467         if (matches) {
3468           // We have acquired the row lock already. If the system clock is NOT monotonically
3469           // non-decreasing (see HBASE-14070) we should make sure that the mutation has a
3470           // larger timestamp than what was observed via Get. doBatchMutate already does this, but
3471           // there is no way to pass the cellTs. See HBASE-14054.
3472           long now = EnvironmentEdgeManager.currentTime();
3473           long ts = Math.max(now, cellTs); // ensure write is not eclipsed
3474           byte[] byteTs = Bytes.toBytes(ts);
3475 
3476           if (w instanceof Put) {
3477             updateCellTimestamps(w.getFamilyCellMap().values(), byteTs);
3478           }
3479           // else delete is not needed since it already does a second get, and sets the timestamp
3480           // from get (see prepareDeleteTimestamps).
3481 
3482           // All edits for the given row (across all column families) must
3483           // happen atomically.
3484           doBatchMutate(w);
3485           this.checkAndMutateChecksPassed.increment();
3486           return true;
3487         }
3488         this.checkAndMutateChecksFailed.increment();
3489         return false;
3490       } finally {
3491         rowLock.release();
3492       }
3493     } finally {
3494       closeRegionOperation();
3495     }
3496   }
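
A minimal sketch of an atomic check-and-put through checkAndMutate(); CompareOp here is CompareFilter.CompareOp and BinaryComparator comes from org.apache.hadoop.hbase.filter. The region variable and all row/family/qualifier/value bytes are placeholders:

    byte[] row = Bytes.toBytes("r1");
    byte[] cf = Bytes.toBytes("cf");
    byte[] q = Bytes.toBytes("q");

    Put put = new Put(row);
    put.addColumn(cf, q, Bytes.toBytes("new-value"));

    boolean applied = region.checkAndMutate(row, cf, q, CompareOp.EQUAL,
        new BinaryComparator(Bytes.toBytes("expected")), put, true);
    if (!applied) {
      // the stored value did not equal "expected", so the Put was not applied
    }
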
3497 
3498   //TODO, Think that gets/puts and deletes should be refactored a bit so that
3499   //the getting of the lock happens before, so that you would just pass it into
3500   //the methods. So in the case of checkAndMutate you could just do lockRow,
3501   //get, put, unlockRow or something
3502 
3503   @Override
3504   public boolean checkAndRowMutate(byte [] row, byte [] family, byte [] qualifier,
3505       CompareOp compareOp, ByteArrayComparable comparator, RowMutations rm,
3506       boolean writeToWAL) throws IOException {
3507     checkReadOnly();
3508     //TODO, add check for value length or maybe even better move this to the
3509     //client if this becomes a global setting
3510     checkResources();
3511 
3512     startRegionOperation();
3513     try {
3514       Get get = new Get(row);
3515       checkFamily(family);
3516       get.addColumn(family, qualifier);
3517 
3518       // Lock row - note that doBatchMutate will relock this row if called
3519       RowLock rowLock = getRowLock(get.getRow());
3520       // wait for all previous transactions to complete (with lock held)
3521       mvcc.await();
3522       try {
3523         List<Cell> result = get(get, false);
3524 
3525         boolean valueIsNull = comparator.getValue() == null ||
3526             comparator.getValue().length == 0;
3527         boolean matches = false;
3528         long cellTs = 0;
3529         if (result.size() == 0 && valueIsNull) {
3530           matches = true;
3531         } else if (result.size() > 0 && result.get(0).getValueLength() == 0 &&
3532             valueIsNull) {
3533           matches = true;
3534           cellTs = result.get(0).getTimestamp();
3535         } else if (result.size() == 1 && !valueIsNull) {
3536           Cell kv = result.get(0);
3537           cellTs = kv.getTimestamp();
3538           int compareResult = CellComparator.compareValue(kv, comparator);
3539           switch (compareOp) {
3540           case LESS:
3541             matches = compareResult < 0;
3542             break;
3543           case LESS_OR_EQUAL:
3544             matches = compareResult <= 0;
3545             break;
3546           case EQUAL:
3547             matches = compareResult == 0;
3548             break;
3549           case NOT_EQUAL:
3550             matches = compareResult != 0;
3551             break;
3552           case GREATER_OR_EQUAL:
3553             matches = compareResult >= 0;
3554             break;
3555           case GREATER:
3556             matches = compareResult > 0;
3557             break;
3558           default:
3559             throw new RuntimeException("Unknown Compare op " + compareOp.name());
3560           }
3561         }
3562         //If matches put the new put or delete the new delete
3563         if (matches) {
3564           // We have acquired the row lock already. If the system clock is NOT monotonically
3565           // non-decreasing (see HBASE-14070) we should make sure that the mutation has a
3566           // larger timestamp than what was observed via Get. doBatchMutate already does this, but
3567           // there is no way to pass the cellTs. See HBASE-14054.
3568           long now = EnvironmentEdgeManager.currentTime();
3569           long ts = Math.max(now, cellTs); // ensure write is not eclipsed
3570           byte[] byteTs = Bytes.toBytes(ts);
3571 
3572           for (Mutation w : rm.getMutations()) {
3573             if (w instanceof Put) {
3574               updateCellTimestamps(w.getFamilyCellMap().values(), byteTs);
3575             }
3576             // else delete is not needed since it already does a second get, and sets the timestamp
3577             // from get (see prepareDeleteTimestamps).
3578           }
3579 
3580           // All edits for the given row (across all column families) must
3581           // happen atomically.
3582           mutateRow(rm);
3583           this.checkAndMutateChecksPassed.increment();
3584           return true;
3585         }
3586         this.checkAndMutateChecksFailed.increment();
3587         return false;
3588       } finally {
3589         rowLock.release();
3590       }
3591     } finally {
3592       closeRegionOperation();
3593     }
3594   }
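
checkAndRowMutate() extends the same pattern to a RowMutations bundle, applying several mutations to one row atomically if the guard column matches. A hedged sketch with placeholder names:

    byte[] row = Bytes.toBytes("r1");
    RowMutations rm = new RowMutations(row);

    Put put = new Put(row);
    put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("state"), Bytes.toBytes("done"));
    rm.add(put);

    Delete del = new Delete(row);
    del.addColumns(Bytes.toBytes("cf"), Bytes.toBytes("pending"));
    rm.add(del);

    boolean applied = region.checkAndRowMutate(row, Bytes.toBytes("cf"), Bytes.toBytes("state"),
        CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes("in-progress")), rm, true);
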
3595 
3596   private void doBatchMutate(Mutation mutation) throws IOException {
3597     // Currently this is only called for puts and deletes, so no nonces.
3598     OperationStatus[] batchMutate = this.batchMutate(new Mutation[]{mutation});
3599     if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.SANITY_CHECK_FAILURE)) {
3600       throw new FailedSanityCheckException(batchMutate[0].getExceptionMsg());
3601     } else if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.BAD_FAMILY)) {
3602       throw new NoSuchColumnFamilyException(batchMutate[0].getExceptionMsg());
3603     }
3604   }
3605 
3606   /**
3607    * Complete taking the snapshot on the region. Writes the region info and adds references to the
3608    * working snapshot directory.
3609    *
3610    * TODO for api consistency, consider adding another version with no {@link ForeignExceptionSnare}
3611    * arg.  (In the future other cancellable HRegion methods could eventually add a
3612    * {@link ForeignExceptionSnare}, or we could do something fancier).
3613    *
3614    * @param desc snapshot description object
3615    * @param exnSnare ForeignExceptionSnare that captures external exceptions in case we need to
3616    *   bail out.  This is allowed to be null and will just be ignored in that case.
3617    * @throws IOException if there is an external or internal error causing the snapshot to fail
3618    */
3619   public void addRegionToSnapshot(SnapshotDescription desc,
3620       ForeignExceptionSnare exnSnare) throws IOException {
3621     Path rootDir = FSUtils.getRootDir(conf);
3622     Path snapshotDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(desc, rootDir);
3623 
3624     SnapshotManifest manifest = SnapshotManifest.create(conf, getFilesystem(),
3625             snapshotDir, desc, exnSnare);
3626     manifest.addRegion(this);
3627 
3628     // The regionserver holding the first region of the table is responsible for taking the
3629     // manifest of the mob dir.
3630     if (!Bytes.equals(getRegionInfo().getStartKey(), HConstants.EMPTY_START_ROW))
3631       return;
3632 
3633     // if any cf's have is mob enabled, add the "mob region" to the manifest.
3634     List<Store> stores = getStores();
3635     for (Store store : stores) {
3636       boolean hasMobStore = store.getFamily().isMobEnabled();
3637       if (hasMobStore) {
3638         // use the .mob as the start key and 0 as the regionid
3639         HRegionInfo mobRegionInfo = MobUtils.getMobRegionInfo(this.getTableDesc().getTableName());
3640         mobRegionInfo.setOffline(true);
3641         manifest.addMobRegion(mobRegionInfo, this.getTableDesc().getColumnFamilies());
3642         return;
3643       }
3644     }
3645   }
3646 
3647   @Override
3648   public void updateCellTimestamps(final Iterable<List<Cell>> cellItr, final byte[] now)
3649       throws IOException {
3650     for (List<Cell> cells: cellItr) {
3651       if (cells == null) continue;
3652       assert cells instanceof RandomAccess;
3653       int listSize = cells.size();
3654       for (int i = 0; i < listSize; i++) {
3655         CellUtil.updateLatestStamp(cells.get(i), now, 0);
3656       }
3657     }
3658   }
3659 
3660   /**
3661    * Possibly rewrite incoming cell tags.
3662    */
3663   void rewriteCellTags(Map<byte[], List<Cell>> familyMap, final Mutation m) {
3664     // Check if we have any work to do and early out otherwise
3665     // Update these checks as more logic is added here
3666 
3667     if (m.getTTL() == Long.MAX_VALUE) {
3668       return;
3669     }
3670 
3671     // From this point we know we have some work to do
3672 
3673     for (Map.Entry<byte[], List<Cell>> e: familyMap.entrySet()) {
3674       List<Cell> cells = e.getValue();
3675       assert cells instanceof RandomAccess;
3676       int listSize = cells.size();
3677       for (int i = 0; i < listSize; i++) {
3678         Cell cell = cells.get(i);
3679         List<Tag> newTags = new ArrayList<Tag>();
3680         Iterator<Tag> tagIterator = CellUtil.tagsIterator(cell);
3681 
3682         // Carry forward existing tags
3683 
3684         while (tagIterator.hasNext()) {
3685 
3686           // Add any filters or tag specific rewrites here
3687 
3688           newTags.add(tagIterator.next());
3689         }
3690 
3691         // Cell TTL handling
3692 
3693         // Check again if we need to add a cell TTL because early out logic
3694         // above may change when there are more tag based features in core.
3695         if (m.getTTL() != Long.MAX_VALUE) {
3696           // Add a cell TTL tag
3697           newTags.add(new ArrayBackedTag(TagType.TTL_TAG_TYPE, Bytes.toBytes(m.getTTL())));
3698         }
3699 
3700         // Rewrite the cell with the updated set of tags
3701         cells.set(i, new TagRewriteCell(cell, TagUtil.fromList(newTags)));
3702       }
3703     }
3704   }
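
The rewrite above is triggered by any mutation whose TTL is not Long.MAX_VALUE: each of its cells gets a TTL_TAG_TYPE tag carrying the TTL. A hedged sketch of what a client sets to end up on this path (names are placeholders):

    Put put = new Put(Bytes.toBytes("r1"));
    put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("v"));
    put.setTTL(TimeUnit.MINUTES.toMillis(10));   // per-mutation TTL, in milliseconds
    region.put(put);                             // cells arrive here and get a TTL tag
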
3705 
3706   /*
3707    * Check if we have the resources to support an update.
3708    *
3709    * We throw RegionTooBusyException if we are above the memstore limit
3710    * and expect the client to retry using some kind of backoff.
3711    */
3712   private void checkResources() throws RegionTooBusyException {
3713     // If catalog region, do not impose resource constraints or block updates.
3714     if (this.getRegionInfo().isMetaRegion()) return;
3715 
3716     if (this.memstoreSize.get() > this.blockingMemStoreSize) {
3717       blockedRequestsCount.increment();
3718       requestFlush();
3719       throw new RegionTooBusyException("Above memstore limit, " +
3720           "regionName=" + (this.getRegionInfo() == null ? "unknown" :
3721           this.getRegionInfo().getRegionNameAsString()) +
3722           ", server=" + (this.getRegionServerServices() == null ? "unknown" :
3723           this.getRegionServerServices().getServerName()) +
3724           ", memstoreSize=" + memstoreSize.get() +
3725           ", blockingMemStoreSize=" + blockingMemStoreSize);
3726     }
3727   }
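
RegionTooBusyException propagates to the caller, who is expected to back off and retry; the HBase client layer does this automatically, so the sketch below (with placeholder names and retry numbers) only makes the contract explicit:

    int attempts = 0;
    while (true) {
      try {
        region.put(put);
        break;
      } catch (RegionTooBusyException busy) {
        if (++attempts >= 5) throw busy;
        try {
          Thread.sleep(100L * attempts);       // simple linear backoff; tune as needed
        } catch (InterruptedException ie) {
          Thread.currentThread().interrupt();
          throw busy;
        }
      }
    }
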
3728 
3729   /**
3730    * @throws IOException Throws exception if region is in read-only mode.
3731    */
3732   protected void checkReadOnly() throws IOException {
3733     if (isReadOnly()) {
3734       throw new DoNotRetryIOException("region is read only");
3735     }
3736   }
3737 
3738   protected void checkReadsEnabled() throws IOException {
3739     if (!this.writestate.readsEnabled) {
3740       throw new IOException(getRegionInfo().getEncodedName()
3741         + ": The region's reads are disabled. Cannot serve the request");
3742     }
3743   }
3744 
3745   public void setReadsEnabled(boolean readsEnabled) {
3746    if (readsEnabled && !this.writestate.readsEnabled) {
3747      LOG.info(getRegionInfo().getEncodedName() + " : Enabling reads for region.");
3748     }
3749     this.writestate.setReadsEnabled(readsEnabled);
3750   }
3751 
3752   /**
3753    * Add updates first to the wal and then add values to memstore.
3754    * Warning: Assumption is caller has lock on passed in row.
3755    * @param edits Cell updates by column
3756    * @throws IOException
3757    */
3758   private void put(final byte [] row, byte [] family, List<Cell> edits)
3759   throws IOException {
3760     NavigableMap<byte[], List<Cell>> familyMap;
3761     familyMap = new TreeMap<byte[], List<Cell>>(Bytes.BYTES_COMPARATOR);
3762 
3763     familyMap.put(family, edits);
3764     Put p = new Put(row);
3765     p.setFamilyCellMap(familyMap);
3766     doBatchMutate(p);
3767   }
3768 
3769   /**
3770    * Atomically apply the given map of family->edits to the memstore.
3771    * This handles the consistency control on its own, but the caller
3772    * should already have locked updatesLock.readLock(). This also does
3773    * <b>not</b> check the families for validity.
3774    *
3775    * @param familyMap Map of kvs per family
3776    * @param mvccNum The MVCC for this transaction.
3777    * @param isInReplay true when adding replayed KVs into memstore
3778    * @return the additional memory usage of the memstore caused by the
3779    * new entries.
3780    */
3781   private long applyFamilyMapToMemstore(Map<byte[], List<Cell>> familyMap,
3782     long mvccNum, boolean isInReplay) throws IOException {
3783     long size = 0;
3784 
3785     for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
3786       byte[] family = e.getKey();
3787       List<Cell> cells = e.getValue();
3788       assert cells instanceof RandomAccess;
3789       Store store = getStore(family);
3790       int listSize = cells.size();
3791       for (int i=0; i < listSize; i++) {
3792         Cell cell = cells.get(i);
3793         if (cell.getSequenceId() == 0 || isInReplay) {
3794           CellUtil.setSequenceId(cell, mvccNum);
3795         }
3796         size += store.add(cell);
3797       }
3798     }
3799 
3800     return size;
3801   }
3802 
3803   /**
3804    * Remove all the keys listed in the map from the memstore. This method is
3805    * called when a Put/Delete has updated memstore but subsequently fails to update
3806    * the wal. This method is then invoked to rollback the memstore.
3807    */
3808   private void rollbackMemstore(List<Cell> memstoreCells) {
3809     int kvsRolledback = 0;
3810 
3811     for (Cell cell : memstoreCells) {
3812       byte[] family = CellUtil.cloneFamily(cell);
3813       Store store = getStore(family);
3814       store.rollback(cell);
3815       kvsRolledback++;
3816     }
3817     LOG.debug("rollbackMemstore rolled back " + kvsRolledback);
3818   }
3819 
3820   @Override
3821   public void checkFamilies(Collection<byte[]> families) throws NoSuchColumnFamilyException {
3822     for (byte[] family : families) {
3823       checkFamily(family);
3824     }
3825   }
3826 
3827   /**
3828    * During replay, there could exist column families which are removed between region server
3829    * failure and replay
3830    */
3831   private void removeNonExistentColumnFamilyForReplay(
3832       final Map<byte[], List<Cell>> familyMap) {
3833     List<byte[]> nonExistentList = null;
3834     for (byte[] family : familyMap.keySet()) {
3835       if (!this.htableDescriptor.hasFamily(family)) {
3836         if (nonExistentList == null) {
3837           nonExistentList = new ArrayList<byte[]>();
3838         }
3839         nonExistentList.add(family);
3840       }
3841     }
3842     if (nonExistentList != null) {
3843       for (byte[] family : nonExistentList) {
3844         // Perhaps schema was changed between crash and replay
3845         LOG.info("No family for " + Bytes.toString(family) + ", omitting it from replay.");
3846         familyMap.remove(family);
3847       }
3848     }
3849   }
3850 
3851   @Override
3852   public void checkTimestamps(final Map<byte[], List<Cell>> familyMap, long now)
3853       throws FailedSanityCheckException {
3854     if (timestampSlop == HConstants.LATEST_TIMESTAMP) {
3855       return;
3856     }
3857     long maxTs = now + timestampSlop;
3858     for (List<Cell> kvs : familyMap.values()) {
3859       assert kvs instanceof RandomAccess;
3860       int listSize  = kvs.size();
3861       for (int i=0; i < listSize; i++) {
3862         Cell cell = kvs.get(i);
3863         // see if the user-side TS is out of range. latest = server-side
3864         long ts = cell.getTimestamp();
3865         if (ts != HConstants.LATEST_TIMESTAMP && ts > maxTs) {
3866           throw new FailedSanityCheckException("Timestamp for KV out of range "
3867               + cell + " (too.new=" + timestampSlop + ")");
3868         }
3869       }
3870     }
3871   }
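       /*
        * Editor's note -- a small worked example of the slop arithmetic above; all numbers are
        * hypothetical.  With now = 100000 and timestampSlop = 2000:
        *
        *   long maxTs = 100000L + 2000L;   // 102000
        *   // a cell with timestamp 105000 (and != HConstants.LATEST_TIMESTAMP) fails the check,
        *   // since 105000 > 102000  ->  FailedSanityCheckException
        *   // a cell with timestamp HConstants.LATEST_TIMESTAMP always passes.
        */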
3872 
3873   /**
3874    * Append the given map of family->edits to a WALEdit data structure.
3875    * This does not write to the WAL itself.
3876    * @param familyMap map of family->edits
3877    * @param walEdit the destination entry to append into
3878    */
3879   private void addFamilyMapToWALEdit(Map<byte[], List<Cell>> familyMap,
3880       WALEdit walEdit) {
3881     for (List<Cell> edits : familyMap.values()) {
3882       assert edits instanceof RandomAccess;
3883       int listSize = edits.size();
3884       for (int i=0; i < listSize; i++) {
3885         Cell cell = edits.get(i);
3886         walEdit.add(cell);
3887       }
3888     }
3889   }
3890 
3891   private void requestFlush() {
3892     if (this.rsServices == null) {
3893       return;
3894     }
3895     synchronized (writestate) {
3896       if (this.writestate.isFlushRequested()) {
3897         return;
3898       }
3899       writestate.flushRequested = true;
3900     }
3901     // Make request outside of synchronize block; HBASE-818.
3902     this.rsServices.getFlushRequester().requestFlush(this, false);
3903     if (LOG.isDebugEnabled()) {
3904       LOG.debug("Flush requested on " + this.getRegionInfo().getEncodedName());
3905     }
3906   }
3907 
3908   /*
3909    * @param size
3910    * @return True if size is over the flush threshold
3911    */
3912   private boolean isFlushSize(final long size) {
3913     return size > this.memstoreFlushSize;
3914   }
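       /*
        * Editor's note -- illustrative sketch, not part of the original source.  memstoreFlushSize
        * is normally derived from the table descriptor or the "hbase.hregion.memstore.flush.size"
        * setting; the 128 MB default shown here is an assumption for the example.
        *
        *   long memstoreFlushSize = conf.getLong("hbase.hregion.memstore.flush.size",
        *       128L * 1024 * 1024);
        *   long size = 150L * 1024 * 1024;                  // hypothetical current memstore size
        *   boolean shouldFlush = size > memstoreFlushSize;  // true -> requestFlush()
        */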
3915 
3916   /**
3917    * Read the edits put under this region by wal splitting process.  Put
3918    * the recovered edits back up into this region.
3919    *
3920    * <p>We can ignore any wal message that has a sequence ID that's equal to or
3921    * lower than minSeqId.  (Because we know such messages are already
3922    * reflected in the HFiles.)
3923    *
3924    * <p>While this is running we are putting pressure on memory yet we are
3925    * outside of our usual accounting because we are not yet an onlined region
3926    * (this stuff is being run as part of Region initialization).  This means
3927    * that if we're up against global memory limits, we'll not be flagged to flush
3928    * because we are not online. We can't be flushed by the usual mechanisms anyway;
3929    * we're not yet online so our relative sequenceids are not yet aligned with
3930    * WAL sequenceids -- not till we come up online, post processing of split
3931    * edits.
3932    *
3933    * <p>But to help relieve memory pressure, we at least manage our own heap size by
3934    * flushing if we are in excess of per-region limits.  When flushing, though, we have
3935    * to be careful to avoid using the regionserver/wal sequenceid.  It is running
3936    * on a different timeline from what is going on here in this region context, so if we
3937    * crashed replaying these edits, but in the midst had a flush that used the
3938    * regionserver wal with a sequenceid in excess of what is going on here
3939    * in this region and its split editlogs, then we could miss edits the
3940    * next time we go to recover. So, we have to flush inline, using seqids that
3941    * make sense in this single region context only -- until we come online.
3942    *
3943    * @param maxSeqIdInStores Any edit found in split editlogs needs to be in excess of
3944    * the maxSeqId for the store to be applied, else it is skipped.
3945    * @return the sequence id of the last edit added to this region out of the
3946    * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
3947    * @throws IOException
3948    */
3949   protected long replayRecoveredEditsIfAny(final Path regiondir,
3950       Map<byte[], Long> maxSeqIdInStores,
3951       final CancelableProgressable reporter, final MonitoredTask status)
3952       throws IOException {
3953     long minSeqIdForTheRegion = -1;
3954     for (Long maxSeqIdInStore : maxSeqIdInStores.values()) {
3955       if (maxSeqIdInStore < minSeqIdForTheRegion || minSeqIdForTheRegion == -1) {
3956         minSeqIdForTheRegion = maxSeqIdInStore;
3957       }
3958     }
3959     long seqid = minSeqIdForTheRegion;
3960 
3961     FileSystem fs = this.fs.getFileSystem();
3962     NavigableSet<Path> files = WALSplitter.getSplitEditFilesSorted(fs, regiondir);
3963     if (LOG.isDebugEnabled()) {
3964       LOG.debug("Found " + (files == null ? 0 : files.size())
3965         + " recovered edits file(s) under " + regiondir);
3966     }
3967 
3968     if (files == null || files.isEmpty()) return seqid;
3969 
3970     for (Path edits: files) {
3971       if (edits == null || !fs.exists(edits)) {
3972         LOG.warn("Null or non-existent edits file: " + edits);
3973         continue;
3974       }
3975       if (isZeroLengthThenDelete(fs, edits)) continue;
3976 
3977       long maxSeqId;
3978       String fileName = edits.getName();
3979       maxSeqId = Math.abs(Long.parseLong(fileName));
3980       if (maxSeqId <= minSeqIdForTheRegion) {
3981         if (LOG.isDebugEnabled()) {
3982           String msg = "Maximum sequenceid for this wal is " + maxSeqId
3983             + " and minimum sequenceid for the region is " + minSeqIdForTheRegion
3984             + ", skipped the whole file, path=" + edits;
3985           LOG.debug(msg);
3986         }
3987         continue;
3988       }
3989 
3990       try {
3991         // replay the edits. Replay can return -1 if everything is skipped, only update
3992         // if seqId is greater
3993         seqid = Math.max(seqid, replayRecoveredEdits(edits, maxSeqIdInStores, reporter));
3994       } catch (IOException e) {
3995         boolean skipErrors = conf.getBoolean(
3996             HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS,
3997             conf.getBoolean(
3998                 "hbase.skip.errors",
3999                 HConstants.DEFAULT_HREGION_EDITS_REPLAY_SKIP_ERRORS));
4000         if (conf.get("hbase.skip.errors") != null) {
4001           LOG.warn(
4002               "The property 'hbase.skip.errors' has been deprecated. Please use " +
4003               HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + " instead.");
4004         }
4005         if (skipErrors) {
4006           Path p = WALSplitter.moveAsideBadEditsFile(fs, edits);
4007           LOG.error(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS
4008               + "=true so continuing. Renamed " + edits +
4009               " as " + p, e);
4010         } else {
4011           throw e;
4012         }
4013       }
4014     }
4015     // The edits size added into rsAccounting during this replaying will not
4016     // be required any more. So just clear it.
4017     if (this.rsAccounting != null) {
4018       this.rsAccounting.clearRegionReplayEditsSize(getRegionInfo().getRegionName());
4019     }
4020     if (seqid > minSeqIdForTheRegion) {
4021       // Then we added some edits to memory. Flush and cleanup split edit files.
4022       internalFlushcache(null, seqid, stores.values(), status, false);
4023     }
4024     // Now delete the content of recovered edits.  We're done w/ them.
4025     if (files.size() > 0 && this.conf.getBoolean("hbase.region.archive.recovered.edits", false)) {
4026       // For debugging data loss issues!
4027       // If this flag is set, make use of the hfile archiving by making recovered.edits a fake
4028       // column family. Have to fake out file type too by casting our recovered.edits as storefiles
4029       String fakeFamilyName = WALSplitter.getRegionDirRecoveredEditsDir(regiondir).getName();
4030       Set<StoreFile> fakeStoreFiles = new HashSet<StoreFile>(files.size());
4031       for (Path file: files) {
4032         fakeStoreFiles.add(new StoreFile(getRegionFileSystem().getFileSystem(), file, this.conf,
4033           null, null));
4034       }
4035       getRegionFileSystem().removeStoreFiles(fakeFamilyName, fakeStoreFiles);
4036     } else {
4037       for (Path file: files) {
4038         if (!fs.delete(file, false)) {
4039           LOG.error("Failed delete of " + file);
4040         } else {
4041           LOG.debug("Deleted recovered.edits file=" + file);
4042         }
4043       }
4044     }
4045     return seqid;
4046   }
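       /*
        * Editor's note -- a worked example of the per-file skip decision made above; the store
        * seqIds and the file name are hypothetical.  A recovered edits file is named by the
        * maximum sequence id it may contain, which is what Long.parseLong(fileName) extracts.
        *
        *   // maxSeqIdInStores: {cf1=120, cf2=95}  ->  minSeqIdForTheRegion = 95
        *   long minSeqIdForTheRegion = Math.min(120L, 95L);
        *   long fileMaxSeqId = Math.abs(Long.parseLong("0000000000000000100"));   // 100
        *   boolean replayFile = fileMaxSeqId > minSeqIdForTheRegion;   // true, so the file is replayed
        *
        * Setting "hbase.region.archive.recovered.edits" to true (see above) archives the replayed
        * files instead of deleting them, which can help when debugging data loss.
        */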
4047 
4048   /*
4049    * @param edits File of recovered edits.
4050    * @param maxSeqIdInStores Maximum sequenceid found in each store.  Edits in wal
4051    * must be larger than this to be replayed for each store.
4052    * @param reporter
4053    * @return the sequence id of the last edit added to this region out of the
4054    * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
4055    * @throws IOException
4056    */
4057   private long replayRecoveredEdits(final Path edits,
4058       Map<byte[], Long> maxSeqIdInStores, final CancelableProgressable reporter)
4059     throws IOException {
4060     String msg = "Replaying edits from " + edits;
4061     LOG.info(msg);
4062     MonitoredTask status = TaskMonitor.get().createStatus(msg);
4063     FileSystem fs = this.fs.getFileSystem();
4064 
4065     status.setStatus("Opening recovered edits");
4066     WAL.Reader reader = null;
4067     try {
4068       reader = WALFactory.createReader(fs, edits, conf);
4069       long currentEditSeqId = -1;
4070       long currentReplaySeqId = -1;
4071       long firstSeqIdInLog = -1;
4072       long skippedEdits = 0;
4073       long editsCount = 0;
4074       long intervalEdits = 0;
4075       WAL.Entry entry;
4076       Store store = null;
4077       boolean reported_once = false;
4078       ServerNonceManager ng = this.rsServices == null ? null : this.rsServices.getNonceManager();
4079 
4080       try {
4081         // How many edits seen before we check elapsed time
4082         int interval = this.conf.getInt("hbase.hstore.report.interval.edits", 2000);
4083         // How often to send a progress report (default 1/2 master timeout)
4084         int period = this.conf.getInt("hbase.hstore.report.period", 300000);
4085         long lastReport = EnvironmentEdgeManager.currentTime();
4086 
4087         while ((entry = reader.next()) != null) {
4088           WALKey key = entry.getKey();
4089           WALEdit val = entry.getEdit();
4090 
4091           if (ng != null) { // null in some tests, or when nonces are disabled
4092             ng.reportOperationFromWal(key.getNonceGroup(), key.getNonce(), key.getWriteTime());
4093           }
4094 
4095           if (reporter != null) {
4096             intervalEdits += val.size();
4097             if (intervalEdits >= interval) {
4098               // Number of edits interval reached
4099               intervalEdits = 0;
4100               long cur = EnvironmentEdgeManager.currentTime();
4101               if (lastReport + period <= cur) {
4102                 status.setStatus("Replaying edits..." +
4103                     " skipped=" + skippedEdits +
4104                     " edits=" + editsCount);
4105                 // Timeout reached
4106                 if(!reporter.progress()) {
4107                   msg = "Progressable reporter failed, stopping replay";
4108                   LOG.warn(msg);
4109                   status.abort(msg);
4110                   throw new IOException(msg);
4111                 }
4112                 reported_once = true;
4113                 lastReport = cur;
4114               }
4115             }
4116           }
4117 
4118           if (firstSeqIdInLog == -1) {
4119             firstSeqIdInLog = key.getLogSeqNum();
4120           }
4121           if (currentEditSeqId > key.getLogSeqNum()) {
4122             // when this condition is true, it means we have a serious defect because we need to
4123             // maintain increasing SeqId for WAL edits per region
4124             LOG.error(getRegionInfo().getEncodedName() + " : "
4125                  + "Found decreasing SeqId. PreId=" + currentEditSeqId + " key=" + key
4126                 + "; edit=" + val);
4127           } else {
4128             currentEditSeqId = key.getLogSeqNum();
4129           }
4130           currentReplaySeqId = (key.getOrigLogSeqNum() > 0) ?
4131             key.getOrigLogSeqNum() : currentEditSeqId;
4132 
4133           // Start coprocessor replay here. The coprocessor is for each WALEdit
4134           // instead of a KeyValue.
4135           if (coprocessorHost != null) {
4136             status.setStatus("Running pre-WAL-restore hook in coprocessors");
4137             if (coprocessorHost.preWALRestore(this.getRegionInfo(), key, val)) {
4138               // if the coprocessor bypassed this wal entry, skip it ...
4139               continue;
4140             }
4141           }
4142           boolean checkRowWithinBoundary = false;
4143           // Check this edit is for this region.
4144           if (!Bytes.equals(key.getEncodedRegionName(),
4145               this.getRegionInfo().getEncodedNameAsBytes())) {
4146             checkRowWithinBoundary = true;
4147           }
4148 
4149           boolean flush = false;
4150           for (Cell cell: val.getCells()) {
4151             // Check this edit is for me. Also, guard against writing the special
4152             // METACOLUMN info such as HBASE::CACHEFLUSH entries
4153             if (CellUtil.matchingFamily(cell, WALEdit.METAFAMILY)) {
4154               // if region names don't match, skip replaying the compaction marker
4155               if (!checkRowWithinBoundary) {
4156                 //this is a special edit, we should handle it
4157                 CompactionDescriptor compaction = WALEdit.getCompaction(cell);
4158                 if (compaction != null) {
4159                   //replay the compaction
4160                   replayWALCompactionMarker(compaction, false, true, Long.MAX_VALUE);
4161                 }
4162               }
4163               skippedEdits++;
4164               continue;
4165             }
4166             // Figure which store the edit is meant for.
4167             if (store == null || !CellUtil.matchingFamily(cell, store.getFamily().getName())) {
4168               store = getStore(cell);
4169             }
4170             if (store == null) {
4171               // This should never happen.  Perhaps schema was changed between
4172               // crash and redeploy?
4173               LOG.warn("No family for " + cell);
4174               skippedEdits++;
4175               continue;
4176             }
4177             if (checkRowWithinBoundary && !rowIsInRange(this.getRegionInfo(),
4178               cell.getRowArray(), cell.getRowOffset(), cell.getRowLength())) {
4179               LOG.warn("Row of " + cell + " is not within region boundary");
4180               skippedEdits++;
4181               continue;
4182             }
4183             // Now, figure if we should skip this edit.
4184             if (key.getLogSeqNum() <= maxSeqIdInStores.get(store.getFamily()
4185                 .getName())) {
4186               skippedEdits++;
4187               continue;
4188             }
4189             CellUtil.setSequenceId(cell, currentReplaySeqId);
4190 
4191             // Once we are over the limit, restoreEdit will keep returning true to
4192             // flush -- but don't flush until we've played all the kvs that make up
4193             // the WALEdit.
4194             flush |= restoreEdit(store, cell);
4195             editsCount++;
4196           }
4197           if (flush) {
4198             internalFlushcache(null, currentEditSeqId, stores.values(), status, false);
4199           }
4200 
4201           if (coprocessorHost != null) {
4202             coprocessorHost.postWALRestore(this.getRegionInfo(), key, val);
4203           }
4204         }
4205       } catch (EOFException eof) {
4206         Path p = WALSplitter.moveAsideBadEditsFile(fs, edits);
4207         msg = "Encountered EOF. Most likely due to Master failure during " +
4208             "wal splitting, so we have this data in another edit.  " +
4209             "Continuing, but renaming " + edits + " as " + p;
4210         LOG.warn(msg, eof);
4211         status.abort(msg);
4212       } catch (IOException ioe) {
4213         // If the IOE resulted from bad file format,
4214         // then this problem is idempotent and retrying won't help
4215         if (ioe.getCause() instanceof ParseException) {
4216           Path p = WALSplitter.moveAsideBadEditsFile(fs, edits);
4217           msg = "File corruption encountered!  " +
4218               "Continuing, but renaming " + edits + " as " + p;
4219           LOG.warn(msg, ioe);
4220           status.setStatus(msg);
4221         } else {
4222           status.abort(StringUtils.stringifyException(ioe));
4223           // other IO errors may be transient (bad network connection,
4224           // checksum exception on one datanode, etc).  throw & retry
4225           throw ioe;
4226         }
4227       }
4228       if (reporter != null && !reported_once) {
4229         reporter.progress();
4230       }
4231       msg = "Applied " + editsCount + ", skipped " + skippedEdits +
4232         ", firstSequenceIdInLog=" + firstSeqIdInLog +
4233         ", maxSequenceIdInLog=" + currentEditSeqId + ", path=" + edits;
4234       status.markComplete(msg);
4235       LOG.debug(msg);
4236       return currentEditSeqId;
4237     } finally {
4238       status.cleanup();
4239       if (reader != null) {
4240          reader.close();
4241       }
4242     }
4243   }
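       /*
        * Editor's note -- illustrative sketch, not part of the original source.  The progress
        * reporting in the loop above is driven by the two settings read at the top of the
        * method; the values below are examples only.
        *
        *   Configuration conf = HBaseConfiguration.create();
        *   conf.setInt("hbase.hstore.report.interval.edits", 500);  // check elapsed time every 500 edits
        *   conf.setInt("hbase.hstore.report.period", 60000);        // report no more often than once a minute
        */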
4244 
4245   /**
4246    * Call to complete a compaction. Its for the case where we find in the WAL a compaction
4247    * that was not finished.  We could find one recovering a WAL after a regionserver crash.
4248    * See HBASE-2331.
4249    */
4250   void replayWALCompactionMarker(CompactionDescriptor compaction, boolean pickCompactionFiles,
4251       boolean removeFiles, long replaySeqId)
4252       throws IOException {
4253     try {
4254       checkTargetRegion(compaction.getEncodedRegionName().toByteArray(),
4255         "Compaction marker from WAL ", compaction);
4256     } catch (WrongRegionException wre) {
4257       if (RegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4258         // skip the compaction marker since it is not for this region
4259         return;
4260       }
4261       throw wre;
4262     }
4263 
4264     synchronized (writestate) {
4265       if (replaySeqId < lastReplayedOpenRegionSeqId) {
4266         LOG.warn(getRegionInfo().getEncodedName() + " : "
4267             + "Skipping replaying compaction event :" + TextFormat.shortDebugString(compaction)
4268             + " because its sequence id " + replaySeqId + " is smaller than this regions "
4269             + "lastReplayedOpenRegionSeqId of " + lastReplayedOpenRegionSeqId);
4270         return;
4271       }
4272       if (replaySeqId < lastReplayedCompactionSeqId) {
4273         LOG.warn(getRegionInfo().getEncodedName() + " : "
4274             + "Skipping replaying compaction event :" + TextFormat.shortDebugString(compaction)
4275             + " because its sequence id " + replaySeqId + " is smaller than this regions "
4276             + "lastReplayedCompactionSeqId of " + lastReplayedCompactionSeqId);
4277         return;
4278       } else {
4279         lastReplayedCompactionSeqId = replaySeqId;
4280       }
4281 
4282       if (LOG.isDebugEnabled()) {
4283         LOG.debug(getRegionInfo().getEncodedName() + " : "
4284             + "Replaying compaction marker " + TextFormat.shortDebugString(compaction)
4285             + " with seqId=" + replaySeqId + " and lastReplayedOpenRegionSeqId="
4286             + lastReplayedOpenRegionSeqId);
4287       }
4288 
4289       startRegionOperation(Operation.REPLAY_EVENT);
4290       try {
4291         Store store = this.getStore(compaction.getFamilyName().toByteArray());
4292         if (store == null) {
4293           LOG.warn(getRegionInfo().getEncodedName() + " : "
4294               + "Found Compaction WAL edit for deleted family:"
4295               + Bytes.toString(compaction.getFamilyName().toByteArray()));
4296           return;
4297         }
4298         store.replayCompactionMarker(compaction, pickCompactionFiles, removeFiles);
4299         logRegionFiles();
4300       } catch (FileNotFoundException ex) {
4301         LOG.warn(getRegionInfo().getEncodedName() + " : "
4302             + "At least one of the store files in compaction: "
4303             + TextFormat.shortDebugString(compaction)
4304             + " doesn't exist any more. Skip loading the file(s)", ex);
4305       } finally {
4306         closeRegionOperation(Operation.REPLAY_EVENT);
4307       }
4308     }
4309   }
4310 
4311   void replayWALFlushMarker(FlushDescriptor flush, long replaySeqId) throws IOException {
4312     checkTargetRegion(flush.getEncodedRegionName().toByteArray(),
4313       "Flush marker from WAL ", flush);
4314 
4315     if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4316       return; // if primary nothing to do
4317     }
4318 
4319     if (LOG.isDebugEnabled()) {
4320       LOG.debug(getRegionInfo().getEncodedName() + " : "
4321           + "Replaying flush marker " + TextFormat.shortDebugString(flush));
4322     }
4323 
4324     startRegionOperation(Operation.REPLAY_EVENT); // use region close lock to guard against close
4325     try {
4326       FlushAction action = flush.getAction();
4327       switch (action) {
4328       case START_FLUSH:
4329         replayWALFlushStartMarker(flush);
4330         break;
4331       case COMMIT_FLUSH:
4332         replayWALFlushCommitMarker(flush);
4333         break;
4334       case ABORT_FLUSH:
4335         replayWALFlushAbortMarker(flush);
4336         break;
4337       case CANNOT_FLUSH:
4338         replayWALFlushCannotFlushMarker(flush, replaySeqId);
4339         break;
4340       default:
4341         LOG.warn(getRegionInfo().getEncodedName() + " : " +
4342           "Received a flush event with unknown action, ignoring. " +
4343           TextFormat.shortDebugString(flush));
4344         break;
4345       }
4346 
4347       logRegionFiles();
4348     } finally {
4349       closeRegionOperation(Operation.REPLAY_EVENT);
4350     }
4351   }
4352 
4353   /** Replay the flush marker from primary region by creating a corresponding snapshot of
4354    * the store memstores, only if the memstores do not have a higher seqId from an earlier wal
4355    * edit (because the events may be coming out of order).
4356    */
4357   @VisibleForTesting
4358   PrepareFlushResult replayWALFlushStartMarker(FlushDescriptor flush) throws IOException {
4359     long flushSeqId = flush.getFlushSequenceNumber();
4360 
4361     HashSet<Store> storesToFlush = new HashSet<Store>();
4362     for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) {
4363       byte[] family = storeFlush.getFamilyName().toByteArray();
4364       Store store = getStore(family);
4365       if (store == null) {
4366         LOG.warn(getRegionInfo().getEncodedName() + " : "
4367           + "Received a flush start marker from primary, but the family is not found. Ignoring"
4368           + " StoreFlushDescriptor:" + TextFormat.shortDebugString(storeFlush));
4369         continue;
4370       }
4371       storesToFlush.add(store);
4372     }
4373 
4374     MonitoredTask status = TaskMonitor.get().createStatus("Preparing flush " + this);
4375 
4376     // we will use writestate as a coarse-grain lock for all the replay events
4377     // (flush, compaction, region open etc)
4378     synchronized (writestate) {
4379       try {
4380         if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) {
4381           LOG.warn(getRegionInfo().getEncodedName() + " : "
4382               + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
4383               + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId "
4384               + " of " + lastReplayedOpenRegionSeqId);
4385           return null;
4386         }
4387         if (numMutationsWithoutWAL.get() > 0) {
4388           numMutationsWithoutWAL.set(0);
4389           dataInMemoryWithoutWAL.set(0);
4390         }
4391 
4392         if (!writestate.flushing) {
4393           // we do not have an active snapshot and corresponding this.prepareResult. This means
4394           // we can just snapshot our memstores and continue as normal.
4395 
4396           // invoke prepareFlushCache. Send null as wal since we do not want the flush events in wal
4397           PrepareFlushResult prepareResult = internalPrepareFlushCache(null,
4398             flushSeqId, storesToFlush, status, false);
4399           if (prepareResult.result == null) {
4400             // save the PrepareFlushResult so that we can use it later from commit flush
4401             this.writestate.flushing = true;
4402             this.prepareFlushResult = prepareResult;
4403             status.markComplete("Flush prepare successful");
4404             if (LOG.isDebugEnabled()) {
4405               LOG.debug(getRegionInfo().getEncodedName() + " : "
4406                   + " Prepared flush with seqId:" + flush.getFlushSequenceNumber());
4407             }
4408           } else {
4409             // special case empty memstore. We will still save the flush result in this case, since
4410             // our memstore is empty, but the primary is still flushing
4411             if (prepareResult.getResult().getResult() ==
4412                   FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) {
4413               this.writestate.flushing = true;
4414               this.prepareFlushResult = prepareResult;
4415               if (LOG.isDebugEnabled()) {
4416                 LOG.debug(getRegionInfo().getEncodedName() + " : "
4417                   + " Prepared empty flush with seqId:" + flush.getFlushSequenceNumber());
4418               }
4419             }
4420             status.abort("Flush prepare failed with " + prepareResult.result);
4421             // nothing much to do. prepare flush failed because of some reason.
4422           }
4423           return prepareResult;
4424         } else {
4425           // we already have an active snapshot.
4426           if (flush.getFlushSequenceNumber() == this.prepareFlushResult.flushOpSeqId) {
4427             // They define the same flush. Log and continue.
4428             LOG.warn(getRegionInfo().getEncodedName() + " : "
4429                 + "Received a flush prepare marker with the same seqId: " +
4430                 + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4431                 + prepareFlushResult.flushOpSeqId + ". Ignoring");
4432             // ignore
4433           } else if (flush.getFlushSequenceNumber() < this.prepareFlushResult.flushOpSeqId) {
4434             // We received a flush with a smaller seqNum than what we have prepared. We can only
4435             // ignore this prepare flush request.
4436             LOG.warn(getRegionInfo().getEncodedName() + " : "
4437                 + "Received a flush prepare marker with a smaller seqId: " +
4438                 + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4439                 + prepareFlushResult.flushOpSeqId + ". Ignoring");
4440             // ignore
4441           } else {
4442             // We received a flush with a larger seqNum than what we have prepared
4443             LOG.warn(getRegionInfo().getEncodedName() + " : "
4444                 + "Received a flush prepare marker with a larger seqId: " +
4445                 + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4446                 + prepareFlushResult.flushOpSeqId + ". Ignoring");
4447             // We do not have multiple active snapshots in the memstore or a way to merge current
4448             // memstore snapshot with the contents and resnapshot for now. We cannot take
4449             // another snapshot and drop the previous one because that will cause temporary
4450             // data loss in the secondary. So we ignore this for now, deferring the resolution
4451             // to happen when we see the corresponding flush commit marker. If we have a memstore
4452             // snapshot with x, and later received another prepare snapshot with y (where x < y),
4453             // when we see flush commit for y, we will drop snapshot for x, and can also drop all
4454             // the memstore edits if everything in memstore is < y. This is the usual case for
4455             // RS crash + recovery where we might see consecutive prepare flush wal markers.
4456             // Otherwise, this will cause more memory to be used in the secondary replica until a
4457             // further prepare + commit flush is seen and replayed.
4458           }
4459         }
4460       } finally {
4461         status.cleanup();
4462         writestate.notifyAll();
4463       }
4464     }
4465     return null;
4466   }
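       /*
        * Editor's note -- a worked example of the three-way seqId comparison above when a
        * prepared snapshot already exists; the seqIds are hypothetical.
        *
        *   long preparedSeqId = 10L;   // seqId of the snapshot this replica already holds
        *   long incomingSeqId = 20L;   // seqId of a newly received START_FLUSH marker
        *   if (incomingSeqId == preparedSeqId) {
        *     // the same flush replayed again -> ignore
        *   } else if (incomingSeqId < preparedSeqId) {
        *     // stale marker -> ignore
        *   } else {
        *     // keep the existing snapshot; resolution is deferred to the COMMIT_FLUSH for seqId 20
        *   }
        */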
4467 
4468   @VisibleForTesting
4469   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY",
4470     justification="Intentional; post memstore flush")
4471   void replayWALFlushCommitMarker(FlushDescriptor flush) throws IOException {
4472     MonitoredTask status = TaskMonitor.get().createStatus("Committing flush " + this);
4473 
4474     // check whether we have the memstore snapshot with the corresponding seqId. Replay to
4475     // secondary region replicas are in order, except for when the region moves or when the
4476     // region server crashes. In those cases, we may receive replay requests out of order from
4477     // the original seqIds.
4478     synchronized (writestate) {
4479       try {
4480         if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) {
4481           LOG.warn(getRegionInfo().getEncodedName() + " : "
4482             + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
4483             + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId "
4484             + " of " + lastReplayedOpenRegionSeqId);
4485           return;
4486         }
4487 
4488         if (writestate.flushing) {
4489           PrepareFlushResult prepareFlushResult = this.prepareFlushResult;
4490           if (flush.getFlushSequenceNumber() == prepareFlushResult.flushOpSeqId) {
4491             if (LOG.isDebugEnabled()) {
4492               LOG.debug(getRegionInfo().getEncodedName() + " : "
4493                   + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber()
4494                   + " and a previous prepared snapshot was found");
4495             }
4496             // This is the regular case where we received commit flush after prepare flush
4497             // corresponding to the same seqId.
4498             replayFlushInStores(flush, prepareFlushResult, true);
4499 
4500             // Set down the memstore size by amount of flush.
4501             this.addAndGetGlobalMemstoreSize(-prepareFlushResult.totalFlushableSize);
4502 
4503             this.prepareFlushResult = null;
4504             writestate.flushing = false;
4505           } else if (flush.getFlushSequenceNumber() < prepareFlushResult.flushOpSeqId) {
4506             // This should not happen normally. However, let's be safe and guard against these cases
4507             // we received a flush commit with a smaller seqId than what we have prepared
4508             // we will pick the flush file up from this commit (if we have not seen it), but we
4509             // will not drop the memstore
4510             LOG.warn(getRegionInfo().getEncodedName() + " : "
4511                 + "Received a flush commit marker with smaller seqId: "
4512                 + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: "
4513                 + prepareFlushResult.flushOpSeqId + ". Picking up new file, but not dropping"
4514                 +"  prepared memstore snapshot");
4515             replayFlushInStores(flush, prepareFlushResult, false);
4516 
4517             // snapshot is not dropped, so memstore sizes should not be decremented
4518             // we still have the prepared snapshot, flushing should still be true
4519           } else {
4520             // This should not happen normally. However, let's be safe and guard against these cases
4521             // we received a flush commit with a larger seqId than what we have prepared
4522             // we will pick the flush file for this. We will also obtain the updates lock and
4523             // look for contents of the memstore to see whether we have edits after this seqId.
4524             // If not, we will drop all the memstore edits and the snapshot as well.
4525             LOG.warn(getRegionInfo().getEncodedName() + " : "
4526                 + "Received a flush commit marker with larger seqId: "
4527                 + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: " +
4528                 prepareFlushResult.flushOpSeqId + ". Picking up new file and dropping prepared"
4529                 +" memstore snapshot");
4530 
4531             replayFlushInStores(flush, prepareFlushResult, true);
4532 
4533             // Set down the memstore size by amount of flush.
4534             this.addAndGetGlobalMemstoreSize(-prepareFlushResult.totalFlushableSize);
4535 
4536             // Inspect the memstore contents to see whether the memstore contains only edits
4537             // with seqId smaller than the flush seqId. If so, we can discard those edits.
4538             dropMemstoreContentsForSeqId(flush.getFlushSequenceNumber(), null);
4539 
4540             this.prepareFlushResult = null;
4541             writestate.flushing = false;
4542           }
4543           // If we were waiting to observe a flush or region opening event before exposing data
4544           // (so as not to show partial data after a secondary region crash), we can allow reads
4545           // now. We can only be sure that we are not showing partial data (for example skipping
4546           // some previous edits) once we observe a full flush start and flush commit. So if we
4547           // were not able to find a previous flush, we will not enable reads now.
4548           this.setReadsEnabled(true);
4549         } else {
4550           LOG.warn(getRegionInfo().getEncodedName() + " : "
4551               + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber()
4552               + ", but no previous prepared snapshot was found");
4553           // There is no corresponding prepare snapshot from before.
4554           // We will pick up the new flushed file
4555           replayFlushInStores(flush, null, false);
4556 
4557           // Inspect the memstore contents to see whether the memstore contains only edits
4558           // with seqId smaller than the flush seqId. If so, we can discard those edits.
4559           dropMemstoreContentsForSeqId(flush.getFlushSequenceNumber(), null);
4560         }
4561 
4562         status.markComplete("Flush commit successful");
4563 
4564         // Update the last flushed sequence id for region.
4565         this.maxFlushedSeqId = flush.getFlushSequenceNumber();
4566 
4567         // advance the mvcc read point so that the new flushed file is visible.
4568         mvcc.advanceTo(flush.getFlushSequenceNumber());
4569 
4570       } catch (FileNotFoundException ex) {
4571         LOG.warn(getRegionInfo().getEncodedName() + " : "
4572             + "At least one of the store files in flush: " + TextFormat.shortDebugString(flush)
4573             + " doesn't exist any more. Skip loading the file(s)", ex);
4574       }
4575       finally {
4576         status.cleanup();
4577         writestate.notifyAll();
4578       }
4579     }
4580 
4581     // C. Finally notify anyone waiting on memstore to clear:
4582     // e.g. checkResources().
4583     synchronized (this) {
4584       notifyAll(); // FindBugs NN_NAKED_NOTIFY
4585     }
4586   }
4587 
4588   /**
4589    * Replays the given flush descriptor by opening the flush files in stores and dropping the
4590    * memstore snapshots if requested.
4591    * @param flush
4592    * @param prepareFlushResult
4593    * @param dropMemstoreSnapshot
4594    * @throws IOException
4595    */
4596   private void replayFlushInStores(FlushDescriptor flush, PrepareFlushResult prepareFlushResult,
4597       boolean dropMemstoreSnapshot)
4598       throws IOException {
4599     for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) {
4600       byte[] family = storeFlush.getFamilyName().toByteArray();
4601       Store store = getStore(family);
4602       if (store == null) {
4603         LOG.warn(getRegionInfo().getEncodedName() + " : "
4604             + "Received a flush commit marker from primary, but the family is not found."
4605             + "Ignoring StoreFlushDescriptor:" + storeFlush);
4606         continue;
4607       }
4608       List<String> flushFiles = storeFlush.getFlushOutputList();
4609       StoreFlushContext ctx = null;
4610       long startTime = EnvironmentEdgeManager.currentTime();
4611       if (prepareFlushResult == null || prepareFlushResult.storeFlushCtxs == null) {
4612         ctx = store.createFlushContext(flush.getFlushSequenceNumber());
4613       } else {
4614         ctx = prepareFlushResult.storeFlushCtxs.get(family);
4615         startTime = prepareFlushResult.startTime;
4616       }
4617 
4618       if (ctx == null) {
4619         LOG.warn(getRegionInfo().getEncodedName() + " : "
4620             + "Unexpected: flush commit marker received from store "
4621             + Bytes.toString(family) + " but no associated flush context. Ignoring");
4622         continue;
4623       }
4624 
4625       ctx.replayFlush(flushFiles, dropMemstoreSnapshot); // replay the flush
4626 
4627       // Record latest flush time
4628       this.lastStoreFlushTimeMap.put(store, startTime);
4629     }
4630   }
4631 
4632   /**
4633    * Drops the memstore contents after replaying a flush descriptor or region open event replay
4634    * if the memstore edits have seqNums smaller than the given seq id
4635    * @throws IOException
4636    */
4637   private long dropMemstoreContentsForSeqId(long seqId, Store store) throws IOException {
4638     long totalFreedSize = 0;
4639     this.updatesLock.writeLock().lock();
4640     try {
4641 
4642       long currentSeqId = mvcc.getReadPoint();
4643       if (seqId >= currentSeqId) {
4644         // then we can drop the memstore contents since everything is below this seqId
4645         LOG.info(getRegionInfo().getEncodedName() + " : "
4646             + "Dropping memstore contents as well since replayed flush seqId: "
4647             + seqId + " is greater than current seqId:" + currentSeqId);
4648 
4649         // Prepare flush (take a snapshot) and then abort (drop the snapshot)
4650         if (store == null) {
4651           for (Store s : stores.values()) {
4652             totalFreedSize += doDropStoreMemstoreContentsForSeqId(s, currentSeqId);
4653           }
4654         } else {
4655           totalFreedSize += doDropStoreMemstoreContentsForSeqId(store, currentSeqId);
4656         }
4657       } else {
4658         LOG.info(getRegionInfo().getEncodedName() + " : "
4659             + "Not dropping memstore contents since replayed flush seqId: "
4660             + seqId + " is smaller than current seqId:" + currentSeqId);
4661       }
4662     } finally {
4663       this.updatesLock.writeLock().unlock();
4664     }
4665     return totalFreedSize;
4666   }
4667 
4668   private long doDropStoreMemstoreContentsForSeqId(Store s, long currentSeqId) throws IOException {
4669     long snapshotSize = s.getFlushableSize();
4670     this.addAndGetGlobalMemstoreSize(-snapshotSize);
4671     StoreFlushContext ctx = s.createFlushContext(currentSeqId);
4672     ctx.prepare();
4673     ctx.abort();
4674     return snapshotSize;
4675   }
4676 
4677   private void replayWALFlushAbortMarker(FlushDescriptor flush) {
4678     // nothing to do for now. A flush abort will cause a RS abort which means that the region
4679     // will be opened somewhere else later. We will see the region open event soon, and replaying
4680     // that will drop the snapshot
4681   }
4682 
4683   private void replayWALFlushCannotFlushMarker(FlushDescriptor flush, long replaySeqId) {
4684     synchronized (writestate) {
4685       if (this.lastReplayedOpenRegionSeqId > replaySeqId) {
4686         LOG.warn(getRegionInfo().getEncodedName() + " : "
4687           + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
4688           + " because its sequence id " + replaySeqId + " is smaller than this regions "
4689           + "lastReplayedOpenRegionSeqId of " + lastReplayedOpenRegionSeqId);
4690         return;
4691       }
4692 
4693       // If we were waiting to observe a flush or region opening event so as not to show partial
4694       // data after a secondary region crash, we can allow reads now. This event means that the
4695       // primary was not able to flush because the memstore was empty when we requested a flush. By the
4696       // time we observe this, we are guaranteed to have up to date seqId with our previous
4697       // assignment.
4698       this.setReadsEnabled(true);
4699     }
4700   }
4701 
4702   @VisibleForTesting
4703   PrepareFlushResult getPrepareFlushResult() {
4704     return prepareFlushResult;
4705   }
4706 
4707   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY",
4708       justification="Intentional; cleared the memstore")
4709   void replayWALRegionEventMarker(RegionEventDescriptor regionEvent) throws IOException {
4710     checkTargetRegion(regionEvent.getEncodedRegionName().toByteArray(),
4711       "RegionEvent marker from WAL ", regionEvent);
4712 
4713     startRegionOperation(Operation.REPLAY_EVENT);
4714     try {
4715       if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4716         return; // if primary nothing to do
4717       }
4718 
4719       if (regionEvent.getEventType() == EventType.REGION_CLOSE) {
4720         // nothing to do on REGION_CLOSE for now.
4721         return;
4722       }
4723       if (regionEvent.getEventType() != EventType.REGION_OPEN) {
4724         LOG.warn(getRegionInfo().getEncodedName() + " : "
4725             + "Unknown region event received, ignoring :"
4726             + TextFormat.shortDebugString(regionEvent));
4727         return;
4728       }
4729 
4730       if (LOG.isDebugEnabled()) {
4731         LOG.debug(getRegionInfo().getEncodedName() + " : "
4732           + "Replaying region open event marker " + TextFormat.shortDebugString(regionEvent));
4733       }
4734 
4735       // we will use writestate as a coarse-grain lock for all the replay events
4736       synchronized (writestate) {
4737         // Replication can deliver events out of order when primary region moves or the region
4738         // server crashes, since there is no coordination between replication of different wal files
4739         // belonging to different region servers. We have to safeguard against this case by using
4740         // region open event's seqid. Since this is the first event that the region puts (after
4741         // possibly flushing recovered.edits), after seeing this event, we can ignore every edit
4742         // smaller than this seqId
4743         if (this.lastReplayedOpenRegionSeqId <= regionEvent.getLogSequenceNumber()) {
4744           this.lastReplayedOpenRegionSeqId = regionEvent.getLogSequenceNumber();
4745         } else {
4746           LOG.warn(getRegionInfo().getEncodedName() + " : "
4747             + "Skipping replaying region event :" + TextFormat.shortDebugString(regionEvent)
4748             + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId "
4749             + " of " + lastReplayedOpenRegionSeqId);
4750           return;
4751         }
4752 
4753         // region open lists all the files that the region has at the time of the opening. Just pick
4754         // all the files and drop prepared flushes and empty memstores
4755         for (StoreDescriptor storeDescriptor : regionEvent.getStoresList()) {
4756           // stores of primary may be different now
4757           byte[] family = storeDescriptor.getFamilyName().toByteArray();
4758           Store store = getStore(family);
4759           if (store == null) {
4760             LOG.warn(getRegionInfo().getEncodedName() + " : "
4761                 + "Received a region open marker from primary, but the family is not found. "
4762                 + "Ignoring. StoreDescriptor:" + storeDescriptor);
4763             continue;
4764           }
4765 
4766           long storeSeqId = store.getMaxSequenceId();
4767           List<String> storeFiles = storeDescriptor.getStoreFileList();
4768           try {
4769             store.refreshStoreFiles(storeFiles); // replace the files with the new ones
4770           } catch (FileNotFoundException ex) {
4771             LOG.warn(getRegionInfo().getEncodedName() + " : "
4772                     + "At least one of the store files: " + storeFiles
4773                     + " doesn't exist any more. Skip loading the file(s)", ex);
4774             continue;
4775           }
4776           if (store.getMaxSequenceId() != storeSeqId) {
4777             // Record latest flush time if we picked up new files
4778             lastStoreFlushTimeMap.put(store, EnvironmentEdgeManager.currentTime());
4779           }
4780 
4781           if (writestate.flushing) {
4782             // only drop memstore snapshots if they are smaller than last flush for the store
4783             if (this.prepareFlushResult.flushOpSeqId <= regionEvent.getLogSequenceNumber()) {
4784               StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null ?
4785                   null : this.prepareFlushResult.storeFlushCtxs.get(family);
4786               if (ctx != null) {
4787                 long snapshotSize = store.getFlushableSize();
4788                 ctx.abort();
4789                 this.addAndGetGlobalMemstoreSize(-snapshotSize);
4790                 this.prepareFlushResult.storeFlushCtxs.remove(family);
4791               }
4792             }
4793           }
4794 
4795           // Drop the memstore contents if they are now smaller than the latest seen flushed file
4796           dropMemstoreContentsForSeqId(regionEvent.getLogSequenceNumber(), store);
4797           if (storeSeqId > this.maxFlushedSeqId) {
4798             this.maxFlushedSeqId = storeSeqId;
4799           }
4800         }
4801 
4802         // if all stores ended up dropping their snapshots, we can safely drop the
4803         // prepareFlushResult
4804         dropPrepareFlushIfPossible();
4805 
4806         // advance the mvcc read point so that the new flushed file is visible.
4807         mvcc.await();
4808 
4809         // If we were waiting to observe a flush or region opening event so as not to show partial
4810         // data after a secondary region crash, we can allow reads now.
4811         this.setReadsEnabled(true);
4812 
4813         // C. Finally notify anyone waiting on memstore to clear:
4814         // e.g. checkResources().
4815         synchronized (this) {
4816           notifyAll(); // FindBugs NN_NAKED_NOTIFY
4817         }
4818       }
4819       logRegionFiles();
4820     } finally {
4821       closeRegionOperation(Operation.REPLAY_EVENT);
4822     }
4823   }
4824 
4825   void replayWALBulkLoadEventMarker(WALProtos.BulkLoadDescriptor bulkLoadEvent) throws IOException {
4826     checkTargetRegion(bulkLoadEvent.getEncodedRegionName().toByteArray(),
4827       "BulkLoad marker from WAL ", bulkLoadEvent);
4828 
4829     if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4830       return; // if primary nothing to do
4831     }
4832 
4833     if (LOG.isDebugEnabled()) {
4834       LOG.debug(getRegionInfo().getEncodedName() + " : "
4835               +  "Replaying bulkload event marker " + TextFormat.shortDebugString(bulkLoadEvent));
4836     }
4837     // check if multiple families involved
4838     boolean multipleFamilies = false;
4839     byte[] family = null;
4840     for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) {
4841       byte[] fam = storeDescriptor.getFamilyName().toByteArray();
4842       if (family == null) {
4843         family = fam;
4844       } else if (!Bytes.equals(family, fam)) {
4845         multipleFamilies = true;
4846         break;
4847       }
4848     }
4849 
4850     startBulkRegionOperation(multipleFamilies);
4851     try {
4852       // we will use writestate as a coarse-grain lock for all the replay events
4853       synchronized (writestate) {
4854         // Replication can deliver events out of order when primary region moves or the region
4855         // server crashes, since there is no coordination between replication of different wal files
4856         // belonging to different region servers. We have to safeguard against this case by using
4857         // region open event's seqid. Since this is the first event that the region puts (after
4858         // possibly flushing recovered.edits), after seeing this event, we can ignore every edit
4859         // smaller than this seqId
4860         if (bulkLoadEvent.getBulkloadSeqNum() >= 0
4861             && this.lastReplayedOpenRegionSeqId >= bulkLoadEvent.getBulkloadSeqNum()) {
4862           LOG.warn(getRegionInfo().getEncodedName() + " : "
4863               + "Skipping replaying bulkload event :"
4864               + TextFormat.shortDebugString(bulkLoadEvent)
4865               + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
4866               + " =" + lastReplayedOpenRegionSeqId);
4867 
4868           return;
4869         }
4870 
4871         for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) {
4872           // stores of primary may be different now
4873           family = storeDescriptor.getFamilyName().toByteArray();
4874           Store store = getStore(family);
4875           if (store == null) {
4876             LOG.warn(getRegionInfo().getEncodedName() + " : "
4877                     + "Received a bulk load marker from primary, but the family is not found. "
4878                     + "Ignoring. StoreDescriptor:" + storeDescriptor);
4879             continue;
4880           }
4881 
4882           List<String> storeFiles = storeDescriptor.getStoreFileList();
4883           for (String storeFile : storeFiles) {
4884             StoreFileInfo storeFileInfo = null;
4885             try {
4886               storeFileInfo = fs.getStoreFileInfo(Bytes.toString(family), storeFile);
4887               store.bulkLoadHFile(storeFileInfo);
4888             } catch(FileNotFoundException ex) {
4889               LOG.warn(getRegionInfo().getEncodedName() + " : "
4890                       + ((storeFileInfo != null) ? storeFileInfo.toString() :
4891                             (new Path(Bytes.toString(family), storeFile)).toString())
4892                       + " doesn't exist any more. Skip loading the file");
4893             }
4894           }
4895         }
4896       }
4897       if (bulkLoadEvent.getBulkloadSeqNum() > 0) {
4898         mvcc.advanceTo(bulkLoadEvent.getBulkloadSeqNum());
4899       }
4900     } finally {
4901       closeBulkRegionOperation();
4902     }
4903   }
4904 
4905   /**
4906    * If all stores ended up dropping their snapshots, we can safely drop the prepareFlushResult
4907    */
4908   private void dropPrepareFlushIfPossible() {
4909     if (writestate.flushing) {
4910       boolean canDrop = true;
4911       if (prepareFlushResult.storeFlushCtxs != null) {
4912         for (Entry<byte[], StoreFlushContext> entry
4913             : prepareFlushResult.storeFlushCtxs.entrySet()) {
4914           Store store = getStore(entry.getKey());
4915           if (store == null) {
4916             continue;
4917           }
4918           if (store.getSnapshotSize() > 0) {
4919             canDrop = false;
4920             break;
4921           }
4922         }
4923       }
4924 
4925       // this means that all the stores in the region have finished flushing, but the WAL marker
4926       // may not have been written or we did not receive it yet.
4927       if (canDrop) {
4928         writestate.flushing = false;
4929         this.prepareFlushResult = null;
4930       }
4931     }
4932   }
4933 
4934   @Override
4935   public boolean refreshStoreFiles() throws IOException {
4936     return refreshStoreFiles(false);
4937   }
4938 
4939   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY",
4940       justification="Notify is about post replay. Intentional")
4941   protected boolean refreshStoreFiles(boolean force) throws IOException {
4942     if (!force && ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4943       return false; // if primary nothing to do
4944     }
4945 
4946     if (LOG.isDebugEnabled()) {
4947       LOG.debug(getRegionInfo().getEncodedName() + " : "
4948           + "Refreshing store files to see whether we can free up memstore");
4949     }
4950 
4951     long totalFreedSize = 0;
4952 
4953     long smallestSeqIdInStores = Long.MAX_VALUE;
4954 
4955     startRegionOperation(); // obtain region close lock
4956     try {
4957       synchronized (writestate) {
4958         for (Store store : getStores()) {
4959           // TODO: some stores might see new data from flush, while others do not which
4960           // MIGHT break atomic edits across column families.
4961           long maxSeqIdBefore = store.getMaxSequenceId();
4962 
4963           // refresh the store files. This is similar to observing a region open wal marker.
4964           store.refreshStoreFiles();
4965 
4966           long storeSeqId = store.getMaxSequenceId();
4967           if (storeSeqId < smallestSeqIdInStores) {
4968             smallestSeqIdInStores = storeSeqId;
4969           }
4970 
4971           // see whether we can drop the memstore or the snapshot
4972           if (storeSeqId > maxSeqIdBefore) {
4973 
4974             if (writestate.flushing) {
4975               // only drop memstore snapshots if they are smaller than last flush for the store
4976               if (this.prepareFlushResult.flushOpSeqId <= storeSeqId) {
4977                 StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null ?
4978                     null : this.prepareFlushResult.storeFlushCtxs.get(store.getFamily().getName());
4979                 if (ctx != null) {
4980                   long snapshotSize = store.getFlushableSize();
4981                   ctx.abort();
4982                   this.addAndGetGlobalMemstoreSize(-snapshotSize);
4983                   this.prepareFlushResult.storeFlushCtxs.remove(store.getFamily().getName());
4984                   totalFreedSize += snapshotSize;
4985                 }
4986               }
4987             }
4988 
4989             // Drop the memstore contents if they are now smaller than the latest seen flushed file
4990             totalFreedSize += dropMemstoreContentsForSeqId(storeSeqId, store);
4991           }
4992         }
4993 
4994         // if all stores ended up dropping their snapshots, we can safely drop the
4995         // prepareFlushResult
4996         dropPrepareFlushIfPossible();
4997 
4998         // advance the mvcc read point so that the new flushed files are visible.
4999         // either greater than flush seq number or they were already picked up via flush.
5000         for (Store s : getStores()) {
5001           mvcc.advanceTo(s.getMaxMemstoreTS());
5002         }
5003 
5004 
5005         // smallestSeqIdInStores is the seqId that we have a corresponding hfile for. We can safely
5006         // skip all edits that are to be replayed in the future that have a smaller seqId
5007         // than this. We are updating lastReplayedOpenRegionSeqId so that we can skip all edits
5008         // that we have picked the flush files for
5009         if (this.lastReplayedOpenRegionSeqId < smallestSeqIdInStores) {
5010           this.lastReplayedOpenRegionSeqId = smallestSeqIdInStores;
5011         }
5012       }
5013       // C. Finally notify anyone waiting on memstore to clear:
5014       // e.g. checkResources().
5015       synchronized (this) {
5016         notifyAll(); // FindBugs NN_NAKED_NOTIFY
5017       }
5018       return totalFreedSize > 0;
5019     } finally {
5020       closeRegionOperation();
5021     }
5022   }
5023 
5024   private void logRegionFiles() {
5025     if (LOG.isTraceEnabled()) {
5026       LOG.trace(getRegionInfo().getEncodedName() + " : Store files for region: ");
5027       for (Store s : stores.values()) {
5028         Collection<StoreFile> storeFiles = s.getStorefiles();
5029         if (storeFiles == null) continue;
5030         for (StoreFile sf : storeFiles) {
5031           LOG.trace(getRegionInfo().getEncodedName() + " : " + sf);
5032         }
5033       }
5034     }
5035   }
5036 
5037   /** Checks whether the given regionName either equals our region's name, or names
5038    * the primary region of the range covered by this secondary replica.
5039    */
5040   private void checkTargetRegion(byte[] encodedRegionName, String exceptionMsg, Object payload)
5041       throws WrongRegionException {
5042     if (Bytes.equals(this.getRegionInfo().getEncodedNameAsBytes(), encodedRegionName)) {
5043       return;
5044     }
5045 
5046     if (!RegionReplicaUtil.isDefaultReplica(this.getRegionInfo()) &&
5047         Bytes.equals(encodedRegionName,
5048           this.fs.getRegionInfoForFS().getEncodedNameAsBytes())) {
5049       return;
5050     }
5051 
5052     throw new WrongRegionException(exceptionMsg + payload
5053       + " targeted for region " + Bytes.toStringBinary(encodedRegionName)
5054       + " does not match this region: " + this.getRegionInfo());
5055   }
5056 
5057   /**
5058    * Used by tests
5059    * @param s Store to add edit to.
5060    * @param cell Cell to add.
5061    * @return True if we should flush.
5062    */
5063   protected boolean restoreEdit(final Store s, final Cell cell) {
5064     long kvSize = s.add(cell);
5065     if (this.rsAccounting != null) {
5066       rsAccounting.addAndGetRegionReplayEditsSize(getRegionInfo().getRegionName(), kvSize);
5067     }
5068     return isFlushSize(this.addAndGetGlobalMemstoreSize(kvSize));
5069   }
5070 
5071   /*
5072    * @param fs FileSystem to use.
5073    * @param p File to check.
5074    * @return True if file was zero-length (and if so, we'll delete it in here).
5075    * @throws IOException
5076    */
5077   private static boolean isZeroLengthThenDelete(final FileSystem fs, final Path p)
5078       throws IOException {
5079     FileStatus stat = fs.getFileStatus(p);
5080     if (stat.getLen() > 0) return false;
5081     LOG.warn("File " + p + " is zero-length, deleting.");
5082     fs.delete(p, false);
5083     return true;
5084   }
5085 
5086   protected HStore instantiateHStore(final HColumnDescriptor family) throws IOException {
5087     if (family.isMobEnabled()) {
5088       if (HFile.getFormatVersion(this.conf) < HFile.MIN_FORMAT_VERSION_WITH_TAGS) {
5089         throw new IOException("A minimum HFile version of "
5090             + HFile.MIN_FORMAT_VERSION_WITH_TAGS
5091             + " is required for MOB feature. Consider setting " + HFile.FORMAT_VERSION_KEY
5092             + " accordingly.");
5093       }
5094       return new HMobStore(this, family, this.conf);
5095     }
5096     return new HStore(this, family, this.conf);
5097   }

       /*
        * Illustrative configuration sketch for the MOB check above (a sketch, not from the
        * original source): MOB-enabled families need an HFile format version that supports
        * tags, so a client-side setup might do:
        *
        *   Configuration conf = HBaseConfiguration.create();
        *   // FORMAT_VERSION_KEY / MIN_FORMAT_VERSION_WITH_TAGS are the constants used above.
        *   conf.setInt(HFile.FORMAT_VERSION_KEY, HFile.MIN_FORMAT_VERSION_WITH_TAGS);
        */
5098 
5099   @Override
5100   public Store getStore(final byte[] column) {
5101     return this.stores.get(column);
5102   }
5103 
5104   /**
5105    * Return the HStore instance. Does not do any copy: as the number of stores is limited,
5106    * we iterate over them.
5107    */
5108   private Store getStore(Cell cell) {
5109     for (Map.Entry<byte[], Store> famStore : stores.entrySet()) {
5110       if (Bytes.equals(
5111           cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength(),
5112           famStore.getKey(), 0, famStore.getKey().length)) {
5113         return famStore.getValue();
5114       }
5115     }
5116 
5117     return null;
5118   }
5119 
5120   @Override
5121   public List<Store> getStores() {
5122     List<Store> list = new ArrayList<Store>(stores.size());
5123     list.addAll(stores.values());
5124     return list;
5125   }
5126 
5127   @Override
5128   public List<String> getStoreFileList(final byte [][] columns)
5129     throws IllegalArgumentException {
5130     List<String> storeFileNames = new ArrayList<String>();
5131     synchronized(closeLock) {
5132       for(byte[] column : columns) {
5133         Store store = this.stores.get(column);
5134         if (store == null) {
5135           throw new IllegalArgumentException("No column family: " +
5136               Bytes.toString(column) + " available");
5137         }
5138         Collection<StoreFile> storeFiles = store.getStorefiles();
5139         if (storeFiles == null) continue;
5140         for (StoreFile storeFile: storeFiles) {
5141           storeFileNames.add(storeFile.getPath().toString());
5142         }
5143 
5144         logRegionFiles();
5145       }
5146     }
5147     return storeFileNames;
5148   }
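
       /*
        * Illustrative usage sketch (the family name "cf" is a placeholder, not from the
        * original source):
        *
        *   byte[][] families = new byte[][] { Bytes.toBytes("cf") };
        *   // Throws IllegalArgumentException if a requested family does not exist in this region.
        *   List<String> paths = region.getStoreFileList(families);
        */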
5149 
5150   //////////////////////////////////////////////////////////////////////////////
5151   // Support code
5152   //////////////////////////////////////////////////////////////////////////////
5153 
5154   /** Make sure this is a valid row for the HRegion */
5155   void checkRow(final byte [] row, String op) throws IOException {
5156     if (!rowIsInRange(getRegionInfo(), row)) {
5157       throw new WrongRegionException("Requested row out of range for " +
5158           op + " on HRegion " + this + ", startKey='" +
5159           Bytes.toStringBinary(getRegionInfo().getStartKey()) + "', endKey='" +
5160           Bytes.toStringBinary(getRegionInfo().getEndKey()) + "', row='" +
5161           Bytes.toStringBinary(row) + "'");
5162     }
5163   }
5164 
5165 
5166   /**
5167    * Get an exclusive (write) lock on a given row.
5168    * @param row Which row to lock.
5169    * @return A locked RowLock. The lock is exclusive and already acquired.
5170    * @throws IOException
5171    */
5172   public RowLock getRowLock(byte[] row) throws IOException {
5173     return getRowLock(row, false);
5174   }
5175 
5176   /**
5177    *
5178    * Get a row lock for the specified row. All locks are reentrant.
5179    *
5180    * Before calling this function make sure that a region operation has already been
5181    * started (the calling thread has already acquired the region-close-guard lock).
5182    * @param row The row actions will be performed against
5183    * @param readLock whether a read (shared) or write (exclusive) lock is requested. True
5184    *                 indicates that a non-exclusive (read) lock is requested
5185    */
5186   @Override
5187   public RowLock getRowLock(byte[] row, boolean readLock) throws IOException {
5188     // Make sure the row is inside of this region before getting the lock for it.
5189     checkRow(row, "row lock");
5190     // create an object to use as a key in the row lock map
5191     HashedBytes rowKey = new HashedBytes(row);
5192 
5193     RowLockContext rowLockContext = null;
5194     RowLockImpl result = null;
5195     TraceScope traceScope = null;
5196 
5197     // If we're tracing start a span to show how long this took.
5198     if (Trace.isTracing()) {
5199       traceScope = Trace.startSpan("HRegion.getRowLock");
5200       traceScope.getSpan().addTimelineAnnotation("Getting a " + (readLock?"readLock":"writeLock"));
5201     }
5202 
5203     try {
5204       // Keep trying until we have a lock or error out.
5205       // TODO: do we need to add a time component here?
5206       while (result == null) {
5207 
5208         // Try adding a RowLockContext to the lockedRows.
5209         // If we can add it then there are no other transactions currently running.
5210         rowLockContext = new RowLockContext(rowKey);
5211         RowLockContext existingContext = lockedRows.putIfAbsent(rowKey, rowLockContext);
5212 
5213         // if there was a running transaction then there's already a context.
5214         if (existingContext != null) {
5215           rowLockContext = existingContext;
5216         }
5217 
5218         // Now try and get the lock.
5219         // This can fail (return null) if the context was concurrently cleaned up,
5220         // in which case we loop and retry with a fresh context.
5221         if (readLock) {
5222           result = rowLockContext.newReadLock();
5223         } else {
5224           result = rowLockContext.newWriteLock();
5225         }
5226       }
5227       if (!result.getLock().tryLock(this.rowLockWaitDuration, TimeUnit.MILLISECONDS)) {
5228         if (traceScope != null) {
5229           traceScope.getSpan().addTimelineAnnotation("Failed to get row lock");
5230         }
5231         result = null;
5232         // Clean up the counts just in case this was the thing keeping the context alive.
5233         rowLockContext.cleanUp();
5234         throw new IOException("Timed out waiting for lock for row: " + rowKey);
5235       }
5236       return result;
5237     } catch (InterruptedException ie) {
5238       LOG.warn("Thread interrupted waiting for lock on row: " + rowKey);
5239       InterruptedIOException iie = new InterruptedIOException();
5240       iie.initCause(ie);
5241       if (traceScope != null) {
5242         traceScope.getSpan().addTimelineAnnotation("Interrupted exception getting row lock");
5243       }
5244       Thread.currentThread().interrupt();
5245       throw iie;
5246     } finally {
5247       if (traceScope != null) {
5248         traceScope.close();
5249       }
5250     }
5251   }
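
       /*
        * Illustrative usage sketch for the row-lock contract documented above (a sketch; it
        * assumes the caller has a Region reference and, as required, wraps the work in a
        * region operation):
        *
        *   region.startRegionOperation();
        *   try {
        *     RowLock lock = region.getRowLock(row, false); // false => exclusive (write) lock
        *     try {
        *       // mutate the row while the lock is held
        *     } finally {
        *       lock.release();
        *     }
        *   } finally {
        *     region.closeRegionOperation();
        *   }
        */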
5252 
5253   @Override
5254   public void releaseRowLocks(List<RowLock> rowLocks) {
5255     if (rowLocks != null) {
5256       for (RowLock rowLock : rowLocks) {
5257         rowLock.release();
5258       }
5259       rowLocks.clear();
5260     }
5261   }
5262 
5263   @VisibleForTesting
5264   class RowLockContext {
5265     private final HashedBytes row;
5266     final ReadWriteLock readWriteLock = new ReentrantReadWriteLock(true);
5267     final AtomicBoolean usable = new AtomicBoolean(true);
5268     final AtomicInteger count = new AtomicInteger(0);
5269     final Object lock = new Object();
5270 
5271     RowLockContext(HashedBytes row) {
5272       this.row = row;
5273     }
5274 
5275     RowLockImpl newWriteLock() {
5276       Lock l = readWriteLock.writeLock();
5277       return getRowLock(l);
5278     }
5279     RowLockImpl newReadLock() {
5280       Lock l = readWriteLock.readLock();
5281       return getRowLock(l);
5282     }
5283 
5284     private RowLockImpl getRowLock(Lock l) {
5285       count.incrementAndGet();
5286       synchronized (lock) {
5287         if (usable.get()) {
5288           return new RowLockImpl(this, l);
5289         } else {
5290           return null;
5291         }
5292       }
5293     }
5294 
5295     void cleanUp() {
5296       long c = count.decrementAndGet();
5297       if (c <= 0) {
5298         synchronized (lock) {
5299           if (count.get() <= 0) {
5300             usable.set(false);
5301             RowLockContext removed = lockedRows.remove(row);
5302             assert removed == this: "we should never remove a different context";
5303           }
5304         }
5305       }
5306     }
5307 
5308     @Override
5309     public String toString() {
5310       return "RowLockContext{" +
5311           "row=" + row +
5312           ", readWriteLock=" + readWriteLock +
5313           ", count=" + count +
5314           '}';
5315     }
5316   }
5317 
5318   /**
5319    * Class used to represent a lock on a row.
5320    */
5321   public static class RowLockImpl implements RowLock {
5322     private final RowLockContext context;
5323     private final Lock lock;
5324 
5325     public RowLockImpl(RowLockContext context, Lock lock) {
5326       this.context = context;
5327       this.lock = lock;
5328     }
5329 
5330     public Lock getLock() {
5331       return lock;
5332     }
5333 
5334     @VisibleForTesting
5335     public RowLockContext getContext() {
5336       return context;
5337     }
5338 
5339     @Override
5340     public void release() {
5341       lock.unlock();
5342       context.cleanUp();
5343     }
5344 
5345     @Override
5346     public String toString() {
5347       return "RowLockImpl{" +
5348           "context=" + context +
5349           ", lock=" + lock +
5350           '}';
5351     }
5352   }
5353 
5354   /**
5355    * Determines whether multiple column families are present.
5356    * Precondition: familyPaths is not null
5357    *
5358    * @param familyPaths List of (column family, hfilePath)
5359    */
5360   private static boolean hasMultipleColumnFamilies(Collection<Pair<byte[], String>> familyPaths) {
5361     boolean multipleFamilies = false;
5362     byte[] family = null;
5363     for (Pair<byte[], String> pair : familyPaths) {
5364       byte[] fam = pair.getFirst();
5365       if (family == null) {
5366         family = fam;
5367       } else if (!Bytes.equals(family, fam)) {
5368         multipleFamilies = true;
5369         break;
5370       }
5371     }
5372     return multipleFamilies;
5373   }
5374 
5375   @Override
5376   public boolean bulkLoadHFiles(Collection<Pair<byte[], String>> familyPaths, boolean assignSeqId,
5377       BulkLoadListener bulkLoadListener) throws IOException {
5378     long seqId = -1;
5379     Map<byte[], List<Path>> storeFiles = new TreeMap<byte[], List<Path>>(Bytes.BYTES_COMPARATOR);
5380     Preconditions.checkNotNull(familyPaths);
5381     // we need writeLock for multi-family bulk load
5382     startBulkRegionOperation(hasMultipleColumnFamilies(familyPaths));
5383     try {
5384       this.writeRequestsCount.increment();
5385 
5386       // A split may have happened between when the split keys were gathered and when the
5387       // HRegion's write lock was taken. We need to validate that each HFile still fits inside
5388       // this region before attempting to bulk load any of them.
5389       List<IOException> ioes = new ArrayList<IOException>();
5390       List<Pair<byte[], String>> failures = new ArrayList<Pair<byte[], String>>();
5391       for (Pair<byte[], String> p : familyPaths) {
5392         byte[] familyName = p.getFirst();
5393         String path = p.getSecond();
5394 
5395         Store store = getStore(familyName);
5396         if (store == null) {
5397           IOException ioe = new org.apache.hadoop.hbase.DoNotRetryIOException(
5398               "No such column family " + Bytes.toStringBinary(familyName));
5399           ioes.add(ioe);
5400         } else {
5401           try {
5402             store.assertBulkLoadHFileOk(new Path(path));
5403           } catch (WrongRegionException wre) {
5404             // recoverable (file doesn't fit in region)
5405             failures.add(p);
5406           } catch (IOException ioe) {
5407             // unrecoverable (hdfs problem)
5408             ioes.add(ioe);
5409           }
5410         }
5411       }
5412 
5413       // validation failed because of some sort of IO problem.
5414       if (ioes.size() != 0) {
5415         IOException e = MultipleIOException.createIOException(ioes);
5416         LOG.error("There were one or more IO errors when checking if the bulk load is ok.", e);
5417         throw e;
5418       }
5419 
5420       // validation failed, bail out before doing anything permanent.
5421       if (failures.size() != 0) {
5422         StringBuilder list = new StringBuilder();
5423         for (Pair<byte[], String> p : failures) {
5424           list.append("\n").append(Bytes.toString(p.getFirst())).append(" : ")
5425               .append(p.getSecond());
5426         }
5427         // problem when validating
5428         LOG.warn("There was a recoverable bulk load failure likely due to a" +
5429             " split.  These (family, HFile) pairs were not loaded: " + list);
5430         return false;
5431       }
5432 
5433       // We need to assign a sequential ID that's in between two memstores in order to preserve
5434       // the guarantee that all the edits lower than the highest sequential ID from all the
5435       // HFiles are flushed on disk. See HBASE-10958.  The sequence id returned when we flush is
5436       // guaranteed to be one beyond the file made when we flushed (or if nothing to flush, it is
5437       // a sequence id that we can be sure is beyond the last hfile written).
5438       if (assignSeqId) {
5439         FlushResult fs = flushcache(true, false);
5440         if (fs.isFlushSucceeded()) {
5441           seqId = ((FlushResultImpl)fs).flushSequenceId;
5442         } else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) {
5443           seqId = ((FlushResultImpl)fs).flushSequenceId;
5444         } else {
5445           throw new IOException("Could not bulk load with an assigned sequential ID because the "+
5446             "flush didn't run. Reason for not flushing: " + ((FlushResultImpl)fs).failureReason);
5447         }
5448       }
5449 
5450       for (Pair<byte[], String> p : familyPaths) {
5451         byte[] familyName = p.getFirst();
5452         String path = p.getSecond();
5453         Store store = getStore(familyName);
5454         try {
5455           String finalPath = path;
5456           if (bulkLoadListener != null) {
5457             finalPath = bulkLoadListener.prepareBulkLoad(familyName, path);
5458           }
5459           Path committedStoreFile = store.bulkLoadHFile(finalPath, seqId);
5460 
5461           if (storeFiles.containsKey(familyName)) {
5462             storeFiles.get(familyName).add(committedStoreFile);
5463           } else {
5464             List<Path> storeFileNames = new ArrayList<Path>();
5465             storeFileNames.add(committedStoreFile);
5466             storeFiles.put(familyName, storeFileNames);
5467           }
5468           if (bulkLoadListener != null) {
5469             bulkLoadListener.doneBulkLoad(familyName, path);
5470           }
5471         } catch (IOException ioe) {
5472           // A failure here can cause an atomicity violation that we currently
5473           // cannot recover from since it is likely a failed HDFS operation.
5474 
5475           // TODO Need a better story for reverting partial failures due to HDFS.
5476           LOG.error("There was a partial failure due to IO when attempting to" +
5477               " load " + Bytes.toString(p.getFirst()) + " : " + p.getSecond(), ioe);
5478           if (bulkLoadListener != null) {
5479             try {
5480               bulkLoadListener.failedBulkLoad(familyName, path);
5481             } catch (Exception ex) {
5482               LOG.error("Error while calling failedBulkLoad for family " +
5483                   Bytes.toString(familyName) + " with path " + path, ex);
5484             }
5485           }
5486           throw ioe;
5487         }
5488       }
5489 
5490       return true;
5491     } finally {
5492       if (wal != null && !storeFiles.isEmpty()) {
5493         // write a bulk load event for whichever hfiles were actually loaded
5494         try {
5495           WALProtos.BulkLoadDescriptor loadDescriptor = ProtobufUtil.toBulkLoadDescriptor(
5496               this.getRegionInfo().getTable(),
5497               ByteStringer.wrap(this.getRegionInfo().getEncodedNameAsBytes()), storeFiles, seqId);
5498           WALUtil.writeBulkLoadMarkerAndSync(wal, this.htableDescriptor, getRegionInfo(),
5499               loadDescriptor, mvcc);
5500         } catch (IOException ioe) {
5501           if (this.rsServices != null) {
5502             // Have to abort the region server because some hfiles have been loaded but we
5503             // can't write the event into the WAL
5504             this.rsServices.abort("Failed to write bulk load event into WAL.", ioe);
5505           }
5506         }
5507       }
5508 
5509       closeBulkRegionOperation();
5510     }
5511   }
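
       /*
        * Illustrative usage sketch (family name and HFile path are placeholders, not values
        * from the original source):
        *
        *   List<Pair<byte[], String>> familyPaths = new ArrayList<Pair<byte[], String>>();
        *   familyPaths.add(new Pair<byte[], String>(Bytes.toBytes("cf"), "/staging/cf/hfile1"));
        *   // assignSeqId=true flushes first so the loaded files get a sequence id beyond the
        *   // memstore contents (see HBASE-10958 above); the listener may be null.
        *   boolean loaded = region.bulkLoadHFiles(familyPaths, true, null);
        */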
5512 
5513   @Override
5514   public boolean equals(Object o) {
5515     return o instanceof HRegion && Bytes.equals(getRegionInfo().getRegionName(),
5516                                                 ((HRegion) o).getRegionInfo().getRegionName());
5517   }
5518 
5519   @Override
5520   public int hashCode() {
5521     return Bytes.hashCode(getRegionInfo().getRegionName());
5522   }
5523 
5524   @Override
5525   public String toString() {
5526     return getRegionInfo().getRegionNameAsString();
5527   }
5528 
5529   /**
5530    * RegionScannerImpl is used to combine scanners from multiple Stores (aka column families).
5531    */
5532   class RegionScannerImpl implements RegionScanner, org.apache.hadoop.hbase.ipc.RpcCallback {
5533     // Package local for testability
5534     KeyValueHeap storeHeap = null;
5535     /** Heap of key-values that are not essential for the provided filters and are thus read
5536      * on demand, if on-demand column family loading is enabled.*/
5537     KeyValueHeap joinedHeap = null;
5538     /**
5539      * If the joined heap data gathering is interrupted due to scan limits, this will
5540      * contain the row for which we are populating the values.*/
5541     protected Cell joinedContinuationRow = null;
5542     private boolean filterClosed = false;
5543 
5544     protected final int isScan;
5545     protected final byte[] stopRow;
5546     protected final HRegion region;
5547     protected final CellComparator comparator;
5548     protected boolean copyCellsFromSharedMem = false;
5549 
5550     private final long readPt;
5551     private final long maxResultSize;
5552     private final ScannerContext defaultScannerContext;
5553     private final FilterWrapper filter;
5554 
5555     @Override
5556     public HRegionInfo getRegionInfo() {
5557       return region.getRegionInfo();
5558     }
5559 
5560     public void setCopyCellsFromSharedMem(boolean copyCells) {
5561       this.copyCellsFromSharedMem = copyCells;
5562     }
5563 
5564     RegionScannerImpl(Scan scan, List<KeyValueScanner> additionalScanners, HRegion region,
5565         boolean copyCellsFromSharedMem)
5566         throws IOException {
5567       this.region = region;
5568       this.maxResultSize = scan.getMaxResultSize();
5569       if (scan.hasFilter()) {
5570         this.filter = new FilterWrapper(scan.getFilter());
5571       } else {
5572         this.filter = null;
5573       }
5574       this.comparator = region.getCellCompartor();
5575       /**
5576        * By default, calls to next/nextRaw must enforce the batch limit. Thus, construct a default
5577        * scanner context that can be used to enforce the batch limit in the event that a
5578        * ScannerContext is not specified during an invocation of next/nextRaw
5579        */
5580       defaultScannerContext = ScannerContext.newBuilder()
5581           .setBatchLimit(scan.getBatch()).build();
5582 
5583       if (Bytes.equals(scan.getStopRow(), HConstants.EMPTY_END_ROW) && !scan.isGetScan()) {
5584         this.stopRow = null;
5585       } else {
5586         this.stopRow = scan.getStopRow();
5587       }
5588       // If we are doing a get, we want to be [startRow,endRow]. Normally
5589       // it is [startRow,endRow) and if startRow=endRow we get nothing.
5590       this.isScan = scan.isGetScan() ? 1 : 0;
5591 
5592       // synchronize on scannerReadPoints so that nobody calculates
5593       // getSmallestReadPoint, before scannerReadPoints is updated.
5594       IsolationLevel isolationLevel = scan.getIsolationLevel();
5595       synchronized(scannerReadPoints) {
5596         this.readPt = getReadpoint(isolationLevel);
5597         scannerReadPoints.put(this, this.readPt);
5598       }
5599 
5600       // Here we separate all scanners into two lists - scanner that provide data required
5601       // by the filter to operate (scanners list) and all others (joinedScanners list).
5602       List<KeyValueScanner> scanners = new ArrayList<KeyValueScanner>(scan.getFamilyMap().size());
5603       List<KeyValueScanner> joinedScanners
5604         = new ArrayList<KeyValueScanner>(scan.getFamilyMap().size());
5605       if (additionalScanners != null) {
5606         scanners.addAll(additionalScanners);
5607       }
5608 
5609       for (Map.Entry<byte[], NavigableSet<byte[]>> entry : scan.getFamilyMap().entrySet()) {
5610         Store store = stores.get(entry.getKey());
5611         KeyValueScanner scanner;
5612         try {
5613           scanner = store.getScanner(scan, entry.getValue(), this.readPt);
5614         } catch (FileNotFoundException e) {
5615           throw handleFileNotFound(e);
5616         }
5617         if (this.filter == null || !scan.doLoadColumnFamiliesOnDemand()
5618           || this.filter.isFamilyEssential(entry.getKey())) {
5619           scanners.add(scanner);
5620         } else {
5621           joinedScanners.add(scanner);
5622         }
5623       }
5624       this.copyCellsFromSharedMem = copyCellsFromSharedMem;
5625       initializeKVHeap(scanners, joinedScanners, region);
5626     }
5627 
5628     protected void initializeKVHeap(List<KeyValueScanner> scanners,
5629         List<KeyValueScanner> joinedScanners, HRegion region)
5630         throws IOException {
5631       this.storeHeap = new KeyValueHeap(scanners, comparator);
5632       if (!joinedScanners.isEmpty()) {
5633         this.joinedHeap = new KeyValueHeap(joinedScanners, comparator);
5634       }
5635     }
5636 
5637     @Override
5638     public long getMaxResultSize() {
5639       return maxResultSize;
5640     }
5641 
5642     @Override
5643     public long getMvccReadPoint() {
5644       return this.readPt;
5645     }
5646 
5647     @Override
5648     public int getBatch() {
5649       return this.defaultScannerContext.getBatchLimit();
5650     }
5651 
5652     /**
5653      * Reset both the filter and the old filter.
5654      * Reset the filter, if any.
5655      * @throws IOException in case a filter raises an I/O exception.
5656      */
5657     protected void resetFilters() throws IOException {
5658       if (filter != null) {
5659         filter.reset();
5660       }
5661     }
5662 
5663     @Override
5664     public boolean next(List<Cell> outResults)
5665         throws IOException {
5666       // apply the batching limit by default
5667       return next(outResults, defaultScannerContext);
5668     }
5669 
5670     @Override
5671     public synchronized boolean next(List<Cell> outResults, ScannerContext scannerContext)
5672     throws IOException {
5673       if (this.filterClosed) {
5674         throw new UnknownScannerException("Scanner was closed (timed out?) " +
5675             "after we renewed it. Could be caused by a very slow scanner " +
5676             "or a lengthy garbage collection");
5677       }
5678       startRegionOperation(Operation.SCAN);
5679       readRequestsCount.increment();
5680       try {
5681         return nextRaw(outResults, scannerContext);
5682       } finally {
5683         closeRegionOperation(Operation.SCAN);
5684       }
5685     }
5686 
5687     @Override
5688     public boolean nextRaw(List<Cell> outResults) throws IOException {
5689       // Use the RegionScanner's context by default
5690       return nextRaw(outResults, defaultScannerContext);
5691     }
5692 
5693     @Override
5694     public boolean nextRaw(List<Cell> outResults, ScannerContext scannerContext)
5695         throws IOException {
5696       if (storeHeap == null) {
5697         // scanner is closed
5698         throw new UnknownScannerException("Scanner was closed");
5699       }
5700       boolean moreValues = false;
5701       try {
5702         if (outResults.isEmpty()) {
5703           // Usually outResults is empty. This is true when next is called
5704           // to handle scan or get operation.
5705           moreValues = nextInternal(outResults, scannerContext);
5706         } else {
5707           List<Cell> tmpList = new ArrayList<Cell>();
5708           moreValues = nextInternal(tmpList, scannerContext);
5709           outResults.addAll(tmpList);
5710         }
5711 
5712         // If the size limit was reached it means a partial Result is being
5713         // returned. Returning a partial Result means that we should not reset
5714         // the filters; filters should only be reset in between rows, so the
5715         // reset below is skipped whenever the scanner context reports that a
5716         // partial result has been formed.
5717         if (!scannerContext.partialResultFormed()) resetFilters();
5718 
5719         if (isFilterDoneInternal()) {
5720           moreValues = false;
5721         }
5722 
5723         // If copyCellsFromSharedMem = true, then we need to copy the cells. Otherwise
5724         // it is a call coming from the RSRpcServices.scan().
5725         if (copyCellsFromSharedMem && !outResults.isEmpty()) {
5726           // Do the copy of the results here.
5727           ListIterator<Cell> listItr = outResults.listIterator();
5728           Cell cell = null;
5729           while (listItr.hasNext()) {
5730             cell = listItr.next();
5731             if (cell instanceof ShareableMemory) {
5732               listItr.set(((ShareableMemory) cell).cloneToCell());
5733             }
5734           }
5735         }
5736       } finally {
5737         if (copyCellsFromSharedMem) {
5738           // In case of copyCellsFromSharedMem==true (where the CPs wrap a scanner) we return
5739           // the blocks then and there (for wrapped CPs)
5740           this.shipped();
5741         }
5742       }
5743       return moreValues;
5744     }
5745 
5746     /**
5747      * @return true if more cells exist after this batch, false if scanner is done
5748      */
5749     private boolean populateFromJoinedHeap(List<Cell> results, ScannerContext scannerContext)
5750             throws IOException {
5751       assert joinedContinuationRow != null;
5752       boolean moreValues = populateResult(results, this.joinedHeap, scannerContext,
5753           joinedContinuationRow);
5754 
5755       if (!scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
5756         // We are done with this row, reset the continuation.
5757         joinedContinuationRow = null;
5758       }
5759       // As the data is obtained from two independent heaps, we need to
5760       // ensure that result list is sorted, because Result relies on that.
5761       Collections.sort(results, comparator);
5762       return moreValues;
5763     }
5764 
5765     /**
5766      * Fetches records for currentRow into the results list, until the next row is reached,
5767      * the batchLimit (if not -1) is reached, or the remainingResultSize (if not -1) is reached.
5768      * @param heap KeyValueHeap to fetch data from. Must be positioned on the correct row beforehand.
5769      * @param scannerContext
5770      * @param currentRowCell
5771      * @return state of last call to {@link KeyValueHeap#next()}
5772      */
5773     private boolean populateResult(List<Cell> results, KeyValueHeap heap,
5774         ScannerContext scannerContext, Cell currentRowCell) throws IOException {
5775       Cell nextKv;
5776       boolean moreCellsInRow = false;
5777       boolean tmpKeepProgress = scannerContext.getKeepProgress();
5778       // Scanning between column families and thus the scope is between cells
5779       LimitScope limitScope = LimitScope.BETWEEN_CELLS;
5780       try {
5781         do {
5782           // We want to maintain any progress that is made towards the limits while scanning across
5783           // different column families. To do this, we toggle the keep progress flag on during calls
5784           // to the StoreScanner to ensure that any progress made thus far is not wiped away.
5785           scannerContext.setKeepProgress(true);
5786           heap.next(results, scannerContext);
5787           scannerContext.setKeepProgress(tmpKeepProgress);
5788 
5789           nextKv = heap.peek();
5790           moreCellsInRow = moreCellsInRow(nextKv, currentRowCell);
5791           if (!moreCellsInRow) incrementCountOfRowsScannedMetric(scannerContext);
5792           if (scannerContext.checkBatchLimit(limitScope)) {
5793             return scannerContext.setScannerState(NextState.BATCH_LIMIT_REACHED).hasMoreValues();
5794           } else if (scannerContext.checkSizeLimit(limitScope)) {
5795             ScannerContext.NextState state =
5796               moreCellsInRow? NextState.SIZE_LIMIT_REACHED_MID_ROW: NextState.SIZE_LIMIT_REACHED;
5797             return scannerContext.setScannerState(state).hasMoreValues();
5798           } else if (scannerContext.checkTimeLimit(limitScope)) {
5799             ScannerContext.NextState state =
5800               moreCellsInRow? NextState.TIME_LIMIT_REACHED_MID_ROW: NextState.TIME_LIMIT_REACHED;
5801             return scannerContext.setScannerState(state).hasMoreValues();
5802           }
5803         } while (moreCellsInRow);
5804       } catch (FileNotFoundException e) {
5805         throw handleFileNotFound(e);
5806       }
5807       return nextKv != null;
5808     }
5809 
5810     /**
5811      * Based on the nextKv in the heap, and the current row, decide whether or not there are more
5812      * cells to be read in the heap. If the row of the nextKv in the heap matches the current row
5813      * then there are more cells to be read in the row.
5814      * @param nextKv
5815      * @param currentRowCell
5816      * @return true When there are more cells in the row to be read
5817      */
5818     private boolean moreCellsInRow(final Cell nextKv, Cell currentRowCell) {
5819       return nextKv != null && CellUtil.matchingRow(nextKv, currentRowCell);
5820     }
5821 
5822     /*
5823      * @return True if a filter has ruled that the scanner is done scanning.
5824      */
5825     @Override
5826     public synchronized boolean isFilterDone() throws IOException {
5827       return isFilterDoneInternal();
5828     }
5829 
5830     private boolean isFilterDoneInternal() throws IOException {
5831       return this.filter != null && this.filter.filterAllRemaining();
5832     }
5833 
5834     private boolean nextInternal(List<Cell> results, ScannerContext scannerContext)
5835         throws IOException {
5836       if (!results.isEmpty()) {
5837         throw new IllegalArgumentException("First parameter should be an empty list");
5838       }
5839       if (scannerContext == null) {
5840         throw new IllegalArgumentException("Scanner context cannot be null");
5841       }
5842       RpcCallContext rpcCall = RpcServer.getCurrentCall();
5843 
5844       // Save the initial progress from the Scanner context in these local variables. The progress
5845       // may need to be reset a few times if rows are being filtered out so we save the initial
5846       // progress.
5847       int initialBatchProgress = scannerContext.getBatchProgress();
5848       long initialSizeProgress = scannerContext.getSizeProgress();
5849       long initialTimeProgress = scannerContext.getTimeProgress();
5850 
5851       // The loop here is used only when at some point during the next we determine
5852       // that due to effects of filters or otherwise, we have an empty row in the result.
5853       // Then we loop and try again. Otherwise, we must get out on the first iteration via return,
5854       // "true" if there's more data to read, "false" if there isn't (storeHeap is at a stop row,
5855       // and joinedHeap has no more data to read for the last row (if set, joinedContinuationRow)).
5856       while (true) {
5857         // Starting to scan a new row. Reset the scanner progress according to whether or not
5858         // progress should be kept.
5859         if (scannerContext.getKeepProgress()) {
5860           // Progress should be kept. Reset to initial values seen at start of method invocation.
5861           scannerContext.setProgress(initialBatchProgress, initialSizeProgress,
5862             initialTimeProgress);
5863         } else {
5864           scannerContext.clearProgress();
5865         }
5866 
5867         if (rpcCall != null) {
5868           // If a user specifies a too-restrictive or too-slow scanner, the
5869           // client might time out and disconnect while the server side
5870           // is still processing the request. We should abort aggressively
5871           // in that case.
5872           long afterTime = rpcCall.disconnectSince();
5873           if (afterTime >= 0) {
5874             throw new CallerDisconnectedException(
5875                 "Aborting on region " + getRegionInfo().getRegionNameAsString() + ", call " +
5876                     this + " after " + afterTime + " ms, since " +
5877                     "caller disconnected");
5878           }
5879         }
5880 
5881         // Let's see what we have in the storeHeap.
5882         Cell current = this.storeHeap.peek();
5883 
5884         boolean stopRow = isStopRow(current);
5885         // When hasFilterRow is true it means that all the cells for a particular row must be
5886         // read before a filtering decision can be made. This means that filters for which
5887         // hasFilterRow is true run the risk of encountering out-of-memory errors when they are
5888         // applied to a table that has very large rows.
5889         boolean hasFilterRow = this.filter != null && this.filter.hasFilterRow();
5890 
5891         // If filter#hasFilterRow is true, partial results are not allowed since allowing them
5892         // would prevent the filters from being evaluated. Thus, if it is true, change the
5893         // scope of any limits that could potentially create partial results to
5894         // LimitScope.BETWEEN_ROWS so that those limits are not reached mid-row
5895         if (hasFilterRow) {
5896           if (LOG.isTraceEnabled()) {
5897             LOG.trace("filter#hasFilterRow is true which prevents partial results from being "
5898                 + "formed. Changing scope of limits that may create partials");
5899           }
5900           scannerContext.setSizeLimitScope(LimitScope.BETWEEN_ROWS);
5901           scannerContext.setTimeLimitScope(LimitScope.BETWEEN_ROWS);
5902         }
5903 
5904         // Check if we were getting data from the joinedHeap and hit the limit.
5905         // If not, then it's main path - getting results from storeHeap.
5906         if (joinedContinuationRow == null) {
5907           // First, check if we are at a stop row. If so, there are no more results.
5908           if (stopRow) {
5909             if (hasFilterRow) {
5910               filter.filterRowCells(results);
5911             }
5912             return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
5913           }
5914 
5915           // Check if rowkey filter wants to exclude this row. If so, loop to next.
5916           // Technically, if we hit limits before on this row, we don't need this call.
5917           if (filterRowKey(current)) {
5918             incrementCountOfRowsFilteredMetric(scannerContext);
5919             // Typically the count of rows scanned is incremented inside #populateResult. However,
5920             // here we are filtering a row based purely on its row key, preventing us from calling
5921             // #populateResult. Thus, perform the necessary increment here to rows scanned metric
5922             incrementCountOfRowsScannedMetric(scannerContext);
5923             boolean moreRows = nextRow(scannerContext, current);
5924             if (!moreRows) {
5925               return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
5926             }
5927             results.clear();
5928             continue;
5929           }
5930 
5931           // Ok, we are good, let's try to get some results from the main heap.
5932           populateResult(results, this.storeHeap, scannerContext, current);
5933 
5934           if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
5935             if (hasFilterRow) {
5936               throw new IncompatibleFilterException(
5937                   "Filter whose hasFilterRow() returns true is incompatible with scans that must "
5938                       + "stop mid-row because of a limit. ScannerContext:" + scannerContext);
5939             }
5940             return true;
5941           }
5942 
5943           Cell nextKv = this.storeHeap.peek();
5944           stopRow = nextKv == null || isStopRow(nextKv);
5945           // Save whether the row was empty before filters were applied to it.
5946           final boolean isEmptyRow = results.isEmpty();
5947 
5948           // We have the part of the row necessary for filtering (all of it, usually).
5949           // First filter with the filterRow(List).
5950           FilterWrapper.FilterRowRetCode ret = FilterWrapper.FilterRowRetCode.NOT_CALLED;
5951           if (hasFilterRow) {
5952             ret = filter.filterRowCellsWithRet(results);
5953 
5954             // We don't know how the results have changed after being filtered. Must set progress
5955             // according to contents of results now. However, a change in the results should not
5956             // affect the time progress. Thus preserve whatever time progress has been made
5957             long timeProgress = scannerContext.getTimeProgress();
5958             if (scannerContext.getKeepProgress()) {
5959               scannerContext.setProgress(initialBatchProgress, initialSizeProgress,
5960                 initialTimeProgress);
5961             } else {
5962               scannerContext.clearProgress();
5963             }
5964             scannerContext.setTimeProgress(timeProgress);
5965             scannerContext.incrementBatchProgress(results.size());
5966             for (Cell cell : results) {
5967               scannerContext.incrementSizeProgress(CellUtil.estimatedHeapSizeOf(cell));
5968             }
5969           }
5970 
5971           if (isEmptyRow || ret == FilterWrapper.FilterRowRetCode.EXCLUDE || filterRow()) {
5972             incrementCountOfRowsFilteredMetric(scannerContext);
5973             results.clear();
5974             boolean moreRows = nextRow(scannerContext, current);
5975             if (!moreRows) {
5976               return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
5977             }
5978 
5979             // This row was totally filtered out, if this is NOT the last row,
5980             // we should continue on. Otherwise, nothing else to do.
5981             if (!stopRow) continue;
5982             return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
5983           }
5984 
5985           // Ok, we are done with storeHeap for this row.
5986           // Now we may need to fetch additional, non-essential data into row.
5987           // These values are not needed for filter to work, so we postpone their
5988           // fetch to (possibly) reduce amount of data loads from disk.
5989           if (this.joinedHeap != null) {
5990             boolean mayHaveData = joinedHeapMayHaveData(current);
5991             if (mayHaveData) {
5992               joinedContinuationRow = current;
5993               populateFromJoinedHeap(results, scannerContext);
5994 
5995               if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
5996                 return true;
5997               }
5998             }
5999           }
6000         } else {
6001           // Populating from the joined heap was stopped by limits, populate some more.
6002           populateFromJoinedHeap(results, scannerContext);
6003           if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
6004             return true;
6005           }
6006         }
6007         // We may have just called populateFromJoinedHeap and hit the limits. If that is
6008         // the case, we need to call it again on the next next() invocation.
6009         if (joinedContinuationRow != null) {
6010           return scannerContext.setScannerState(NextState.MORE_VALUES).hasMoreValues();
6011         }
6012 
6013         // Finally, we are done with both joinedHeap and storeHeap.
6014         // Double check to prevent empty rows from appearing in result. It could be
6015         // the case when SingleColumnValueExcludeFilter is used.
6016         if (results.isEmpty()) {
6017           incrementCountOfRowsFilteredMetric(scannerContext);
6018           boolean moreRows = nextRow(scannerContext, current);
6019           if (!moreRows) {
6020             return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
6021           }
6022           if (!stopRow) continue;
6023         }
6024 
6025         if (stopRow) {
6026           return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
6027         } else {
6028           return scannerContext.setScannerState(NextState.MORE_VALUES).hasMoreValues();
6029         }
6030       }
6031     }
6032 
6033     protected void incrementCountOfRowsFilteredMetric(ScannerContext scannerContext) {
6034       filteredReadRequestsCount.increment();
6035 
6036       if (scannerContext == null || !scannerContext.isTrackingMetrics()) return;
6037 
6038       scannerContext.getMetrics().countOfRowsFiltered.incrementAndGet();
6039     }
6040 
6041     protected void incrementCountOfRowsScannedMetric(ScannerContext scannerContext) {
6042       if (scannerContext == null || !scannerContext.isTrackingMetrics()) return;
6043 
6044       scannerContext.getMetrics().countOfRowsScanned.incrementAndGet();
6045     }
6046 
6047     /**
6048      * @param currentRowCell
6049      * @return true when the joined heap may have data for the current row
6050      * @throws IOException
6051      */
6052     private boolean joinedHeapMayHaveData(Cell currentRowCell)
6053         throws IOException {
6054       Cell nextJoinedKv = joinedHeap.peek();
6055       boolean matchCurrentRow =
6056           nextJoinedKv != null && CellUtil.matchingRow(nextJoinedKv, currentRowCell);
6057       boolean matchAfterSeek = false;
6058 
6059       // If the next value in the joined heap does not match the current row, try to seek to the
6060       // correct row
6061       if (!matchCurrentRow) {
6062         Cell firstOnCurrentRow = CellUtil.createFirstOnRow(currentRowCell);
6063         boolean seekSuccessful = this.joinedHeap.requestSeek(firstOnCurrentRow, true, true);
6064         matchAfterSeek =
6065             seekSuccessful && joinedHeap.peek() != null
6066                 && CellUtil.matchingRow(joinedHeap.peek(), currentRowCell);
6067       }
6068 
6069       return matchCurrentRow || matchAfterSeek;
6070     }
6071 
6072     /**
6073      * This function maintains backward compatibility for 0.94 filters. HBASE-6429 combines
6074      * both the filterRow() and filterRow({@code List<KeyValue> kvs}) functions. Code written
6075      * for 0.94 or older may not implement hasFilterRow() as HBASE-6429 expects, because in
6076      * 0.94 hasFilterRow() only returns true when filterRow({@code List<KeyValue> kvs}) is
6077      * overridden, not filterRow(). Without this helper, such a filterRow() would be skipped.
6078      */
6079     private boolean filterRow() throws IOException {
6080       // when hasFilterRow returns true, filter.filterRow() will be called automatically inside
6081       // filterRowCells(List<Cell> kvs) so we skip that scenario here.
6082       return filter != null && (!filter.hasFilterRow())
6083           && filter.filterRow();
6084     }
6085 
6086     private boolean filterRowKey(Cell current) throws IOException {
6087       return filter != null && filter.filterRowKey(current);
6088     }
6089 
6090     protected boolean nextRow(ScannerContext scannerContext, Cell curRowCell) throws IOException {
6091       assert this.joinedContinuationRow == null: "Trying to go to next row during joinedHeap read.";
6092       Cell next;
6093       while ((next = this.storeHeap.peek()) != null &&
6094              CellUtil.matchingRow(next, curRowCell)) {
6095         this.storeHeap.next(MOCKED_LIST);
6096       }
6097       resetFilters();
6098 
6099       // Calling the hook in CP which allows it to do a fast forward
6100       return this.region.getCoprocessorHost() == null
6101           || this.region.getCoprocessorHost()
6102               .postScannerFilterRow(this, curRowCell);
6103     }
6104 
6105     protected boolean isStopRow(Cell currentRowCell) {
6106       return currentRowCell == null
6107           || (stopRow != null && comparator.compareRows(currentRowCell, stopRow, 0, stopRow.length) >= isScan);
6108     }
6109 
6110     @Override
6111     public synchronized void close() {
6112       if (storeHeap != null) {
6113         storeHeap.close();
6114         storeHeap = null;
6115       }
6116       if (joinedHeap != null) {
6117         joinedHeap.close();
6118         joinedHeap = null;
6119       }
6120       // no need to synchronize here.
6121       scannerReadPoints.remove(this);
6122       this.filterClosed = true;
6123     }
6124 
6125     KeyValueHeap getStoreHeapForTesting() {
6126       return storeHeap;
6127     }
6128 
6129     @Override
6130     public synchronized boolean reseek(byte[] row) throws IOException {
6131       if (row == null) {
6132         throw new IllegalArgumentException("Row cannot be null.");
6133       }
6134       boolean result = false;
6135       startRegionOperation();
6136       KeyValue kv = KeyValueUtil.createFirstOnRow(row);
6137       try {
6138         // use request seek to make use of the lazy seek option. See HBASE-5520
6139         result = this.storeHeap.requestSeek(kv, true, true);
6140         if (this.joinedHeap != null) {
6141           result = this.joinedHeap.requestSeek(kv, true, true) || result;
6142         }
6143       } catch (FileNotFoundException e) {
6144         throw handleFileNotFound(e);
6145       } finally {
6146         closeRegionOperation();
6147       }
6148       return result;
6149     }
6150 
6151     private IOException handleFileNotFound(FileNotFoundException fnfe) throws IOException {
6152       // Try to refresh the store files; if that fails, shut down the RS.
6153       // TODO: add support for abort() of a single region and trigger reassignment.
6154       try {
6155         region.refreshStoreFiles(true);
6156         return new IOException("unable to read store file");
6157       } catch (IOException e) {
6158         String msg = "a store file got lost: " + fnfe.getMessage();
6159         LOG.error("unable to refresh store files", e);
6160         abortRegionServer(msg);
6161         return new NotServingRegionException(
6162           getRegionInfo().getRegionNameAsString() + " is closing");
6163       }
6164     }
6165 
6166     private void abortRegionServer(String msg) throws IOException {
6167       if (rsServices instanceof HRegionServer) {
6168         ((HRegionServer)rsServices).abort(msg);
6169       }
6170       throw new UnsupportedOperationException("not able to abort RS after: " + msg);
6171     }
6172 
6173     @Override
6174     public void shipped() throws IOException {
6175       if (storeHeap != null) {
6176         storeHeap.shipped();
6177       }
6178       if (joinedHeap != null) {
6179         joinedHeap.shipped();
6180       }
6181     }
6182 
6183     @Override
6184     public void run() throws IOException {
6185       // This is the RPC callback method executed. We do the close of the scanner in this
6186       // callback.
6187       this.close();
6188     }
6189   }
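
       /*
        * Illustrative scan sketch against the RegionScanner contract implemented above (a
        * sketch; it assumes the scanner is obtained via Region#getScanner(Scan) and closed by
        * the caller):
        *
        *   RegionScanner scanner = region.getScanner(new Scan());
        *   try {
        *     List<Cell> cells = new ArrayList<Cell>();
        *     boolean more;
        *     do {
        *       cells.clear();
        *       more = scanner.next(cells); // applies the default batch limit, see next() above
        *       // process the cells returned for this row/batch
        *     } while (more);
        *   } finally {
        *     scanner.close();
        *   }
        */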
6190 
6191   // Utility methods
6192   /**
6193    * A utility method to create new instances of HRegion based on the
6194    * {@link HConstants#REGION_IMPL} configuration property.
6195    * @param tableDir qualified path of directory where region should be located,
6196    * usually the table directory.
6197    * @param wal The WAL is the outbound log for any updates to the HRegion.
6198    * The wal file is a logfile from the previous execution that's
6199    * custom-computed for this HRegion. The HRegionServer computes and sorts the
6200    * appropriate wal info for this HRegion. If there is a previous file
6201    * (implying that the HRegion has been written-to before), then read it from
6202    * the supplied path.
6203    * @param fs is the filesystem.
6204    * @param conf is global configuration settings.
6205    * @param regionInfo - HRegionInfo that describes the region to be
6206    * instantiated
6207    * @param htd the table descriptor
6208    * @return the new instance
6209    */
6210   static HRegion newHRegion(Path tableDir, WAL wal, FileSystem fs,
6211       Configuration conf, HRegionInfo regionInfo, final HTableDescriptor htd,
6212       RegionServerServices rsServices) {
6213     try {
6214       @SuppressWarnings("unchecked")
6215       Class<? extends HRegion> regionClass =
6216           (Class<? extends HRegion>) conf.getClass(HConstants.REGION_IMPL, HRegion.class);
6217 
6218       Constructor<? extends HRegion> c =
6219           regionClass.getConstructor(Path.class, WAL.class, FileSystem.class,
6220               Configuration.class, HRegionInfo.class, HTableDescriptor.class,
6221               RegionServerServices.class);
6222 
6223       return c.newInstance(tableDir, wal, fs, conf, regionInfo, htd, rsServices);
6224     } catch (Throwable e) {
6225       // todo: what should I throw here?
6226       throw new IllegalStateException("Could not instantiate a region instance.", e);
6227     }
6228   }
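
       /*
        * Illustrative configuration sketch for the REGION_IMPL hook used above (a sketch;
        * MyCustomRegion is a hypothetical subclass that must expose the seven-argument
        * constructor reflected on above):
        *
        *   Configuration conf = HBaseConfiguration.create();
        *   conf.setClass(HConstants.REGION_IMPL, MyCustomRegion.class, HRegion.class);
        *   // newHRegion(...) will then reflectively instantiate MyCustomRegion instead of HRegion.
        */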
6229 
6230   /**
6231    * Convenience method creating new HRegions. Used by createTable.
6232    *
6233    * @param info Info for region to create.
6234    * @param rootDir Root directory for HBase instance
6235    * @param wal shared WAL
6236    * @param initialize - true to initialize the region
6237    * @return new HRegion
6238    * @throws IOException
6239    */
6240   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
6241         final Configuration conf, final HTableDescriptor hTableDescriptor,
6242         final WAL wal, final boolean initialize)
6243   throws IOException {
6244     LOG.info("creating HRegion " + info.getTable().getNameAsString()
6245         + " HTD == " + hTableDescriptor + " RootDir = " + rootDir +
6246         " Table name == " + info.getTable().getNameAsString());
6247     FileSystem fs = FileSystem.get(conf);
6248     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
6249     HRegionFileSystem.createRegionOnFileSystem(conf, fs, tableDir, info);
6250     HRegion region = HRegion.newHRegion(tableDir, wal, fs, conf, info, hTableDescriptor, null);
6251     if (initialize) region.initialize(null);
6252     return region;
6253   }
6254 
6255   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
6256                                       final Configuration conf,
6257                                       final HTableDescriptor hTableDescriptor,
6258                                       final WAL wal)
6259     throws IOException {
6260     return createHRegion(info, rootDir, conf, hTableDescriptor, wal, true);
6261   }
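
       /*
        * Illustrative usage sketch (htd, rootDir, conf and wal are assumed to be in scope;
        * they are placeholders, not values from the original source):
        *
        *   HRegionInfo info = new HRegionInfo(htd.getTableName());
        *   // Creates the region directory on the filesystem and initializes the region.
        *   HRegion region = HRegion.createHRegion(info, rootDir, conf, htd, wal);
        */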
6262 
6263 
6264   /**
6265    * Open a Region.
6266    * @param info Info for region to be opened.
6267    * @param wal WAL for region to use. This method will call
6268    * WAL#setSequenceNumber(long) passing the result of the call to
6269    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6270    * up.  The HRegionServer does this every time it opens a new region.
6271    * @return new HRegion
6272    *
6273    * @throws IOException
6274    */
6275   public static HRegion openHRegion(final HRegionInfo info,
6276       final HTableDescriptor htd, final WAL wal,
6277       final Configuration conf)
6278   throws IOException {
6279     return openHRegion(info, htd, wal, conf, null, null);
6280   }
6281 
6282   /**
6283    * Open a Region.
6284    * @param info Info for region to be opened
6285    * @param htd the table descriptor
6286    * @param wal WAL for region to use. This method will call
6287    * WAL#setSequenceNumber(long) passing the result of the call to
6288    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6289    * up.  The HRegionServer does this every time it opens a new region.
6290    * @param conf The Configuration object to use.
6291    * @param rsServices An interface we can request flushes against.
6292    * @param reporter An interface we can report progress against.
6293    * @return new HRegion
6294    *
6295    * @throws IOException
6296    */
6297   public static HRegion openHRegion(final HRegionInfo info,
6298     final HTableDescriptor htd, final WAL wal, final Configuration conf,
6299     final RegionServerServices rsServices,
6300     final CancelableProgressable reporter)
6301   throws IOException {
6302     return openHRegion(FSUtils.getRootDir(conf), info, htd, wal, conf, rsServices, reporter);
6303   }
6304 
6305   /**
6306    * Open a Region.
6307    * @param rootDir Root directory for HBase instance
6308    * @param info Info for region to be opened.
6309    * @param htd the table descriptor
6310    * @param wal WAL for region to use. This method will call
6311    * WAL#setSequenceNumber(long) passing the result of the call to
6312    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6313    * up.  HRegionServer does this every time it opens a new region.
6314    * @param conf The Configuration object to use.
6315    * @return new HRegion
6316    * @throws IOException
6317    */
6318   public static HRegion openHRegion(Path rootDir, final HRegionInfo info,
6319       final HTableDescriptor htd, final WAL wal, final Configuration conf)
6320   throws IOException {
6321     return openHRegion(rootDir, info, htd, wal, conf, null, null);
6322   }
6323 
6324   /**
6325    * Open a Region.
6326    * @param rootDir Root directory for HBase instance
6327    * @param info Info for region to be opened.
6328    * @param htd the table descriptor
6329    * @param wal WAL for region to use. This method will call
6330    * WAL#setSequenceNumber(long) passing the result of the call to
6331    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6332    * up.  HRegionServer does this every time it opens a new region.
6333    * @param conf The Configuration object to use.
6334    * @param rsServices An interface we can request flushes against.
6335    * @param reporter An interface we can report progress against.
6336    * @return new HRegion
6337    * @throws IOException
6338    */
6339   public static HRegion openHRegion(final Path rootDir, final HRegionInfo info,
6340       final HTableDescriptor htd, final WAL wal, final Configuration conf,
6341       final RegionServerServices rsServices,
6342       final CancelableProgressable reporter)
6343   throws IOException {
6344     FileSystem fs = null;
6345     if (rsServices != null) {
6346       fs = rsServices.getFileSystem();
6347     }
6348     if (fs == null) {
6349       fs = FileSystem.get(conf);
6350     }
6351     return openHRegion(conf, fs, rootDir, info, htd, wal, rsServices, reporter);
6352   }
6353 
6354   /**
6355    * Open a Region.
6356    * @param conf The Configuration object to use.
6357    * @param fs Filesystem to use
6358    * @param rootDir Root directory for HBase instance
6359    * @param info Info for region to be opened.
6360    * @param htd the table descriptor
6361    * @param wal WAL for region to use. This method will call
6362    * WAL#setSequenceNumber(long) passing the result of the call to
6363    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6364    * up.  HRegionServer does this every time it opens a new region.
6365    * @return new HRegion
6366    * @throws IOException
6367    */
6368   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
6369       final Path rootDir, final HRegionInfo info, final HTableDescriptor htd, final WAL wal)
6370       throws IOException {
6371     return openHRegion(conf, fs, rootDir, info, htd, wal, null, null);
6372   }
6373 
6374   /**
6375    * Open a Region.
6376    * @param conf The Configuration object to use.
6377    * @param fs Filesystem to use
6378    * @param rootDir Root directory for HBase instance
6379    * @param info Info for region to be opened.
6380    * @param htd the table descriptor
6381    * @param wal WAL for region to use. This method will call
6382    * WAL#setSequenceNumber(long) passing the result of the call to
6383    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6384    * up.  HRegionServer does this every time it opens a new region.
6385    * @param rsServices An interface we can request flushes against.
6386    * @param reporter An interface we can report progress against.
6387    * @return new HRegion
6388    * @throws IOException
6389    */
6390   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
6391       final Path rootDir, final HRegionInfo info, final HTableDescriptor htd, final WAL wal,
6392       final RegionServerServices rsServices, final CancelableProgressable reporter)
6393       throws IOException {
6394     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
6395     return openHRegion(conf, fs, rootDir, tableDir, info, htd, wal, rsServices, reporter);
6396   }
6397 
6398   /**
6399    * Open a Region.
6400    * @param conf The Configuration object to use.
6401    * @param fs Filesystem to use
6402    * @param rootDir Root directory for HBase instance
6403    * @param info Info for region to be opened.
6404    * @param htd the table descriptor
6405    * @param wal WAL for region to use. This method will call
6406    * WAL#setSequenceNumber(long) passing the result of the call to
6407    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6408    * up.  HRegionServer does this every time it opens a new region.
6409    * @param rsServices An interface we can request flushes against.
6410    * @param reporter An interface we can report progress against.
6411    * @return new HRegion
6412    * @throws IOException
6413    */
6414   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
6415       final Path rootDir, final Path tableDir, final HRegionInfo info, final HTableDescriptor htd,
6416       final WAL wal, final RegionServerServices rsServices,
6417       final CancelableProgressable reporter)
6418       throws IOException {
6419     if (info == null) throw new NullPointerException("Passed region info is null");
6420     if (LOG.isDebugEnabled()) {
6421       LOG.debug("Opening region: " + info);
6422     }
6423     HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, rsServices);
6424     return r.openHRegion(reporter);
6425   }
6426 
6427 
6428   /**
6429    * Useful when reopening a closed region (normally for unit tests)
6430    * @param other original object
6431    * @param reporter An interface we can report progress against.
6432    * @return new HRegion
6433    * @throws IOException
6434    */
6435   public static HRegion openHRegion(final HRegion other, final CancelableProgressable reporter)
6436       throws IOException {
6437     HRegionFileSystem regionFs = other.getRegionFileSystem();
6438     HRegion r = newHRegion(regionFs.getTableDir(), other.getWAL(), regionFs.getFileSystem(),
6439         other.baseConf, other.getRegionInfo(), other.getTableDesc(), null);
6440     return r.openHRegion(reporter);
6441   }
6442 
6443   public static Region openHRegion(final Region other, final CancelableProgressable reporter)
6444         throws IOException {
6445     return openHRegion((HRegion)other, reporter);
6446   }
6447 
6448   /**
6449    * Open HRegion.
6450    * Calls initialize and sets sequenceId.
6451    * @return Returns <code>this</code>
6452    * @throws IOException
6453    */
6454   protected HRegion openHRegion(final CancelableProgressable reporter)
6455   throws IOException {
6456     // Refuse to open the region if we are missing local compression support
6457     checkCompressionCodecs();
6458     // Refuse to open the region if encryption configuration is incorrect or
6459     // codec support is missing
6460     checkEncryption();
6461     // Refuse to open the region if a required class cannot be loaded
6462     checkClassLoading();
6463     this.openSeqNum = initialize(reporter);
6464     this.mvcc.advanceTo(openSeqNum);
6465     if (wal != null && getRegionServerServices() != null && !writestate.readOnly
6466         && !recovering) {
6467       // Only write the region open event marker to WAL if (1) we are not read-only
6468       // (2) dist log replay is off or we are not recovering. In case region is
6469       // recovering, the open event will be written at setRecovering(false)
6470       writeRegionOpenMarker(wal, openSeqNum);
6471     }
6472     return this;
6473   }
6474 
6475   public static void warmupHRegion(final HRegionInfo info,
6476       final HTableDescriptor htd, final WAL wal, final Configuration conf,
6477       final RegionServerServices rsServices,
6478       final CancelableProgressable reporter)
6479       throws IOException {
6480 
6481     if (info == null) throw new NullPointerException("Passed region info is null");
6482 
6483     if (LOG.isDebugEnabled()) {
6484       LOG.debug("Warming up region: " + info);
6485     }
6486 
6487     Path rootDir = FSUtils.getRootDir(conf);
6488     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
6489 
6490     FileSystem fs = null;
6491     if (rsServices != null) {
6492       fs = rsServices.getFileSystem();
6493     }
6494     if (fs == null) {
6495       fs = FileSystem.get(conf);
6496     }
6497 
6498     HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, rsServices);
6499     r.initializeWarmup(reporter);
6500     r.close();
6501   }
6502 
6503 
6504   private void checkCompressionCodecs() throws IOException {
6505     for (HColumnDescriptor fam: this.htableDescriptor.getColumnFamilies()) {
6506       CompressionTest.testCompression(fam.getCompressionType());
6507       CompressionTest.testCompression(fam.getCompactionCompressionType());
6508     }
6509   }
6510 
6511   private void checkEncryption() throws IOException {
6512     for (HColumnDescriptor fam: this.htableDescriptor.getColumnFamilies()) {
6513       EncryptionTest.testEncryption(conf, fam.getEncryptionType(), fam.getEncryptionKey());
6514     }
6515   }
6516 
6517   private void checkClassLoading() throws IOException {
6518     RegionSplitPolicy.getSplitPolicyClass(this.htableDescriptor, conf);
6519     RegionCoprocessorHost.testTableCoprocessorAttrs(conf, this.htableDescriptor);
6520   }
6521 
6522   /**
6523    * Create a daughter region from a given temp directory with the region data.
6524    * @param hri Spec. for daughter region to open.
6525    * @throws IOException
6526    */
6527   HRegion createDaughterRegionFromSplits(final HRegionInfo hri) throws IOException {
6528     // Move the files from the temporary .splits to the final /table/region directory
6529     fs.commitDaughterRegion(hri);
6530 
6531     // Create the daughter HRegion instance
6532     HRegion r = HRegion.newHRegion(this.fs.getTableDir(), this.getWAL(), fs.getFileSystem(),
6533         this.getBaseConf(), hri, this.getTableDesc(), rsServices);
6534     r.readRequestsCount.set(this.getReadRequestsCount() / 2);
6535     r.filteredReadRequestsCount.set(this.getFilteredReadRequestsCount() / 2);
6536     r.writeRequestsCount.set(this.getWriteRequestsCount() / 2);
6537     return r;
6538   }
6539 
6540   /**
6541    * Create a merged region given a temp directory with the region data.
6542    * @param region_b another merging region
6543    * @return merged HRegion
6544    * @throws IOException
6545    */
6546   HRegion createMergedRegionFromMerges(final HRegionInfo mergedRegionInfo,
6547       final HRegion region_b) throws IOException {
6548     HRegion r = HRegion.newHRegion(this.fs.getTableDir(), this.getWAL(),
6549         fs.getFileSystem(), this.getBaseConf(), mergedRegionInfo,
6550         this.getTableDesc(), this.rsServices);
6551     r.readRequestsCount.set(this.getReadRequestsCount()
6552         + region_b.getReadRequestsCount());
6553     r.filteredReadRequestsCount.set(this.getFilteredReadRequestsCount()
6554       + region_b.getFilteredReadRequestsCount());
6555     r.writeRequestsCount.set(this.getWriteRequestsCount()
6557         + region_b.getWriteRequestsCount());
6558     this.fs.commitMergedRegion(mergedRegionInfo);
6559     return r;
6560   }
6561 
6562   /**
6563    * Inserts a new region's meta information into the passed
6564    * <code>meta</code> region. Used by the HMaster bootstrap code adding
6565    * new table to hbase:meta table.
6566    *
6567    * @param meta hbase:meta HRegion to be updated
6568    * @param r HRegion to add to <code>meta</code>
6569    *
6570    * @throws IOException
6571    */
6572   // TODO remove since only test and merge use this
6573   public static void addRegionToMETA(final HRegion meta, final HRegion r) throws IOException {
6574     meta.checkResources();
6575     // The row key is the region name
6576     byte[] row = r.getRegionInfo().getRegionName();
6577     final long now = EnvironmentEdgeManager.currentTime();
6578     final List<Cell> cells = new ArrayList<Cell>(2);
6579     cells.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
6580       HConstants.REGIONINFO_QUALIFIER, now,
6581       r.getRegionInfo().toByteArray()));
6582     // Also set the version of the meta table into the row.
6583     cells.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
6584       HConstants.META_VERSION_QUALIFIER, now,
6585       Bytes.toBytes(HConstants.META_VERSION)));
6586     meta.put(row, HConstants.CATALOG_FAMILY, cells);
6587   }
6588 
6589   /**
6590    * Computes the Path of the HRegion
6591    *
6592    * @param tabledir qualified path for table
6593    * @param name ENCODED region name
6594    * @return Path of HRegion directory
6595    * @deprecated For tests only; to be removed.
6596    */
6597   @Deprecated
6598   public static Path getRegionDir(final Path tabledir, final String name) {
6599     return new Path(tabledir, name);
6600   }
6601 
6602   /**
6603    * Computes the Path of the HRegion
6604    *
6605    * @param rootdir qualified path of HBase root directory
6606    * @param info HRegionInfo for the region
6607    * @return qualified path of region directory
6608    * @deprecated For tests only; to be removed.
6609    */
6610   @Deprecated
6611   @VisibleForTesting
6612   public static Path getRegionDir(final Path rootdir, final HRegionInfo info) {
6613     return new Path(
6614       FSUtils.getTableDir(rootdir, info.getTable()), info.getEncodedName());
6615   }
6616 
6617   /**
6618    * Determines whether the specified row is within the row range of the
6619    * given HRegionInfo.
6620    *
6621    * @param info HRegionInfo that specifies the row range
6622    * @param row row to be checked
6623    * @return true if the row is within the range specified by the HRegionInfo
6624    */
6625   public static boolean rowIsInRange(HRegionInfo info, final byte [] row) {
6626     return ((info.getStartKey().length == 0) ||
6627         (Bytes.compareTo(info.getStartKey(), row) <= 0)) &&
6628         ((info.getEndKey().length == 0) ||
6629             (Bytes.compareTo(info.getEndKey(), row) > 0));
6630   }
6631 
6632   public static boolean rowIsInRange(HRegionInfo info, final byte [] row, final int offset,
6633       final short length) {
6634     return ((info.getStartKey().length == 0) ||
6635         (Bytes.compareTo(info.getStartKey(), 0, info.getStartKey().length,
6636           row, offset, length) <= 0)) &&
6637         ((info.getEndKey().length == 0) ||
6638           (Bytes.compareTo(info.getEndKey(), 0, info.getEndKey().length, row, offset, length) > 0));
6639   }
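
  /*
   * Worked example for rowIsInRange (table name and keys below are made up): the start key
   * is inclusive, the end key is exclusive, and an empty key means "unbounded".
   *
   *   HRegionInfo hri = new HRegionInfo(TableName.valueOf("t"),
   *       Bytes.toBytes("b"), Bytes.toBytes("d"));
   *   HRegion.rowIsInRange(hri, Bytes.toBytes("c"));   // true:  "b" <= "c" < "d"
   *   HRegion.rowIsInRange(hri, Bytes.toBytes("d"));   // false: end key is exclusive
   *   HRegion.rowIsInRange(hri, Bytes.toBytes("a"));   // false: before the start key
   */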
6640 
6641   /**
6642    * Merge two HRegions.  The regions must be adjacent and must not overlap.
6643    *
6644    * @return new merged HRegion
6645    * @throws IOException
6646    */
6647   public static HRegion mergeAdjacent(final HRegion srcA, final HRegion srcB)
6648   throws IOException {
6649     HRegion a = srcA;
6650     HRegion b = srcB;
6651 
6652     // Make sure that srcA comes first; important for key-ordering during
6653     // write of the merged file.
6654     if (srcA.getRegionInfo().getStartKey() == null) {
6655       if (srcB.getRegionInfo().getStartKey() == null) {
6656         throw new IOException("Cannot merge two regions with null start key");
6657       }
6658       // A's start key is null but B's isn't. Assume A comes before B
6659     } else if ((srcB.getRegionInfo().getStartKey() == null) ||
6660       (Bytes.compareTo(srcA.getRegionInfo().getStartKey(),
6661         srcB.getRegionInfo().getStartKey()) > 0)) {
6662       a = srcB;
6663       b = srcA;
6664     }
6665 
6666     if (!(Bytes.compareTo(a.getRegionInfo().getEndKey(),
6667         b.getRegionInfo().getStartKey()) == 0)) {
6668       throw new IOException("Cannot merge non-adjacent regions");
6669     }
6670     return merge(a, b);
6671   }
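
  /*
   * A hedged sketch of merging two adjacent regions with the helper above; "left" and
   * "right" are assumed to be open HRegions of the same table with
   * left.endKey == right.startKey. The method sorts the pair by start key itself, so the
   * argument order does not matter.
   *
   *   HRegion merged = HRegion.mergeAdjacent(left, right);
   */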
6672 
6673   /**
6674    * Merge two regions whether they are adjacent or not.
6675    *
6676    * @param a region a
6677    * @param b region b
6678    * @return new merged region
6679    * @throws IOException
6680    */
6681   public static HRegion merge(final HRegion a, final HRegion b) throws IOException {
6682     if (!a.getRegionInfo().getTable().equals(b.getRegionInfo().getTable())) {
6683       throw new IOException("Regions do not belong to the same table");
6684     }
6685 
6686     FileSystem fs = a.getRegionFileSystem().getFileSystem();
6687     // Make sure each region's cache is empty
6688     a.flush(true);
6689     b.flush(true);
6690 
6691     // Compact each region so we only have one store file per family
6692     a.compact(true);
6693     if (LOG.isDebugEnabled()) {
6694       LOG.debug("Files for region: " + a);
6695       a.getRegionFileSystem().logFileSystemState(LOG);
6696     }
6697     b.compact(true);
6698     if (LOG.isDebugEnabled()) {
6699       LOG.debug("Files for region: " + b);
6700       b.getRegionFileSystem().logFileSystemState(LOG);
6701     }
6702 
6703     RegionMergeTransactionImpl rmt = new RegionMergeTransactionImpl(a, b, true);
6704     if (!rmt.prepare(null)) {
6705       throw new IOException("Unable to merge regions " + a + " and " + b);
6706     }
6707     HRegionInfo mergedRegionInfo = rmt.getMergedRegionInfo();
6708     LOG.info("starting merge of regions: " + a + " and " + b
6709         + " into new region " + mergedRegionInfo.getRegionNameAsString()
6710         + " with start key <"
6711         + Bytes.toStringBinary(mergedRegionInfo.getStartKey())
6712         + "> and end key <"
6713         + Bytes.toStringBinary(mergedRegionInfo.getEndKey()) + ">");
6714     HRegion dstRegion;
6715     try {
6716       dstRegion = (HRegion)rmt.execute(null, null);
6717     } catch (IOException ioe) {
6718       rmt.rollback(null, null);
6719       throw new IOException("Failed merging region " + a + " and " + b
6720           + "; the merge was successfully rolled back", ioe);
6721     }
6722     dstRegion.compact(true);
6723 
6724     if (LOG.isDebugEnabled()) {
6725       LOG.debug("Files for new region");
6726       dstRegion.getRegionFileSystem().logFileSystemState(LOG);
6727     }
6728 
6729     // clear the compacted files if any
6730     for (Store s : dstRegion.getStores()) {
6731       s.closeAndArchiveCompactedFiles();
6732     }
6733     if (dstRegion.getRegionFileSystem().hasReferences(dstRegion.getTableDesc())) {
6734       throw new IOException("Merged region " + dstRegion
6735           + " still has references after the compaction, is compaction canceled?");
6736     }
6737 
6738     // Archiving the 'A' region
6739     HFileArchiver.archiveRegion(a.getBaseConf(), fs, a.getRegionInfo());
6740     // Archiving the 'B' region
6741     HFileArchiver.archiveRegion(b.getBaseConf(), fs, b.getRegionInfo());
6742 
6743     LOG.info("merge completed. New region is " + dstRegion);
6744     return dstRegion;
6745   }
6746 
6747   @Override
6748   public Result get(final Get get) throws IOException {
6749     prepareGet(get);
6750     List<Cell> results = get(get, true);
6751     boolean stale = this.getRegionInfo().getReplicaId() != 0;
6752     return Result.create(results, get.isCheckExistenceOnly() ? !results.isEmpty() : null, stale);
6753   }
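
  /*
   * A small sketch of the read path above, assuming "region" is an open HRegion of a table
   * with family "cf" (row, family and qualifier names are illustrative):
   *
   *   Get g = new Get(Bytes.toBytes("row1"));
   *   g.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"));
   *   Result result = region.get(g);   // runs the pre/post get coprocessor hooks
   *   byte[] value = result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("q"));
   */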
6754 
6755   void prepareGet(final Get get) throws IOException, NoSuchColumnFamilyException {
6756     checkRow(get.getRow(), "Get");
6757     // Verify families are all valid
6758     if (get.hasFamilies()) {
6759       for (byte [] family: get.familySet()) {
6760         checkFamily(family);
6761       }
6762     } else { // Adding all families to scanner
6763       for (byte[] family: this.htableDescriptor.getFamiliesKeys()) {
6764         get.addFamily(family);
6765       }
6766     }
6767   }
6768 
6769   @Override
6770   public List<Cell> get(Get get, boolean withCoprocessor) throws IOException {
6771 
6772     List<Cell> results = new ArrayList<Cell>();
6773 
6774     // pre-get CP hook
6775     if (withCoprocessor && (coprocessorHost != null)) {
6776       if (coprocessorHost.preGet(get, results)) {
6777         return results;
6778       }
6779     }
6780 
6781     Scan scan = new Scan(get);
6782 
6783     RegionScanner scanner = null;
6784     try {
6785       scanner = getScanner(scan);
6786       scanner.next(results);
6787     } finally {
6788       if (scanner != null)
6789         scanner.close();
6790     }
6791 
6792     // post-get CP hook
6793     if (withCoprocessor && (coprocessorHost != null)) {
6794       coprocessorHost.postGet(get, results);
6795     }
6796 
6797     metricsUpdateForGet(results);
6798 
6799     return results;
6800   }
6801 
6802   void metricsUpdateForGet(List<Cell> results) {
6803     if (this.metricsRegion != null) {
6804       long totalSize = 0L;
6805       for (Cell cell : results) {
6806         // This should give an estimate of the size of the cell in the result. Why do we
6807         // need to know how the codec would serialize it??
6808         totalSize += CellUtil.estimatedSerializedSizeOf(cell);
6809       }
6810       this.metricsRegion.updateGet(totalSize);
6811     }
6812   }
6813 
6814   @Override
6815   public void mutateRow(RowMutations rm) throws IOException {
6816     // Don't need nonces here - RowMutations only supports puts and deletes
6817     mutateRowsWithLocks(rm.getMutations(), Collections.singleton(rm.getRow()));
6818   }
6819 
6820   /**
6821    * Perform atomic mutations within the region w/o nonces.
6822    * See {@link #mutateRowsWithLocks(Collection, Collection, long, long)}
6823    */
6824   public void mutateRowsWithLocks(Collection<Mutation> mutations,
6825       Collection<byte[]> rowsToLock) throws IOException {
6826     mutateRowsWithLocks(mutations, rowsToLock, HConstants.NO_NONCE, HConstants.NO_NONCE);
6827   }
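
  /*
   * A hedged sketch of an atomic multi-row mutation using the overload above; "region" is
   * assumed to be an open HRegion containing both rows, and callers should pass the rows in
   * sorted order to avoid deadlocks (names and values below are made up):
   *
   *   byte[] r1 = Bytes.toBytes("row-a");
   *   byte[] r2 = Bytes.toBytes("row-b");
   *   Put p = new Put(r1).addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("v"));
   *   Delete d = new Delete(r2);
   *   region.mutateRowsWithLocks(Arrays.<Mutation>asList(p, d), Arrays.asList(r1, r2));
   */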
6828 
6829   /**
6830    * Perform atomic mutations within the region.
6831    * @param mutations The list of mutations to perform.
6832    * <code>mutations</code> can contain operations for multiple rows.
6833    * Caller has to ensure that all rows are contained in this region.
6834    * @param rowsToLock Rows to lock. If multiple rows are locked, care should be taken
6835    * that <code>rowsToLock</code> is sorted in order to avoid deadlocks.
6836    * @param nonceGroup Optional nonce group of the operation (client Id)
6837    * @param nonce Optional nonce of the operation (unique random id to ensure
6838    * "more idempotence")
6839    * @throws IOException
6840    */
6841   @Override
6842   public void mutateRowsWithLocks(Collection<Mutation> mutations,
6843       Collection<byte[]> rowsToLock, long nonceGroup, long nonce) throws IOException {
6844     MultiRowMutationProcessor proc = new MultiRowMutationProcessor(mutations, rowsToLock);
6845     processRowsWithLocks(proc, -1, nonceGroup, nonce);
6846   }
6847 
6848   /**
6849    * @return the current load statistics for the region
6850    */
6851   public ClientProtos.RegionLoadStats getRegionStats() {
6852     if (!regionStatsEnabled) {
6853       return null;
6854     }
6855     ClientProtos.RegionLoadStats.Builder stats = ClientProtos.RegionLoadStats.newBuilder();
6856     stats.setMemstoreLoad((int) (Math.min(100, (this.memstoreSize.get() * 100) / this
6857         .memstoreFlushSize)));
6858     stats.setHeapOccupancy((int)(rsServices.getHeapMemoryManager().getHeapOccupancyPercent()*100));
6859     stats.setCompactionPressure((int)(rsServices.getCompactionPressure()*100) > 100 ? 100 :
6860                 (int)(rsServices.getCompactionPressure()*100));
6861     return stats.build();
6862   }
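
  /*
   * Worked example of the memstore load figure computed above (numbers are illustrative):
   * with memstoreSize = 96 MB and memstoreFlushSize = 128 MB, (96 * 100) / 128 = 75, so the
   * reported memstore load is 75 percent; the value is capped at 100.
   */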
6863 
6864   @Override
6865   public void processRowsWithLocks(RowProcessor<?,?> processor) throws IOException {
6866     processRowsWithLocks(processor, rowProcessorTimeout, HConstants.NO_NONCE,
6867       HConstants.NO_NONCE);
6868   }
6869 
6870   @Override
6871   public void processRowsWithLocks(RowProcessor<?,?> processor, long nonceGroup, long nonce)
6872       throws IOException {
6873     processRowsWithLocks(processor, rowProcessorTimeout, nonceGroup, nonce);
6874   }
6875 
6876   @Override
6877   public void processRowsWithLocks(RowProcessor<?,?> processor, long timeout,
6878       long nonceGroup, long nonce) throws IOException {
6879 
6880     for (byte[] row : processor.getRowsToLock()) {
6881       checkRow(row, "processRowsWithLocks");
6882     }
6883     if (!processor.readOnly()) {
6884       checkReadOnly();
6885     }
6886     checkResources();
6887 
6888     startRegionOperation();
6889     WALEdit walEdit = new WALEdit();
6890 
6891     // 1. Run pre-process hook
6892     try {
6893       processor.preProcess(this, walEdit);
6894     } catch (IOException e) {
6895       closeRegionOperation();
6896       throw e;
6897     }
6898     // Short circuit the read only case
6899     if (processor.readOnly()) {
6900       try {
6901         long now = EnvironmentEdgeManager.currentTime();
6902         doProcessRowWithTimeout(
6903             processor, now, this, null, null, timeout);
6904         processor.postProcess(this, walEdit, true);
6905       } finally {
6906         closeRegionOperation();
6907       }
6908       return;
6909     }
6910 
6911     MultiVersionConcurrencyControl.WriteEntry writeEntry = null;
6912     boolean locked;
6913     boolean walSyncSuccessful = false;
6914     List<RowLock> acquiredRowLocks;
6915     long addedSize = 0;
6916     List<Mutation> mutations = new ArrayList<Mutation>();
6917     Collection<byte[]> rowsToLock = processor.getRowsToLock();
6918     long mvccNum = 0;
6919     WALKey walKey = null;
6920     try {
6921       // 2. Acquire the row lock(s)
6922       acquiredRowLocks = new ArrayList<RowLock>(rowsToLock.size());
6923       for (byte[] row : rowsToLock) {
6924         // Attempt to lock all involved rows, throw if any lock times out
6925         // use a writer lock for mixed reads and writes
6926         acquiredRowLocks.add(getRowLock(row));
6927       }
6928       // 3. Region lock
6929       lock(this.updatesLock.readLock(), acquiredRowLocks.size() == 0 ? 1 : acquiredRowLocks.size());
6930       locked = true;
6931 
6932       long now = EnvironmentEdgeManager.currentTime();
6933       try {
6934         // 4. Let the processor scan the rows, generate mutations and add
6935         //    waledits
6936         doProcessRowWithTimeout(
6937             processor, now, this, mutations, walEdit, timeout);
6938 
6939         if (!mutations.isEmpty()) {
6940 
6941           // 5. Call the preBatchMutate hook
6942           processor.preBatchMutate(this, walEdit);
6943 
6944           long txid = 0;
6945           // 6. Append no sync
6946           if (!walEdit.isEmpty()) {
6947             // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
6948             walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
6949               this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now,
6950               processor.getClusterIds(), nonceGroup, nonce, mvcc);
6951             txid = this.wal.append(this.htableDescriptor, this.getRegionInfo(),
6952                 walKey, walEdit, false);
6953           }
6954           if (walKey == null) {
6955             // since we use wal sequence Id as mvcc, for SKIP_WAL changes we need a "faked" WALEdit
6956             // to get a sequence id assigned which is done by FSWALEntry#stampRegionSequenceId
6957             walKey = this.appendEmptyEdit(this.wal);
6958           }
6959 
6960           // 7. Start mvcc transaction
6961           writeEntry = walKey.getWriteEntry();
6962           mvccNum = walKey.getSequenceId();
6963 
6964 
6965 
6966           // 8. Apply to memstore
6967           for (Mutation m : mutations) {
6968             // Handle any tag based cell features
6969             rewriteCellTags(m.getFamilyCellMap(), m);
6970 
6971             for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) {
6972               Cell cell = cellScanner.current();
6973               CellUtil.setSequenceId(cell, mvccNum);
6974               Store store = getStore(cell);
6975               if (store == null) {
6976                 checkFamily(CellUtil.cloneFamily(cell));
6977                 // unreachable
6978               }
6979               addedSize += store.add(cell);
6980             }
6981           }
6982 
6983           // 9. Release region lock
6984           if (locked) {
6985             this.updatesLock.readLock().unlock();
6986             locked = false;
6987           }
6988 
6989           // 10. Release row lock(s)
6990           releaseRowLocks(acquiredRowLocks);
6991 
6992           // 11. Sync edit log
6993           if (txid != 0) {
6994             syncOrDefer(txid, getEffectiveDurability(processor.useDurability()));
6995           }
6996           walSyncSuccessful = true;
6997           // 12. call postBatchMutate hook
6998           processor.postBatchMutate(this);
6999         }
7000       } finally {
7001         // TODO: Make this method look like all other methods that are doing append/sync and
7002         // memstore rollback such as append and doMiniBatchMutation. Currently it is a little
7003         // different. Make them all share same code!
7004         if (!mutations.isEmpty() && !walSyncSuccessful) {
7005           LOG.warn("Wal sync failed. Roll back " + mutations.size() +
7006               " memstore keyvalues for row(s):" + StringUtils.byteToHexString(
7007               processor.getRowsToLock().iterator().next()) + "...");
7008           for (Mutation m : mutations) {
7009             for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) {
7010               Cell cell = cellScanner.current();
7011               getStore(cell).rollback(cell);
7012             }
7013           }
7014           if (writeEntry != null) {
7015             mvcc.complete(writeEntry);
7016             writeEntry = null;
7017           }
7018         }
7019         // 13. Roll mvcc forward
7020         if (writeEntry != null) {
7021           mvcc.completeAndWait(writeEntry);
7022         }
7023         if (locked) {
7024           this.updatesLock.readLock().unlock();
7025         }
7026         // release locks if some were acquired but another timed out
7027         releaseRowLocks(acquiredRowLocks);
7028       }
7029 
7030       // 14. Run post-process hook
7031       processor.postProcess(this, walEdit, walSyncSuccessful);
7032 
7033     } finally {
7034       closeRegionOperation();
7035       if (!mutations.isEmpty() &&
7036           isFlushSize(this.addAndGetGlobalMemstoreSize(addedSize))) {
7037         requestFlush();
7038       }
7039     }
7040   }
7041 
7042   private void doProcessRowWithTimeout(final RowProcessor<?,?> processor,
7043                                        final long now,
7044                                        final HRegion region,
7045                                        final List<Mutation> mutations,
7046                                        final WALEdit walEdit,
7047                                        final long timeout) throws IOException {
7048     // Short circuit the no time bound case.
7049     if (timeout < 0) {
7050       try {
7051         processor.process(now, region, mutations, walEdit);
7052       } catch (IOException e) {
7053         LOG.warn("RowProcessor:" + processor.getClass().getName() +
7054             " threw Exception on row(s):" +
7055             Bytes.toStringBinary(
7056               processor.getRowsToLock().iterator().next()) + "...", e);
7057         throw e;
7058       }
7059       return;
7060     }
7061 
7062     // Case with time bound
7063     FutureTask<Void> task =
7064       new FutureTask<Void>(new Callable<Void>() {
7065         @Override
7066         public Void call() throws IOException {
7067           try {
7068             processor.process(now, region, mutations, walEdit);
7069             return null;
7070           } catch (IOException e) {
7071             LOG.warn("RowProcessor:" + processor.getClass().getName() +
7072                 " threw Exception on row(s):" +
7073                 Bytes.toStringBinary(
7074                     processor.getRowsToLock().iterator().next()) + "...", e);
7075             throw e;
7076           }
7077         }
7078       });
7079     rowProcessorExecutor.execute(task);
7080     try {
7081       task.get(timeout, TimeUnit.MILLISECONDS);
7082     } catch (TimeoutException te) {
7083       LOG.error("RowProcessor timeout:" + timeout + " ms on row(s):" +
7084           Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) +
7085           "...");
7086       throw new IOException(te);
7087     } catch (Exception e) {
7088       throw new IOException(e);
7089     }
7090   }
7091 
7092   /**
7093    * @return The passed-in {@code tags} but with the tags from {@code cell} added.
7094    */
7095   private static List<Tag> carryForwardTags(final Cell cell, final List<Tag> tags) {
7096     if (cell.getTagsLength() <= 0) return tags;
7097     List<Tag> newTags = tags == null ? new ArrayList<Tag>() : /*Append Tags*/ tags;
7098     Iterator<Tag> i = CellUtil.tagsIterator(cell);
7099     while (i.hasNext()) newTags.add(i.next());
7100     return newTags;
7101   }
7102 
7103   /**
7104    * Run a Get against passed in <code>store</code> on passed <code>row</code>, etc.
7105    * @return Get result.
7106    */
7107   private List<Cell> doGet(final Store store, final byte [] row,
7108       final Map.Entry<byte[], List<Cell>> family, final TimeRange tr)
7109   throws IOException {
7110     // Sort the cells so that they match the order that they
7111     // appear in the Get results. Otherwise, we won't be able to
7112     // find the existing values if the cells are not specified
7113     // in order by the client since cells are in an array list.
7114     Collections.sort(family.getValue(), store.getComparator());
7115     // Get previous values for all columns in this family
7116     Get get = new Get(row);
7117     for (Cell cell : family.getValue()) {
7118       get.addColumn(family.getKey(), CellUtil.cloneQualifier(cell));
7119     }
7120     if (tr != null) get.setTimeRange(tr.getMin(), tr.getMax());
7121     return get(get, false);
7122   }
7123 
7124   public Result append(Append append) throws IOException {
7125     return append(append, HConstants.NO_NONCE, HConstants.NO_NONCE);
7126   }
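
  /*
   * A hedged sketch of building an Append executed by the append(Append) method above;
   * "region" is assumed to be an open HRegion, and family/qualifier/value are made up. The
   * suffix is concatenated onto the current value of the column.
   *
   *   Append a = new Append(Bytes.toBytes("row1"));
   *   a.add(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("-suffix"));
   *   Result r = region.append(a);   // the appended value, when isReturnResults() is true
   */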
7127 
7128   // TODO: There's a lot of boiler plate code identical to increment.
7129   // We should refactor append and increment as local get-mutate-put
7130   // transactions, so all stores only go through one code path for puts.
7131 
7132   @Override
7133   public Result append(Append mutate, long nonceGroup, long nonce) throws IOException {
7134     Operation op = Operation.APPEND;
7135     byte[] row = mutate.getRow();
7136     checkRow(row, op.toString());
7137     checkFamilies(mutate.getFamilyCellMap().keySet());
7138     boolean flush = false;
7139     Durability durability = getEffectiveDurability(mutate.getDurability());
7140     boolean writeToWAL = durability != Durability.SKIP_WAL;
7141     WALEdit walEdits = null;
7142     List<Cell> allKVs = new ArrayList<Cell>(mutate.size());
7143     Map<Store, List<Cell>> tempMemstore = new HashMap<Store, List<Cell>>();
7144     long size = 0;
7145     long txid = 0;
7146     checkReadOnly();
7147     checkResources();
7148     // Lock row
7149     startRegionOperation(op);
7150     this.writeRequestsCount.increment();
7151     RowLock rowLock = null;
7152     WALKey walKey = null;
7153     MultiVersionConcurrencyControl.WriteEntry writeEntry = null;
7154     boolean doRollBackMemstore = false;
7155     try {
7156       rowLock = getRowLock(row);
7157       assert rowLock != null;
7158       try {
7159         lock(this.updatesLock.readLock());
7160         try {
7161           // Wait for all prior MVCC transactions to finish - while we hold the row lock
7162           // (so that we are guaranteed to see the latest state when we do our Get)
7163           mvcc.await();
7164           if (this.coprocessorHost != null) {
7165             Result r = this.coprocessorHost.preAppendAfterRowLock(mutate);
7166             if (r!= null) {
7167               return r;
7168             }
7169           }
7170           long now = EnvironmentEdgeManager.currentTime();
7171           // Process each family
7172           for (Map.Entry<byte[], List<Cell>> family : mutate.getFamilyCellMap().entrySet()) {
7173             Store store = stores.get(family.getKey());
7174             List<Cell> kvs = new ArrayList<Cell>(family.getValue().size());
7175 
7176             List<Cell> results = doGet(store, row, family, null);
7177 
7178             // Iterate the input columns and update existing values if they were
7179             // found, otherwise add new column initialized to the append value
7180 
7181             // Avoid as much copying as possible. We may need to rewrite and
7182             // consolidate tags. Bytes are only copied once.
7183             // Would be nice if KeyValue had scatter/gather logic
7184             int idx = 0;
7185             for (Cell cell : family.getValue()) {
7186               Cell newCell;
7187               Cell oldCell = null;
7188               if (idx < results.size()
7189                   && CellUtil.matchingQualifier(results.get(idx), cell)) {
7190                 oldCell = results.get(idx);
7191                 long ts = Math.max(now, oldCell.getTimestamp());
7192 
7193                 // Process cell tags
7194                 // Make a union of the set of tags in the old and new KVs
7195                 List<Tag> newTags = carryForwardTags(oldCell, new ArrayList<Tag>());
7196                 newTags = carryForwardTags(cell, newTags);
7197 
7198                 // Cell TTL handling
7199 
7200                 if (mutate.getTTL() != Long.MAX_VALUE) {
7201                   // Add the new TTL tag
7202                   newTags.add(
7203                       new ArrayBackedTag(TagType.TTL_TAG_TYPE, Bytes.toBytes(mutate.getTTL())));
7204                 }
7205 
7206                 // Rebuild tags
7207                 byte[] tagBytes = TagUtil.fromList(newTags);
7208 
7209                 // allocate an empty cell once
7210                 newCell = new KeyValue(row.length, cell.getFamilyLength(),
7211                     cell.getQualifierLength(), ts, KeyValue.Type.Put,
7212                     oldCell.getValueLength() + cell.getValueLength(),
7213                     tagBytes.length);
7214                 // copy in row, family, and qualifier
7215                 System.arraycopy(cell.getRowArray(), cell.getRowOffset(),
7216                   newCell.getRowArray(), newCell.getRowOffset(), cell.getRowLength());
7217                 System.arraycopy(cell.getFamilyArray(), cell.getFamilyOffset(),
7218                   newCell.getFamilyArray(), newCell.getFamilyOffset(),
7219                   cell.getFamilyLength());
7220                 System.arraycopy(cell.getQualifierArray(), cell.getQualifierOffset(),
7221                   newCell.getQualifierArray(), newCell.getQualifierOffset(),
7222                   cell.getQualifierLength());
7223                 // copy in the value
7224                 CellUtil.copyValueTo(oldCell, newCell.getValueArray(), newCell.getValueOffset());
7225                 System.arraycopy(cell.getValueArray(), cell.getValueOffset(),
7226                   newCell.getValueArray(),
7227                   newCell.getValueOffset() + oldCell.getValueLength(),
7228                   cell.getValueLength());
7229                 // Copy in tag data
7230                 System.arraycopy(tagBytes, 0, newCell.getTagsArray(), newCell.getTagsOffset(),
7231                   tagBytes.length);
7232                 idx++;
7233               } else {
7234                 // Append's KeyValue.Type==Put and ts==HConstants.LATEST_TIMESTAMP
7235                 CellUtil.updateLatestStamp(cell, now);
7236 
7237                 // Cell TTL handling
7238 
7239                 if (mutate.getTTL() != Long.MAX_VALUE) {
7240                   List<Tag> newTags = new ArrayList<Tag>(1);
7241                   newTags.add(
7242                       new ArrayBackedTag(TagType.TTL_TAG_TYPE, Bytes.toBytes(mutate.getTTL())));
7243                   // Add the new TTL tag
7244                   newCell = new TagRewriteCell(cell, TagUtil.fromList(newTags));
7245                 } else {
7246                   newCell = cell;
7247                 }
7248               }
7249 
7250               // Give coprocessors a chance to update the new cell
7251               if (coprocessorHost != null) {
7252                 newCell = coprocessorHost.postMutationBeforeWAL(RegionObserver.MutationType.APPEND,
7253                     mutate, oldCell, newCell);
7254               }
7255               kvs.add(newCell);
7256 
7257               // Append update to WAL
7258               if (writeToWAL) {
7259                 if (walEdits == null) {
7260                   walEdits = new WALEdit();
7261                 }
7262                 walEdits.add(newCell);
7263               }
7264             }
7265 
7266             //store the kvs to the temporary memstore before writing WAL
7267             tempMemstore.put(store, kvs);
7268           }
7269 
7270           // Actually write to WAL now
7271           if (walEdits != null && !walEdits.isEmpty()) {
7272             if (writeToWAL) {
7273               // Using default cluster id, as this can only happen in the originating
7274               // cluster. A slave cluster receives the final value (not the delta)
7275               // as a Put.
7276               // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
7277               walKey = new HLogKey(
7278                   getRegionInfo().getEncodedNameAsBytes(),
7279                   this.htableDescriptor.getTableName(),
7280                   WALKey.NO_SEQUENCE_ID,
7281                   nonceGroup,
7282                   nonce,
7283                   mvcc);
7284               txid =
7285                 this.wal.append(this.htableDescriptor, getRegionInfo(), walKey, walEdits, true);
7286             } else {
7287               recordMutationWithoutWal(mutate.getFamilyCellMap());
7288             }
7289           }
7290           if (walKey == null) {
7291             // Append a faked WALEdit in order for SKIP_WAL updates to get mvcc assigned
7292             walKey = this.appendEmptyEdit(this.wal);
7293           }
7294 
7295           // now start my own transaction
7296           writeEntry = walKey.getWriteEntry();
7297 
7298 
7299           // Actually write to Memstore now
7300           if (!tempMemstore.isEmpty()) {
7301             for (Map.Entry<Store, List<Cell>> entry : tempMemstore.entrySet()) {
7302               Store store = entry.getKey();
7303               if (store.getFamily().getMaxVersions() == 1) {
7304                 // upsert if VERSIONS for this CF == 1
7305                 // Is this right? It immediately becomes visible? St.Ack 20150907
7306                 size += store.upsert(entry.getValue(), getSmallestReadPoint());
7307               } else {
7308                 // otherwise keep older versions around
7309                 for (Cell cell: entry.getValue()) {
7310                   CellUtil.setSequenceId(cell, writeEntry.getWriteNumber());
7311                   size += store.add(cell);
7312                   doRollBackMemstore = true;
7313                 }
7314               }
7315               // We add to all KVs here whereas when doing increment, we do it
7316               // earlier... why?
7317               allKVs.addAll(entry.getValue());
7318             }
7319 
7320             size = this.addAndGetGlobalMemstoreSize(size);
7321             flush = isFlushSize(size);
7322           }
7323         } finally {
7324           this.updatesLock.readLock().unlock();
7325         }
7326 
7327       } finally {
7328         rowLock.release();
7329         rowLock = null;
7330       }
7331       // sync the transaction log outside the rowlock
7332       if (txid != 0) {
7333         syncOrDefer(txid, durability);
7334       }
7335       doRollBackMemstore = false;
7336     } finally {
7337       if (rowLock != null) {
7338         rowLock.release();
7339       }
7340       // if the wal sync was unsuccessful, remove keys from memstore
7341       if (doRollBackMemstore) {
7342         rollbackMemstore(allKVs);
7343         if (writeEntry != null) mvcc.complete(writeEntry);
7344       } else if (writeEntry != null) {
7345         mvcc.completeAndWait(writeEntry);
7346       }
7347 
7348       closeRegionOperation(op);
7349     }
7350 
7351     if (this.metricsRegion != null) {
7352       this.metricsRegion.updateAppend();
7353     }
7354 
7355     if (flush) {
7356       // Request a cache flush. Do it outside update lock.
7357       requestFlush();
7358     }
7359 
7360     return mutate.isReturnResults() ? Result.create(allKVs) : null;
7361   }
7362 
7363   public Result increment(Increment increment) throws IOException {
7364     return increment(increment, HConstants.NO_NONCE, HConstants.NO_NONCE);
7365   }
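
  /*
   * A hedged sketch of a counter update via the increment(Increment) method above; "region"
   * is assumed to be an open HRegion, and the family and qualifier names are illustrative:
   *
   *   Increment inc = new Increment(Bytes.toBytes("row1"));
   *   inc.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("hits"), 1L);
   *   Result r = region.increment(inc);
   *   long hits = Bytes.toLong(r.getValue(Bytes.toBytes("cf"), Bytes.toBytes("hits")));
   */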
7366 
7367   // TODO: There's a lot of boiler plate code identical to append.
7368   // We should refactor append and increment as local get-mutate-put
7369   // transactions, so all stores only go through one code path for puts.
7370 
7371   // They are subtly different in quite a few ways. This came out only
7372   // after study. I am not sure that many of the differences are intentional.
7373   // TODO: St.Ack 20150907
7374 
7375   @Override
7376   public Result increment(Increment mutation, long nonceGroup, long nonce)
7377   throws IOException {
7378     Operation op = Operation.INCREMENT;
7379     byte [] row = mutation.getRow();
7380     checkRow(row, op.toString());
7381     checkFamilies(mutation.getFamilyCellMap().keySet());
7382     boolean flush = false;
7383     Durability durability = getEffectiveDurability(mutation.getDurability());
7384     boolean writeToWAL = durability != Durability.SKIP_WAL;
7385     WALEdit walEdits = null;
7386     List<Cell> allKVs = new ArrayList<Cell>(mutation.size());
7387 
7388     Map<Store, List<Cell>> tempMemstore = new HashMap<Store, List<Cell>>();
7389     long size = 0;
7390     long txid = 0;
7391     checkReadOnly();
7392     checkResources();
7393     // Lock row
7394     startRegionOperation(op);
7395     this.writeRequestsCount.increment();
7396     RowLock rowLock = null;
7397     WALKey walKey = null;
7398     MultiVersionConcurrencyControl.WriteEntry writeEntry = null;
7399     boolean doRollBackMemstore = false;
7400     TimeRange tr = mutation.getTimeRange();
7401     try {
7402       rowLock = getRowLock(row);
7403       assert rowLock != null;
7404       try {
7405         lock(this.updatesLock.readLock());
7406         try {
7407           // wait for all prior MVCC transactions to finish - while we hold the row lock
7408           // (so that we are guaranteed to see the latest state)
7409           mvcc.await();
7410           if (this.coprocessorHost != null) {
7411             Result r = this.coprocessorHost.preIncrementAfterRowLock(mutation);
7412             if (r != null) {
7413               return r;
7414             }
7415           }
7416           long now = EnvironmentEdgeManager.currentTime();
7417           // Process each family
7418           for (Map.Entry<byte [], List<Cell>> family: mutation.getFamilyCellMap().entrySet()) {
7419             Store store = stores.get(family.getKey());
7420             List<Cell> kvs = new ArrayList<Cell>(family.getValue().size());
7421 
7422             List<Cell> results = doGet(store, row, family, tr);
7423 
7424             // Iterate the input columns and update existing values if they were
7425             // found, otherwise add new column initialized to the increment amount
7426 
7427             // Avoid as much copying as possible. We may need to rewrite and
7428             // consolidate tags. Bytes are only copied once.
7429             // Would be nice if KeyValue had scatter/gather logic
7430             int idx = 0;
7431             // HERE WE DIVERGE FROM APPEND
7432             List<Cell> edits = family.getValue();
7433             for (int i = 0; i < edits.size(); i++) {
7434               Cell cell = edits.get(i);
7435               long amount = Bytes.toLong(CellUtil.cloneValue(cell));
7436               boolean noWriteBack = (amount == 0);
7437 
7438               List<Tag> newTags = carryForwardTags(cell, new ArrayList<Tag>());
7439 
7440               Cell c = null;
7441               long ts = now;
7442               if (idx < results.size() && CellUtil.matchingQualifier(results.get(idx), cell)) {
7443                 c = results.get(idx);
7444                 ts = Math.max(now, c.getTimestamp());
7445                 if (c.getValueLength() == Bytes.SIZEOF_LONG) {
7446                   amount += CellUtil.getValueAsLong(c);
7447                 } else {
7448                   // throw DoNotRetryIOException instead of IllegalArgumentException
7449                   throw new DoNotRetryIOException(
7450                       "Attempted to increment field that isn't 64 bits wide");
7451                 }
7452                 // Carry tags forward from previous version
7453                 newTags = carryForwardTags(c, newTags);
7454                 if (i < (edits.size() - 1) && !CellUtil.matchingQualifier(cell, edits.get(i + 1))) {
7455                   idx++;
7456                 }
7457               }
7458 
7459               // Append new incremented KeyValue to list
7460               byte[] q = CellUtil.cloneQualifier(cell);
7461               byte[] val = Bytes.toBytes(amount);
7462 
7463               // Add the TTL tag if the mutation carried one
7464               if (mutation.getTTL() != Long.MAX_VALUE) {
7465                 newTags.add(
7466                     new ArrayBackedTag(TagType.TTL_TAG_TYPE, Bytes.toBytes(mutation.getTTL())));
7467               }
7468 
7469               Cell newKV = new KeyValue(row, 0, row.length,
7470                 family.getKey(), 0, family.getKey().length,
7471                 q, 0, q.length,
7472                 ts,
7473                 KeyValue.Type.Put,
7474                 val, 0, val.length,
7475                 newTags);
7476 
7477               // Give coprocessors a chance to update the new cell
7478               if (coprocessorHost != null) {
7479                 newKV = coprocessorHost.postMutationBeforeWAL(
7480                     RegionObserver.MutationType.INCREMENT, mutation, c, newKV);
7481               }
7482               allKVs.add(newKV);
7483 
7484               if (!noWriteBack) {
7485                 kvs.add(newKV);
7486 
7487                 // Prepare WAL updates
7488                 if (writeToWAL) {
7489                   if (walEdits == null) {
7490                     walEdits = new WALEdit();
7491                   }
7492                   walEdits.add(newKV);
7493                 }
7494               }
7495             }
7496 
7497             //store the kvs to the temporary memstore before writing WAL
7498             if (!kvs.isEmpty()) {
7499               tempMemstore.put(store, kvs);
7500             }
7501           }
7502 
7503           // Actually write to WAL now
7504           if (walEdits != null && !walEdits.isEmpty()) {
7505             if (writeToWAL) {
7506               // Using default cluster id, as this can only happen in the originating
7507               // cluster. A slave cluster receives the final value (not the delta)
7508               // as a Put.
7509               // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
7510               walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
7511                   this.htableDescriptor.getTableName(),
7512                   WALKey.NO_SEQUENCE_ID,
7513                   nonceGroup,
7514                   nonce,
7515                   mvcc);
7516               txid = this.wal.append(this.htableDescriptor, this.getRegionInfo(),
7517                   walKey, walEdits, true);
7518             } else {
7519               recordMutationWithoutWal(mutation.getFamilyCellMap());
7520             }
7521           }
7522           if (walKey == null) {
7523             // Append a faked WALEdit in order for SKIP_WAL updates to get mvccNum assigned
7524             walKey = this.appendEmptyEdit(this.wal);
7525           }
7526 
7527           // now start my own transaction
7528           writeEntry = walKey.getWriteEntry();
7529 
7530           // Actually write to Memstore now
7531           if (!tempMemstore.isEmpty()) {
7532             for (Map.Entry<Store, List<Cell>> entry : tempMemstore.entrySet()) {
7533               Store store = entry.getKey();
7534               if (store.getFamily().getMaxVersions() == 1) {
7535                 // upsert if VERSIONS for this CF == 1
7536                 // Is this right? It immediately becomes visible? St.Ack 20150907
7537                 size += store.upsert(entry.getValue(), getSmallestReadPoint());
7538               } else {
7539                 // otherwise keep older versions around
7540                 for (Cell cell : entry.getValue()) {
7541                   CellUtil.setSequenceId(cell, writeEntry.getWriteNumber());
7542                   size += store.add(cell);
7543                   doRollBackMemstore = true;
7544                 }
7545               }
7546             }
7547             size = this.addAndGetGlobalMemstoreSize(size);
7548             flush = isFlushSize(size);
7549           }
7550         } finally {
7551           this.updatesLock.readLock().unlock();
7552         }
7553       } finally {
7554         rowLock.release();
7555         rowLock = null;
7556       }
7557       // sync the transaction log outside the rowlock
7558       if (txid != 0) {
7559         syncOrDefer(txid, durability);
7560       }
7561       doRollBackMemstore = false;
7562     } finally {
7563       if (rowLock != null) {
7564         rowLock.release();
7565       }
7566       // if the wal sync was unsuccessful, remove keys from memstore
7567       if (doRollBackMemstore) {
7568         for(List<Cell> cells: tempMemstore.values()) {
7569           rollbackMemstore(cells);
7570         }
7571         if (writeEntry != null) mvcc.complete(writeEntry);
7572       } else if (writeEntry != null) {
7573         mvcc.completeAndWait(writeEntry);
7574       }
7575       closeRegionOperation(Operation.INCREMENT);
7576       if (this.metricsRegion != null) {
7577         this.metricsRegion.updateIncrement();
7578       }
7579     }
7580 
7581     if (flush) {
7582       // Request a cache flush.  Do it outside update lock.
7583       requestFlush();
7584     }
7585     return mutation.isReturnResults() ? Result.create(allKVs) : null;
7586   }
7587 
7588   //
7589   // New HBASE-880 Helpers
7590   //
7591 
7592   void checkFamily(final byte [] family)
7593   throws NoSuchColumnFamilyException {
7594     if (!this.htableDescriptor.hasFamily(family)) {
7595       throw new NoSuchColumnFamilyException("Column family " +
7596           Bytes.toString(family) + " does not exist in region " + this
7597           + " in table " + this.htableDescriptor);
7598     }
7599   }
7600 
7601   public static final long FIXED_OVERHEAD = ClassSize.align(
7602       ClassSize.OBJECT +
7603       ClassSize.ARRAY +
7604       45 * ClassSize.REFERENCE + 3 * Bytes.SIZEOF_INT +
7605       (14 * Bytes.SIZEOF_LONG) +
7606       5 * Bytes.SIZEOF_BOOLEAN);
7607 
7608   // woefully out of date - currently missing:
7609   // 1 x HashMap - coprocessorServiceHandlers
7610   // 6 x Counter - numMutationsWithoutWAL, dataInMemoryWithoutWAL,
7611   //   checkAndMutateChecksPassed, checkAndMutateChecksFailed, readRequestsCount,
7612   //   writeRequestsCount
7613   // 1 x HRegion$WriteState - writestate
7614   // 1 x RegionCoprocessorHost - coprocessorHost
7615   // 1 x RegionSplitPolicy - splitPolicy
7616   // 1 x MetricsRegion - metricsRegion
7617   // 1 x MetricsRegionWrapperImpl - metricsRegionWrapper
7618   public static final long DEEP_OVERHEAD = FIXED_OVERHEAD +
7619       ClassSize.OBJECT + // closeLock
7620       (2 * ClassSize.ATOMIC_BOOLEAN) + // closed, closing
7621       (3 * ClassSize.ATOMIC_LONG) + // memStoreSize, numPutsWithoutWAL, dataInMemoryWithoutWAL
7622       (2 * ClassSize.CONCURRENT_HASHMAP) +  // lockedRows, scannerReadPoints
7623       WriteState.HEAP_SIZE + // writestate
7624       ClassSize.CONCURRENT_SKIPLISTMAP + ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + // stores
7625       (2 * ClassSize.REENTRANT_LOCK) + // lock, updatesLock
7626       MultiVersionConcurrencyControl.FIXED_SIZE // mvcc
7627       + ClassSize.TREEMAP // maxSeqIdInStores
7628       + 2 * ClassSize.ATOMIC_INTEGER // majorInProgress, minorInProgress
7629       ;
7630 
7631   @Override
7632   public long heapSize() {
7633     long heapSize = DEEP_OVERHEAD;
7634     for (Store store : this.stores.values()) {
7635       heapSize += store.heapSize();
7636     }
7637     // this does not take into account row locks, recent flushes, mvcc entries, and more
7638     return heapSize;
7639   }
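  // Editor's note: DEEP_OVERHEAD is a hand-maintained estimate assembled from
  // ClassSize constants (and, per the comment above, known to lag the actual
  // field list); heapSize() then layers the per-Store usage on top of it. A
  // minimal sketch of the same accounting pattern for a hypothetical class
  // (TrackedBuffer and its fields are illustrative, not part of HRegion):
  //
  //   public class TrackedBuffer implements HeapSize {
  //     static final long FIXED_OVERHEAD = ClassSize.align(
  //         ClassSize.OBJECT            // object header
  //         + ClassSize.REFERENCE       // the 'payload' reference
  //         + Bytes.SIZEOF_LONG);       // the 'hits' counter
  //
  //     private final byte[] payload = new byte[64];
  //     private long hits;
  //
  //     @Override
  //     public long heapSize() {
  //       // fixed shell plus the deep size of the owned array
  //       return FIXED_OVERHEAD + ClassSize.align(ClassSize.ARRAY + payload.length);
  //     }
  //   }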
7640 
7641   /*
7642    * Print the usage message and exit. Note this method calls System.exit.
7643    * @param message Message to print out.  May be null.
7644    */
7645   private static void printUsageAndExit(final String message) {
7646     if (message != null && message.length() > 0) System.out.println(message);
7647     System.out.println("Usage: HRegion CATALOG_TABLE_DIR [major_compact]");
7648     System.out.println("Options:");
7649     System.out.println(" major_compact  Pass this option to major compact " +
7650       "passed region.");
7651     System.out.println("Default outputs scan of passed region.");
7652     System.exit(1);
7653   }
7654 
7655   @Override
7656   public boolean registerService(Service instance) {
7657     /*
7658      * No stacking of instances is allowed for a single service name
7659      */
7660     Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType();
7661     if (coprocessorServiceHandlers.containsKey(serviceDesc.getFullName())) {
7662       LOG.error("Coprocessor service " + serviceDesc.getFullName() +
7663               " already registered, rejecting request from " + instance
7664       );
7665       return false;
7666     }
7667 
7668     coprocessorServiceHandlers.put(serviceDesc.getFullName(), instance);
7669     if (LOG.isDebugEnabled()) {
7670       LOG.debug("Registered coprocessor service: region=" +
7671           Bytes.toStringBinary(getRegionInfo().getRegionName()) +
7672           " service=" + serviceDesc.getFullName());
7673     }
7674     return true;
7675   }
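  // Editor's note: registerService() is what backs region coprocessor endpoints:
  // each com.google.protobuf.Service is keyed by its full descriptor name, and a
  // second registration under the same name is rejected rather than stacked. A
  // hedged sketch of how a caller would register an endpoint and handle the
  // duplicate case (buildEndpointService() is a hypothetical factory for some
  // generated protobuf Service implementation):
  //
  //   Service endpoint = buildEndpointService();
  //   if (!region.registerService(endpoint)) {
  //     // a service with the same full descriptor name is already registered
  //     LOG.warn("Endpoint already registered: "
  //         + endpoint.getDescriptorForType().getFullName());
  //   }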
7676 
7677   @Override
7678   public Message execService(RpcController controller, CoprocessorServiceCall call)
7679       throws IOException {
7680     String serviceName = call.getServiceName();
7681     String methodName = call.getMethodName();
7682     if (!coprocessorServiceHandlers.containsKey(serviceName)) {
7683       throw new UnknownProtocolException(null,
7684           "No registered coprocessor service found for name "+serviceName+
7685           " in region "+Bytes.toStringBinary(getRegionInfo().getRegionName()));
7686     }
7687 
7688     Service service = coprocessorServiceHandlers.get(serviceName);
7689     Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType();
7690     Descriptors.MethodDescriptor methodDesc = serviceDesc.findMethodByName(methodName);
7691     if (methodDesc == null) {
7692       throw new UnknownProtocolException(service.getClass(),
7693           "Unknown method "+methodName+" called on service "+serviceName+
7694               " in region "+Bytes.toStringBinary(getRegionInfo().getRegionName()));
7695     }
7696 
7697     Message.Builder builder = service.getRequestPrototype(methodDesc).newBuilderForType();
7698     ProtobufUtil.mergeFrom(builder, call.getRequest());
7699     Message request = builder.build();
7700 
7701     if (coprocessorHost != null) {
7702       request = coprocessorHost.preEndpointInvocation(service, methodName, request);
7703     }
7704 
7705     final Message.Builder responseBuilder =
7706         service.getResponsePrototype(methodDesc).newBuilderForType();
7707     service.callMethod(methodDesc, controller, request, new RpcCallback<Message>() {
7708       @Override
7709       public void run(Message message) {
7710         if (message != null) {
7711           responseBuilder.mergeFrom(message);
7712         }
7713       }
7714     });
7715 
7716     if (coprocessorHost != null) {
7717       coprocessorHost.postEndpointInvocation(service, methodName, request, responseBuilder);
7718     }
7719 
7720     IOException exception = ResponseConverter.getControllerException(controller);
7721     if (exception != null) {
7722       throw exception;
7723     }
7724 
7725     return responseBuilder.build();
7726   }
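  // Editor's note: execService() above dispatches reflectively through the
  // protobuf Service API: resolve the MethodDescriptor by name, rebuild the
  // request message from its serialized form, invoke callMethod(), and collect
  // the response through the RpcCallback before checking the controller for an
  // error. A minimal sketch of that reflective call against any
  // com.google.protobuf.Service ("getRowCount" and requestBytes are
  // illustrative; handling of the checked InvalidProtocolBufferException is
  // elided for brevity):
  //
  //   Descriptors.MethodDescriptor md =
  //       service.getDescriptorForType().findMethodByName("getRowCount");
  //   Message request = service.getRequestPrototype(md).newBuilderForType()
  //       .mergeFrom(requestBytes).build();
  //   final Message.Builder out =
  //       service.getResponsePrototype(md).newBuilderForType();
  //   service.callMethod(md, controller, request, new RpcCallback<Message>() {
  //     @Override
  //     public void run(Message m) {
  //       if (m != null) out.mergeFrom(m);
  //     }
  //   });
  //   Message response = out.build();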
7727 
7728   /*
7729    * Process table.
7730    * Do major compaction or list content.
7731    * @throws IOException
7732    */
7733   private static void processTable(final FileSystem fs, final Path p,
7734       final WALFactory walFactory, final Configuration c,
7735       final boolean majorCompact)
7736   throws IOException {
7737     HRegion region;
7738     FSTableDescriptors fst = new FSTableDescriptors(c);
7739     // Currently expects that the table has one region only.
7740     if (FSUtils.getTableName(p).equals(TableName.META_TABLE_NAME)) {
7741       final WAL wal = walFactory.getMetaWAL(
7742           HRegionInfo.FIRST_META_REGIONINFO.getEncodedNameAsBytes());
7743       region = HRegion.newHRegion(p, wal, fs, c,
7744           HRegionInfo.FIRST_META_REGIONINFO,
7745           fst.get(TableName.META_TABLE_NAME), null);
7746     } else {
7747       throw new IOException("Not a known catalog table: " + p.toString());
7748     }
7749     try {
7750       region.mvcc.advanceTo(region.initialize(null));
7751       if (majorCompact) {
7752         region.compact(true);
7753       } else {
7754         // Default behavior
7755         Scan scan = new Scan();
7756         // scan.addFamily(HConstants.CATALOG_FAMILY);
7757         RegionScanner scanner = region.getScanner(scan);
7758         try {
7759           List<Cell> kvs = new ArrayList<Cell>();
7760           boolean done;
7761           do {
7762             kvs.clear();
7763             done = scanner.next(kvs);
7764             if (kvs.size() > 0) LOG.info(kvs);
7765           } while (done);
7766         } finally {
7767           scanner.close();
7768         }
7769       }
7770     } finally {
7771       region.close();
7772     }
7773   }
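  // Editor's note: processTable() backs the small command-line entry point whose
  // usage string is printed by printUsageAndExit() above; it only accepts the
  // hbase:meta table directory and either scans it or major-compacts it. A
  // hedged example invocation (the meta directory path below is illustrative and
  // depends on the cluster's hbase.rootdir layout):
  //
  //   $ bin/hbase org.apache.hadoop.hbase.regionserver.HRegion \
  //       /hbase/data/hbase/meta                   # dump a scan of hbase:meta
  //   $ bin/hbase org.apache.hadoop.hbase.regionserver.HRegion \
  //       /hbase/data/hbase/meta major_compact     # major compact it instead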
7774 
7775   boolean shouldForceSplit() {
7776     return this.splitRequest;
7777   }
7778 
7779   byte[] getExplicitSplitPoint() {