1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver;
20  
21  import java.io.EOFException;
22  import java.io.FileNotFoundException;
23  import java.io.IOException;
24  import java.io.InterruptedIOException;
25  import java.io.UnsupportedEncodingException;
26  import java.lang.reflect.Constructor;
27  import java.text.ParseException;
28  import java.util.AbstractList;
29  import java.util.ArrayList;
30  import java.util.Arrays;
31  import java.util.Collection;
32  import java.util.Collections;
33  import java.util.HashMap;
34  import java.util.HashSet;
35  import java.util.Iterator;
36  import java.util.List;
37  import java.util.Map;
38  import java.util.Map.Entry;
39  import java.util.NavigableMap;
40  import java.util.NavigableSet;
41  import java.util.RandomAccess;
42  import java.util.Set;
43  import java.util.TreeMap;
44  import java.util.concurrent.Callable;
45  import java.util.concurrent.CompletionService;
46  import java.util.concurrent.ConcurrentHashMap;
47  import java.util.concurrent.ConcurrentMap;
48  import java.util.concurrent.ConcurrentSkipListMap;
49  import java.util.concurrent.CountDownLatch;
50  import java.util.concurrent.ExecutionException;
51  import java.util.concurrent.ExecutorCompletionService;
52  import java.util.concurrent.ExecutorService;
53  import java.util.concurrent.Executors;
54  import java.util.concurrent.Future;
55  import java.util.concurrent.FutureTask;
56  import java.util.concurrent.ThreadFactory;
57  import java.util.concurrent.ThreadPoolExecutor;
58  import java.util.concurrent.TimeUnit;
59  import java.util.concurrent.TimeoutException;
60  import java.util.concurrent.atomic.AtomicBoolean;
61  import java.util.concurrent.atomic.AtomicInteger;
62  import java.util.concurrent.atomic.AtomicLong;
63  import java.util.concurrent.locks.Lock;
64  import java.util.concurrent.locks.ReentrantReadWriteLock;
65  
66  import org.apache.commons.logging.Log;
67  import org.apache.commons.logging.LogFactory;
68  import org.apache.hadoop.conf.Configuration;
69  import org.apache.hadoop.fs.FileStatus;
70  import org.apache.hadoop.fs.FileSystem;
71  import org.apache.hadoop.fs.Path;
72  import org.apache.hadoop.hbase.Cell;
73  import org.apache.hadoop.hbase.CellScanner;
74  import org.apache.hadoop.hbase.CellUtil;
75  import org.apache.hadoop.hbase.CompoundConfiguration;
76  import org.apache.hadoop.hbase.DroppedSnapshotException;
77  import org.apache.hadoop.hbase.HBaseConfiguration;
78  import org.apache.hadoop.hbase.HColumnDescriptor;
79  import org.apache.hadoop.hbase.HConstants;
80  import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
81  import org.apache.hadoop.hbase.HDFSBlocksDistribution;
82  import org.apache.hadoop.hbase.HRegionInfo;
83  import org.apache.hadoop.hbase.HTableDescriptor;
84  import org.apache.hadoop.hbase.KeyValue;
85  import org.apache.hadoop.hbase.KeyValueUtil;
86  import org.apache.hadoop.hbase.NamespaceDescriptor;
87  import org.apache.hadoop.hbase.NotServingRegionException;
88  import org.apache.hadoop.hbase.RegionTooBusyException;
89  import org.apache.hadoop.hbase.TableName;
90  import org.apache.hadoop.hbase.Tag;
91  import org.apache.hadoop.hbase.TagType;
92  import org.apache.hadoop.hbase.UnknownScannerException;
93  import org.apache.hadoop.hbase.backup.HFileArchiver;
94  import org.apache.hadoop.hbase.classification.InterfaceAudience;
95  import org.apache.hadoop.hbase.client.Append;
96  import org.apache.hadoop.hbase.client.Delete;
97  import org.apache.hadoop.hbase.client.Durability;
98  import org.apache.hadoop.hbase.client.Get;
99  import org.apache.hadoop.hbase.client.Increment;
100 import org.apache.hadoop.hbase.client.IsolationLevel;
101 import org.apache.hadoop.hbase.client.Mutation;
102 import org.apache.hadoop.hbase.client.Put;
103 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
104 import org.apache.hadoop.hbase.client.Result;
105 import org.apache.hadoop.hbase.client.RowMutations;
106 import org.apache.hadoop.hbase.client.Scan;
107 import org.apache.hadoop.hbase.conf.ConfigurationManager;
108 import org.apache.hadoop.hbase.conf.PropagatingConfigurationObserver;
109 import org.apache.hadoop.hbase.coprocessor.RegionObserver;
110 import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
111 import org.apache.hadoop.hbase.exceptions.FailedSanityCheckException;
112 import org.apache.hadoop.hbase.exceptions.RegionInRecoveryException;
113 import org.apache.hadoop.hbase.exceptions.UnknownProtocolException;
114 import org.apache.hadoop.hbase.filter.ByteArrayComparable;
115 import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
116 import org.apache.hadoop.hbase.filter.FilterWrapper;
117 import org.apache.hadoop.hbase.filter.IncompatibleFilterException;
118 import org.apache.hadoop.hbase.io.HeapSize;
119 import org.apache.hadoop.hbase.io.TimeRange;
120 import org.apache.hadoop.hbase.io.hfile.BlockCache;
121 import org.apache.hadoop.hbase.io.hfile.CacheConfig;
122 import org.apache.hadoop.hbase.io.hfile.HFile;
123 import org.apache.hadoop.hbase.ipc.CallerDisconnectedException;
124 import org.apache.hadoop.hbase.ipc.RpcCallContext;
125 import org.apache.hadoop.hbase.ipc.RpcServer;
126 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
127 import org.apache.hadoop.hbase.monitoring.TaskMonitor;
128 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
129 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.GetRegionInfoResponse.CompactionState;
130 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
131 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall;
132 import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.RegionLoad;
133 import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.StoreSequenceId;
134 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
135 import org.apache.hadoop.hbase.protobuf.generated.WALProtos;
136 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.CompactionDescriptor;
137 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FlushDescriptor;
138 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FlushDescriptor.FlushAction;
139 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FlushDescriptor.StoreFlushDescriptor;
140 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.RegionEventDescriptor;
141 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.RegionEventDescriptor.EventType;
142 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.StoreDescriptor;
143 import org.apache.hadoop.hbase.regionserver.InternalScanner.NextState;
144 import org.apache.hadoop.hbase.regionserver.MultiVersionConsistencyControl.WriteEntry;
145 import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext;
146 import org.apache.hadoop.hbase.regionserver.compactions.CompactionThroughputController;
147 import org.apache.hadoop.hbase.regionserver.compactions.NoLimitCompactionThroughputController;
148 import org.apache.hadoop.hbase.regionserver.wal.HLogKey;
149 import org.apache.hadoop.hbase.regionserver.wal.ReplayHLogKey;
150 import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
151 import org.apache.hadoop.hbase.regionserver.wal.WALUtil;
152 import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
153 import org.apache.hadoop.hbase.snapshot.SnapshotManifest;
154 import org.apache.hadoop.hbase.util.ByteStringer;
155 import org.apache.hadoop.hbase.util.Bytes;
156 import org.apache.hadoop.hbase.util.CancelableProgressable;
157 import org.apache.hadoop.hbase.util.ClassSize;
158 import org.apache.hadoop.hbase.util.CompressionTest;
159 import org.apache.hadoop.hbase.util.Counter;
160 import org.apache.hadoop.hbase.util.EncryptionTest;
161 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
162 import org.apache.hadoop.hbase.util.FSTableDescriptors;
163 import org.apache.hadoop.hbase.util.FSUtils;
164 import org.apache.hadoop.hbase.util.HashedBytes;
165 import org.apache.hadoop.hbase.util.Pair;
166 import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil;
167 import org.apache.hadoop.hbase.util.Threads;
168 import org.apache.hadoop.hbase.wal.WAL;
169 import org.apache.hadoop.hbase.wal.WALFactory;
170 import org.apache.hadoop.hbase.wal.WALKey;
171 import org.apache.hadoop.hbase.wal.WALSplitter;
172 import org.apache.hadoop.hbase.wal.WALSplitter.MutationReplay;
173 import org.apache.hadoop.io.MultipleIOException;
174 import org.apache.hadoop.util.StringUtils;
175 import org.apache.htrace.Trace;
176 import org.apache.htrace.TraceScope;
177 
178 import com.google.common.annotations.VisibleForTesting;
179 import com.google.common.base.Optional;
180 import com.google.common.base.Preconditions;
181 import com.google.common.collect.Lists;
182 import com.google.common.collect.Maps;
183 import com.google.common.io.Closeables;
184 import com.google.protobuf.ByteString;
185 import com.google.protobuf.Descriptors;
186 import com.google.protobuf.Message;
187 import com.google.protobuf.RpcCallback;
188 import com.google.protobuf.RpcController;
189 import com.google.protobuf.Service;
190 import com.google.protobuf.TextFormat;
191 
192 /**
193  * HRegion stores data for a certain region of a table.  It stores all columns
194  * for each row. A given table consists of one or more HRegions.
195  *
196  * <p>We maintain multiple HStores for a single HRegion.
197  *
198  * <p>A Store is a set of rows with some column data; together,
199  * they make up all the data for the rows.
200  *
201  * <p>Each HRegion has a 'startKey' and 'endKey'.
202  * <p>The first is inclusive, the second is exclusive (except for
203  * the final region).  The endKey of region 0 is the same as
204  * startKey for region 1 (if it exists).  The startKey for the
205  * first region is null. The endKey for the final region is null.
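 * <p>As an illustrative example (a hypothetical split, not taken from any particular table): a
 * table divided into three regions could have key ranges [null, "b"), ["b", "m") and ["m", null);
 * a row with key "ba" then belongs to the second region, because start keys are inclusive and
 * end keys are exclusive.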
206  *
207  * <p>Locking at the HRegion level serves only one purpose: preventing the
208  * region from being closed (and consequently split) while other operations
209  * are ongoing. Each row level operation obtains both a row lock and a region
210  * read lock for the duration of the operation. While a scanner is being
211  * constructed, getScanner holds a read lock. If the scanner is successfully
212  * constructed, it holds a read lock until it is closed. A close takes out a
213  * write lock and consequently will block for ongoing operations and will block
214  * new operations from starting while the close is in progress.
215  *
216  * <p>An HRegion is defined by its table and its key extent.
217  *
218  * <p>It consists of at least one Store.  The number of Stores should be
219  * configurable, so that data which is accessed together is stored in the same
220  * Store.  Right now, we approximate that by building a single Store for
221  * each column family.  (This config info will be communicated via the
222  * tabledesc.)
223  *
224  * <p>The HTableDescriptor contains metainfo about the HRegion's table.
225  * regionName is a unique identifier for this HRegion. [startKey, endKey)
226  * defines the keyspace for this HRegion.
227  */
228 @InterfaceAudience.Private
229 public class HRegion implements HeapSize, PropagatingConfigurationObserver { // , Writable{
230   public static final Log LOG = LogFactory.getLog(HRegion.class);
231 
232   public static final String LOAD_CFS_ON_DEMAND_CONFIG_KEY =
233       "hbase.hregion.scan.loadColumnFamiliesOnDemand";
234 
235   /**
236    * This is the global default value for durability. All tables/mutations not
237    * defining a durability or using USE_DEFAULT will default to this value.
238    */
239   private static final Durability DEFAULT_DURABILITY = Durability.SYNC_WAL;
240 
241   final AtomicBoolean closed = new AtomicBoolean(false);
242   /* Closing can take some time; use the closing flag if there is stuff we don't
243    * want to do while in closing state; e.g. offering this region up to the
244    * master as a region to close if the carrying regionserver is overloaded.
245    * Once set, it is never cleared.
246    */
247   final AtomicBoolean closing = new AtomicBoolean(false);
248 
249   /**
250    * The max sequence id of flushed data on this region.  Used when doing some rough calculations on
251    * whether it is time to flush or not.
252    */
253   private volatile long maxFlushedSeqId = HConstants.NO_SEQNUM;
254 
255   /**
256    * Record the sequence id of last flush operation.
257    */
258   private volatile long lastFlushOpSeqId = HConstants.NO_SEQNUM;
259   /**
260    * Region scoped edit sequence Id. Edits to this region are GUARANTEED to appear in the WAL
261    * file in this sequence id's order; i.e. edit #2 will be in the WAL after edit #1.
262    * Its default value is -1L. This default is used as a marker to indicate
263    * that the region hasn't opened yet. Once it is opened, it is set to the derived
264    * {@link #openSeqNum}, the largest sequence id of all hfiles opened under this Region.
265    *
266    * <p>Control of this sequence is handed off to the WAL implementation.  It is responsible
267    * for tagging edits with the correct sequence id since it is responsible for getting the
268    * edits into the WAL files. It controls updating the sequence id value.  DO NOT UPDATE IT
269    * OUTSIDE OF THE WAL.  The value you get will not be what you think it is.
270    */
271   private final AtomicLong sequenceId = new AtomicLong(-1L);
272 
273   /**
274    * The sequence id of the last replayed open region event from the primary region. This is used
275    * to skip entries with earlier sequence ids, since replayed edits may come out of order from
276    * replication.
277    */
278   protected volatile long lastReplayedOpenRegionSeqId = -1L;
279   protected volatile long lastReplayedCompactionSeqId = -1L;
280 
281   /**
282    * Operation enum is used in {@link HRegion#startRegionOperation} to provide operation context for
283    * startRegionOperation to possibly invoke different checks before any region operations. Not all
284    * operations have to be defined here. It's only needed when a special check is needed in
285    * startRegionOperation.
286    */
287   public enum Operation {
288     ANY, GET, PUT, DELETE, SCAN, APPEND, INCREMENT, SPLIT_REGION, MERGE_REGION, BATCH_MUTATE,
289     REPLAY_BATCH_MUTATE, COMPACT_REGION, REPLAY_EVENT
290   }
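
  // Illustrative sketch: Operation is passed to startRegionOperation so region-level checks can
  // run before the work starts. A caller is assumed to look roughly like the following; the exact
  // method names and signatures on this class may differ.
  //
  //   startRegionOperation(Operation.SCAN);
  //   try {
  //     // ... do the scan against this region ...
  //   } finally {
  //     closeRegionOperation();
  //   }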
291 
292   //////////////////////////////////////////////////////////////////////////////
293   // Members
294   //////////////////////////////////////////////////////////////////////////////
295 
296   // map from a locked row to the context for that lock including:
297   // - CountDownLatch for threads waiting on that row
298   // - the thread that owns the lock (allow reentrancy)
299   // - reference count of (reentrant) locks held by the thread
300   // - the row itself
301   private final ConcurrentHashMap<HashedBytes, RowLockContext> lockedRows =
302       new ConcurrentHashMap<HashedBytes, RowLockContext>();
303 
304   protected final Map<byte[], Store> stores = new ConcurrentSkipListMap<byte[], Store>(
305       Bytes.BYTES_RAWCOMPARATOR);
306 
307   // TODO: account for each registered handler in HeapSize computation
308   private Map<String, Service> coprocessorServiceHandlers = Maps.newHashMap();
309 
310   public final AtomicLong memstoreSize = new AtomicLong(0);
311 
312   // Debug possible data loss due to WAL off
313   final Counter numMutationsWithoutWAL = new Counter();
314   final Counter dataInMemoryWithoutWAL = new Counter();
315 
316   // Debug why CAS operations are taking a while.
317   final Counter checkAndMutateChecksPassed = new Counter();
318   final Counter checkAndMutateChecksFailed = new Counter();
319 
320   // Number of requests
321   final Counter readRequestsCount = new Counter();
322   final Counter writeRequestsCount = new Counter();
323 
324   // Number of requests blocked by memstore size.
325   private final Counter blockedRequestsCount = new Counter();
326 
327   /**
328    * @return the number of blocked requests.
329    */
330   public long getBlockedRequestsCount() {
331     return this.blockedRequestsCount.get();
332   }
333 
334   // Compaction counters
335   final AtomicLong compactionsFinished = new AtomicLong(0L);
336   final AtomicLong compactionNumFilesCompacted = new AtomicLong(0L);
337   final AtomicLong compactionNumBytesCompacted = new AtomicLong(0L);
338 
339 
340   private final WAL wal;
341   private final HRegionFileSystem fs;
342   protected final Configuration conf;
343   private final Configuration baseConf;
344   private final KeyValue.KVComparator comparator;
345   private final int rowLockWaitDuration;
346   static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000;
347 
348   // The internal wait duration to acquire a lock before read/update
349   // from the region. It is not per row. The purpose of this wait time
350   // is to avoid waiting a long time while the region is busy, so that
351   // we can release the IPC handler soon enough to improve the
352   // availability of the region server. It can be adjusted by
353   // tuning configuration "hbase.busy.wait.duration".
354   final long busyWaitDuration;
355   static final long DEFAULT_BUSY_WAIT_DURATION = HConstants.DEFAULT_HBASE_RPC_TIMEOUT;
356 
357   // If updating multiple rows in one call, wait longer,
358   // i.e. waiting for busyWaitDuration * # of rows. However,
359   // we can limit the max multiplier.
360   final int maxBusyWaitMultiplier;
361 
362   // Max busy wait duration. There is no point in waiting longer than the RPC
363   // purge timeout, after which an RPC call will be terminated by the RPC engine.
364   final long maxBusyWaitDuration;
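
  // Worked example (the exact wait formula is an assumption inferred from the comments above):
  // with busyWaitDuration = 1000 ms, maxBusyWaitMultiplier = 2 and maxBusyWaitDuration = 30000 ms,
  // a call touching 5 rows would wait roughly min(30000, 1000 * min(5, 2)) = 2000 ms before
  // failing with RegionTooBusyException.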
365 
366   // negative number indicates infinite timeout
367   static final long DEFAULT_ROW_PROCESSOR_TIMEOUT = 60 * 1000L;
368   final ExecutorService rowProcessorExecutor = Executors.newCachedThreadPool();
369 
370   private final ConcurrentHashMap<RegionScanner, Long> scannerReadPoints;
371 
372   /**
373    * The sequence ID that was encountered when this region was opened.
374    */
375   private long openSeqNum = HConstants.NO_SEQNUM;
376 
377   /**
378    * The default setting for whether to enable on-demand CF loading for
379    * scan requests to this region. Requests can override it.
380    */
381   private boolean isLoadingCfsOnDemandDefault = false;
382 
383   private final AtomicInteger majorInProgress = new AtomicInteger(0);
384   private final AtomicInteger minorInProgress = new AtomicInteger(0);
385 
386   //
387   // Context: During replay we want to ensure that we do not lose any data. So, we
388   // have to be conservative in how we replay wals. For each store, we calculate
389   // the maxSeqId up to which the store was flushed. And, skip the edits which
390   // are equal to or lower than maxSeqId for each store.
391   // The following map is populated when opening the region
392   Map<byte[], Long> maxSeqIdInStores = new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
393 
394   /** Saved state from replaying prepare flush cache */
395   private PrepareFlushResult prepareFlushResult = null;
396 
397   /**
398    * Config setting for whether to allow writes when a region is in recovering or not.
399    */
400   private boolean disallowWritesInRecovering = false;
401 
402   // when a region is in recovering state, it can only accept writes not reads
403   private volatile boolean isRecovering = false;
404 
405   private volatile Optional<ConfigurationManager> configurationManager;
406 
407   /**
408    * @return The smallest mvcc readPoint across all the scanners in this
409    * region. Writes older than this readPoint are included in every
410    * read operation.
411    */
412   public long getSmallestReadPoint() {
413     long minimumReadPoint;
414     // We need to ensure that while we are calculating the smallestReadPoint
415     // no new RegionScanners can grab a readPoint that we are unaware of.
416     // We achieve this by synchronizing on the scannerReadPoints object.
417     synchronized(scannerReadPoints) {
418       minimumReadPoint = mvcc.memstoreReadPoint();
419 
420       for (Long readPoint: this.scannerReadPoints.values()) {
421         if (readPoint < minimumReadPoint) {
422           minimumReadPoint = readPoint;
423         }
424       }
425     }
426     return minimumReadPoint;
427   }
428   /*
429    * Data structure of write state flags used for coordinating flushes,
430    * compactions and closes.
431    */
432   static class WriteState {
433     // Set while a memstore flush is happening.
434     volatile boolean flushing = false;
435     // Set when a flush has been requested.
436     volatile boolean flushRequested = false;
437     // Number of compactions running.
438     volatile int compacting = 0;
439     // Cleared in close. Once cleared, the region cannot compact or flush again.
440     volatile boolean writesEnabled = true;
441     // Set if region is read-only
442     volatile boolean readOnly = false;
443     // whether the reads are enabled. This is different than readOnly, because readOnly is
444     // static in the lifetime of the region, while readsEnabled is dynamic
445     volatile boolean readsEnabled = true;
446 
447     /**
448      * Set flags that make this region read-only.
449      *
450      * @param onOff flip value for region r/o setting
451      */
452     synchronized void setReadOnly(final boolean onOff) {
453       this.writesEnabled = !onOff;
454       this.readOnly = onOff;
455     }
456 
457     boolean isReadOnly() {
458       return this.readOnly;
459     }
460 
461     boolean isFlushRequested() {
462       return this.flushRequested;
463     }
464 
465     void setReadsEnabled(boolean readsEnabled) {
466       this.readsEnabled = readsEnabled;
467     }
468 
469     static final long HEAP_SIZE = ClassSize.align(
470         ClassSize.OBJECT + 5 * Bytes.SIZEOF_BOOLEAN);
471   }
472 
473   /**
474    * Objects of this class are created when flushing to describe all the different states that
475    * the flush method can end up in. The Result enum describes those states. The sequence id should only
476    * be specified if the flush was successful, and the failure message should only be specified
477    * if it didn't flush.
478    */
479   public static class FlushResult {
480     enum Result {
481       FLUSHED_NO_COMPACTION_NEEDED,
482       FLUSHED_COMPACTION_NEEDED,
483       // Special case where a flush didn't run because there's nothing in the memstores. Used when
484       // bulk loading to know when we can still load even if a flush didn't happen.
485       CANNOT_FLUSH_MEMSTORE_EMPTY,
486       CANNOT_FLUSH
487       // Be careful adding more to this enum, look at the below methods to make sure
488     }
489 
490     final Result result;
491     final String failureReason;
492     final long flushSequenceId;
493     final boolean wroteFlushWalMarker;
494 
495     /**
496      * Convenience constructor to use when the flush is successful; the failure message is set to
497      * null.
498      * @param result Expecting FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
499      * @param flushSequenceId Generated sequence id that comes right after the edits in the
500      *                        memstores.
501      */
502     FlushResult(Result result, long flushSequenceId) {
503       this(result, flushSequenceId, null, false);
504       assert result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
505           .FLUSHED_COMPACTION_NEEDED;
506     }
507 
508     /**
509      * Convenience constructor to use when we cannot flush.
510      * @param result Expecting CANNOT_FLUSH_MEMSTORE_EMPTY or CANNOT_FLUSH.
511      * @param failureReason Reason why we couldn't flush.
512      */
513     FlushResult(Result result, String failureReason, boolean wroteFlushMarker) {
514       this(result, -1, failureReason, wroteFlushMarker);
515       assert result == Result.CANNOT_FLUSH_MEMSTORE_EMPTY || result == Result.CANNOT_FLUSH;
516     }
517 
518     /**
519      * Constructor with all the parameters.
520      * @param result Any of the Result values.
521      * @param flushSequenceId Generated sequence id if the memstores were flushed else -1.
522      * @param failureReason Reason why we couldn't flush, or null.
523      */
524     FlushResult(Result result, long flushSequenceId, String failureReason,
525       boolean wroteFlushMarker) {
526       this.result = result;
527       this.flushSequenceId = flushSequenceId;
528       this.failureReason = failureReason;
529       this.wroteFlushWalMarker = wroteFlushMarker;
530     }
531 
532     /**
533      * Convenience method, the equivalent of checking if result is
534      * FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
535      * @return true if the memstores were flushed, else false.
536      */
537     public boolean isFlushSucceeded() {
538       return result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
539           .FLUSHED_COMPACTION_NEEDED;
540     }
541 
542     /**
543      * Convenience method, the equivalent of checking if result is FLUSHED_COMPACTION_NEEDED.
544      * @return True if the flush requested a compaction, else false (a false return does not even mean a flush happened).
545      */
546     public boolean isCompactionNeeded() {
547       return result == Result.FLUSHED_COMPACTION_NEEDED;
548     }
549 
550     @Override
551     public String toString() {
552       return new StringBuilder()
553         .append("flush result:").append(result).append(", ")
554         .append("failureReason:").append(failureReason).append(", ")
555         .append("flush seq id:").append(flushSequenceId).toString();
556     }
557   }
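
  // Illustrative sketch of a hypothetical caller (it assumes a flush method on this class that
  // returns a FlushResult; names may differ):
  //
  //   FlushResult res = region.flushcache();
  //   if (res.isFlushSucceeded()) {
  //     LOG.info("Flushed up to sequence id " + res.flushSequenceId);
  //   }
  //   if (res.isCompactionNeeded()) {
  //     // ask the compaction thread to schedule a compaction of this region
  //   }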
558 
559   /** A result object from prepare flush cache stage */
560   @VisibleForTesting
561   static class PrepareFlushResult {
562     final FlushResult result; // indicating a failure result from prepare
563     final TreeMap<byte[], StoreFlushContext> storeFlushCtxs;
564     final TreeMap<byte[], List<Path>> committedFiles;
565     final long startTime;
566     final long flushOpSeqId;
567     final long flushedSeqId;
568     final long totalFlushableSize;
569 
570     /** Constructs an early exit case */
571     PrepareFlushResult(FlushResult result, long flushSeqId) {
572       this(result, null, null, Math.max(0, flushSeqId), 0, 0, 0);
573     }
574 
575     /** Constructs a successful prepare flush result */
576     PrepareFlushResult(
577       TreeMap<byte[], StoreFlushContext> storeFlushCtxs,
578       TreeMap<byte[], List<Path>> committedFiles, long startTime, long flushSeqId,
579       long flushedSeqId, long totalFlushableSize) {
580       this(null, storeFlushCtxs, committedFiles, startTime,
581         flushSeqId, flushedSeqId, totalFlushableSize);
582     }
583 
584     private PrepareFlushResult(
585       FlushResult result,
586       TreeMap<byte[], StoreFlushContext> storeFlushCtxs,
587       TreeMap<byte[], List<Path>> committedFiles, long startTime, long flushSeqId,
588       long flushedSeqId, long totalFlushableSize) {
589       this.result = result;
590       this.storeFlushCtxs = storeFlushCtxs;
591       this.committedFiles = committedFiles;
592       this.startTime = startTime;
593       this.flushOpSeqId = flushSeqId;
594       this.flushedSeqId = flushedSeqId;
595       this.totalFlushableSize = totalFlushableSize;
596     }
597   }
598 
599   final WriteState writestate = new WriteState();
600 
601   long memstoreFlushSize;
602   final long timestampSlop;
603   final long rowProcessorTimeout;
604 
605   // Last flush time for each Store. Useful when we are flushing on a per-column-family basis
606   private final ConcurrentMap<Store, Long> lastStoreFlushTimeMap =
607       new ConcurrentHashMap<Store, Long>();
608 
609   final RegionServerServices rsServices;
610   private RegionServerAccounting rsAccounting;
611   private long flushCheckInterval;
612   // flushPerChanges limits how many changes can accumulate in the memstore before a flush is forced
613   private long flushPerChanges;
614   private long blockingMemStoreSize;
615   final long threadWakeFrequency;
616   // Used to guard closes
617   final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
618 
619   // Stop updates lock
620   private final ReentrantReadWriteLock updatesLock = new ReentrantReadWriteLock();
621   private boolean splitRequest;
622   private byte[] explicitSplitPoint = null;
623 
624   private final MultiVersionConsistencyControl mvcc =
625       new MultiVersionConsistencyControl();
626 
627   // Coprocessor host
628   private RegionCoprocessorHost coprocessorHost;
629 
630   private HTableDescriptor htableDescriptor = null;
631   private RegionSplitPolicy splitPolicy;
632   private FlushPolicy flushPolicy;
633 
634   private final MetricsRegion metricsRegion;
635   private final MetricsRegionWrapperImpl metricsRegionWrapper;
636   private final Durability durability;
637   private final boolean regionStatsEnabled;
638 
639   /**
640    * HRegion constructor. This constructor should only be used for testing and
641    * extensions.  Instances of HRegion should be instantiated with the
642    * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
643    *
644    * @param tableDir qualified path of directory where region should be located,
645    * usually the table directory.
646    * @param wal The WAL is the outbound log for any updates to the HRegion
647    * The wal file is a logfile from the previous execution that's
648    * custom-computed for this HRegion. The HRegionServer computes and sorts the
649    * appropriate wal info for this HRegion. If there is a previous wal file
650    * (implying that the HRegion has been written-to before), then read it from
651    * the supplied path.
652    * @param fs is the filesystem.
653    * @param confParam is global configuration settings.
654    * @param regionInfo - HRegionInfo that describes the region
656    * @param htd the table descriptor
657    * @param rsServices reference to {@link RegionServerServices} or null
658    */
659   @Deprecated
660   public HRegion(final Path tableDir, final WAL wal, final FileSystem fs,
661       final Configuration confParam, final HRegionInfo regionInfo,
662       final HTableDescriptor htd, final RegionServerServices rsServices) {
663     this(new HRegionFileSystem(confParam, fs, tableDir, regionInfo),
664       wal, confParam, htd, rsServices);
665   }
666 
667   /**
668    * HRegion constructor. This constructor should only be used for testing and
669    * extensions.  Instances of HRegion should be instantiated with the
670    * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
671    *
672    * @param fs is the filesystem.
673    * @param wal The WAL is the outbound log for any updates to the HRegion
674    * The wal file is a logfile from the previous execution that's
675    * custom-computed for this HRegion. The HRegionServer computes and sorts the
676    * appropriate wal info for this HRegion. If there is a previous wal file
677    * (implying that the HRegion has been written-to before), then read it from
678    * the supplied path.
679    * @param confParam is global configuration settings.
680    * @param htd the table descriptor
681    * @param rsServices reference to {@link RegionServerServices} or null
682    */
683   public HRegion(final HRegionFileSystem fs, final WAL wal, final Configuration confParam,
684       final HTableDescriptor htd, final RegionServerServices rsServices) {
685     if (htd == null) {
686       throw new IllegalArgumentException("Need table descriptor");
687     }
688 
689     if (confParam instanceof CompoundConfiguration) {
690       throw new IllegalArgumentException("Need original base configuration");
691     }
692 
693     this.comparator = fs.getRegionInfo().getComparator();
694     this.wal = wal;
695     this.fs = fs;
696 
697     // 'conf' renamed to 'confParam' b/c we use this.conf in the constructor
698     this.baseConf = confParam;
699     this.conf = new CompoundConfiguration()
700       .add(confParam)
701       .addStringMap(htd.getConfiguration())
702       .addBytesMap(htd.getValues());
703     this.flushCheckInterval = conf.getInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL,
704         DEFAULT_CACHE_FLUSH_INTERVAL);
705     this.flushPerChanges = conf.getLong(MEMSTORE_FLUSH_PER_CHANGES, DEFAULT_FLUSH_PER_CHANGES);
706     if (this.flushPerChanges > MAX_FLUSH_PER_CHANGES) {
707       throw new IllegalArgumentException(MEMSTORE_FLUSH_PER_CHANGES + " can not exceed "
708           + MAX_FLUSH_PER_CHANGES);
709     }
710     this.rowLockWaitDuration = conf.getInt("hbase.rowlock.wait.duration",
711                     DEFAULT_ROWLOCK_WAIT_DURATION);
712 
713     this.isLoadingCfsOnDemandDefault = conf.getBoolean(LOAD_CFS_ON_DEMAND_CONFIG_KEY, true);
714     this.htableDescriptor = htd;
715     this.rsServices = rsServices;
716     this.threadWakeFrequency = conf.getLong(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000);
717     setHTableSpecificConf();
718     this.scannerReadPoints = new ConcurrentHashMap<RegionScanner, Long>();
719 
720     this.busyWaitDuration = conf.getLong(
721       "hbase.busy.wait.duration", DEFAULT_BUSY_WAIT_DURATION);
722     this.maxBusyWaitMultiplier = conf.getInt("hbase.busy.wait.multiplier.max", 2);
723     if (busyWaitDuration * maxBusyWaitMultiplier <= 0L) {
724       throw new IllegalArgumentException("Invalid hbase.busy.wait.duration ("
725         + busyWaitDuration + ") or hbase.busy.wait.multiplier.max ("
726         + maxBusyWaitMultiplier + "). Their product should be positive");
727     }
728     this.maxBusyWaitDuration = conf.getLong("hbase.ipc.client.call.purge.timeout",
729       2 * HConstants.DEFAULT_HBASE_RPC_TIMEOUT);
730 
731     /*
732      * timestamp.slop provides a server-side constraint on the timestamp. This
733      * assumes that you base your TS around currentTimeMillis(). In this case,
734      * throw an error to the user if the user-specified TS is newer than now +
735      * slop. LATEST_TIMESTAMP == don't use this functionality
736      */
737     this.timestampSlop = conf.getLong(
738         "hbase.hregion.keyvalue.timestamp.slop.millisecs",
739         HConstants.LATEST_TIMESTAMP);
740 
741     /**
742      * Timeout for the process time in processRowsWithLocks().
743      * Use -1 to switch off time bound.
744      */
745     this.rowProcessorTimeout = conf.getLong(
746         "hbase.hregion.row.processor.timeout", DEFAULT_ROW_PROCESSOR_TIMEOUT);
747     this.durability = htd.getDurability() == Durability.USE_DEFAULT
748         ? DEFAULT_DURABILITY
749         : htd.getDurability();
750     if (rsServices != null) {
751       this.rsAccounting = this.rsServices.getRegionServerAccounting();
752       // don't initialize coprocessors if not running within a regionserver
753       // TODO: revisit if coprocessors should load in other cases
754       this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf);
755       this.metricsRegionWrapper = new MetricsRegionWrapperImpl(this);
756       this.metricsRegion = new MetricsRegion(this.metricsRegionWrapper);
757 
758       Map<String, HRegion> recoveringRegions = rsServices.getRecoveringRegions();
759       String encodedName = getRegionInfo().getEncodedName();
760       if (recoveringRegions != null && recoveringRegions.containsKey(encodedName)) {
761         this.isRecovering = true;
762         recoveringRegions.put(encodedName, this);
763       }
764     } else {
765       this.metricsRegionWrapper = null;
766       this.metricsRegion = null;
767     }
768     if (LOG.isDebugEnabled()) {
769       // Write out region name as string and its encoded name.
770       LOG.debug("Instantiated " + this);
771     }
772 
773     // by default, we allow writes against a region while it is in recovering state
774     this.disallowWritesInRecovering =
775         conf.getBoolean(HConstants.DISALLOW_WRITES_IN_RECOVERING,
776           HConstants.DEFAULT_DISALLOW_WRITES_IN_RECOVERING_CONFIG);
777     configurationManager = Optional.absent();
778 
779     // disable stats tracking system tables, but check the config for everything else
780     this.regionStatsEnabled = htd.getTableName().getNamespaceAsString().equals(
781         NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR) ?
782           false :
783           conf.getBoolean(HConstants.ENABLE_CLIENT_BACKPRESSURE,
784               HConstants.DEFAULT_ENABLE_CLIENT_BACKPRESSURE);
785   }
786 
787   void setHTableSpecificConf() {
788     if (this.htableDescriptor == null) return;
789     long flushSize = this.htableDescriptor.getMemStoreFlushSize();
790 
791     if (flushSize <= 0) {
792       flushSize = conf.getLong(HConstants.HREGION_MEMSTORE_FLUSH_SIZE,
793         HTableDescriptor.DEFAULT_MEMSTORE_FLUSH_SIZE);
794     }
795     this.memstoreFlushSize = flushSize;
796     this.blockingMemStoreSize = this.memstoreFlushSize *
797         conf.getLong("hbase.hregion.memstore.block.multiplier", 2);
798   }
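
  // Worked example (the 128 MB figure is only an assumed table-level flush size): with
  // "hbase.hregion.memstore.block.multiplier" left at the default of 2 used above,
  // blockingMemStoreSize = 128 MB * 2 = 256 MB; once the region's memstore grows past that,
  // further updates are blocked until a flush shrinks it again.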
799 
800   /**
801    * Initialize this region.
802    * Used only by tests and SplitTransaction to reopen the region.
803    * You should use createHRegion() or openHRegion() instead.
804    * @return What the next sequence (edit) id should be.
805    * @throws IOException e
806    * @deprecated use HRegion.createHRegion() or HRegion.openHRegion()
807    */
808   @Deprecated
809   public long initialize() throws IOException {
810     return initialize(null);
811   }
812 
813   /**
814    * Initialize this region.
815    *
816    * @param reporter Tickle every so often if initialize is taking a while.
817    * @return What the next sequence (edit) id should be.
818    * @throws IOException e
819    */
820   private long initialize(final CancelableProgressable reporter) throws IOException {
821     MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this);
822     long nextSeqId = -1;
823     try {
824       nextSeqId = initializeRegionInternals(reporter, status);
825       return nextSeqId;
826     } finally {
827       // nextSeqId will be -1 if the initialization fails.
828       // Otherwise, it will be at least 0.
829       if (nextSeqId == -1) {
830         status
831             .abort("Exception during region " + this.getRegionNameAsString() + " initialization.");
832       }
833     }
834   }
835 
836   private long initializeRegionInternals(final CancelableProgressable reporter,
837       final MonitoredTask status) throws IOException {
838     if (coprocessorHost != null) {
839       status.setStatus("Running coprocessor pre-open hook");
840       coprocessorHost.preOpen();
841     }
842 
843     // Write HRI to a file in case we need to recover hbase:meta
844     status.setStatus("Writing region info on filesystem");
845     fs.checkRegionInfoOnFilesystem();
846 
847 
848 
849     // Initialize all the HStores
850     status.setStatus("Initializing all the Stores");
851     long maxSeqId = initializeRegionStores(reporter, status);
852     this.lastReplayedOpenRegionSeqId = maxSeqId;
853 
854     this.writestate.setReadOnly(ServerRegionReplicaUtil.isReadOnly(this));
855     this.writestate.flushRequested = false;
856     this.writestate.compacting = 0;
857 
858     if (this.writestate.writesEnabled) {
859       // Remove temporary data left over from old regions
860       status.setStatus("Cleaning up temporary data from old regions");
861       fs.cleanupTempDir();
862     }
863 
864     if (this.writestate.writesEnabled) {
865       status.setStatus("Cleaning up detritus from prior splits");
866       // Get rid of any splits or merges that were lost in-progress.  Clean out
867       // these directories here on open.  We may be opening a region that was
868       // being split but we crashed in the middle of it all.
869       fs.cleanupAnySplitDetritus();
870       fs.cleanupMergesDir();
871     }
872 
873     // Initialize split policy
874     this.splitPolicy = RegionSplitPolicy.create(this, conf);
875 
876     // Initialize flush policy
877     this.flushPolicy = FlushPolicyFactory.create(this, conf);
878 
879     long lastFlushTime = EnvironmentEdgeManager.currentTime();
880     for (Store store: stores.values()) {
881       this.lastStoreFlushTimeMap.put(store, lastFlushTime);
882     }
883 
884     // Use the maximum of the log sequence id or the one found in the stores
885     // (in particular, if there are no recovered edits, seqid will be -1).
886     long nextSeqid = maxSeqId;
887 
888     // In distributedLogReplay mode, we don't know the last change sequence number because region
889     // is opened before recovery completes. So we add a safety bumper to avoid new sequence numbers
890     // overlapping sequence numbers that have already been used
891     if (this.writestate.writesEnabled) {
892       nextSeqid = WALSplitter.writeRegionSequenceIdFile(this.fs.getFileSystem(), this.fs
893           .getRegionDir(), nextSeqid, (this.isRecovering ? (this.flushPerChanges + 10000000) : 1));
894     } else {
895       nextSeqid++;
896     }
897 
898     LOG.info("Onlined " + this.getRegionInfo().getShortNameToLog() +
899       "; next sequenceid=" + nextSeqid);
900 
901     // A region can be reopened if it failed a split; reset flags
902     this.closing.set(false);
903     this.closed.set(false);
904 
905     if (coprocessorHost != null) {
906       status.setStatus("Running coprocessor post-open hooks");
907       coprocessorHost.postOpen();
908     }
909 
910     status.markComplete("Region opened successfully");
911     return nextSeqid;
912   }
913 
914   private long initializeRegionStores(final CancelableProgressable reporter, MonitoredTask status)
915       throws IOException {
916     // Load in all the HStores.
917 
918     long maxSeqId = -1;
919     // initialized to -1 so that we pick up MemstoreTS from column families
920     long maxMemstoreTS = -1;
921 
922     if (!htableDescriptor.getFamilies().isEmpty()) {
923       // initialize the thread pool for opening stores in parallel.
924       ThreadPoolExecutor storeOpenerThreadPool =
925         getStoreOpenAndCloseThreadPool("StoreOpener-" + this.getRegionInfo().getShortNameToLog());
926       CompletionService<HStore> completionService =
927         new ExecutorCompletionService<HStore>(storeOpenerThreadPool);
928 
929       // initialize each store in parallel
930       for (final HColumnDescriptor family : htableDescriptor.getFamilies()) {
931         status.setStatus("Instantiating store for column family " + family);
932         completionService.submit(new Callable<HStore>() {
933           @Override
934           public HStore call() throws IOException {
935             return instantiateHStore(family);
936           }
937         });
938       }
939       boolean allStoresOpened = false;
940       try {
941         for (int i = 0; i < htableDescriptor.getFamilies().size(); i++) {
942           Future<HStore> future = completionService.take();
943           HStore store = future.get();
944           this.stores.put(store.getColumnFamilyName().getBytes(), store);
945 
946           long storeMaxSequenceId = store.getMaxSequenceId();
947           maxSeqIdInStores.put(store.getColumnFamilyName().getBytes(),
948               storeMaxSequenceId);
949           if (maxSeqId == -1 || storeMaxSequenceId > maxSeqId) {
950             maxSeqId = storeMaxSequenceId;
951           }
952           long maxStoreMemstoreTS = store.getMaxMemstoreTS();
953           if (maxStoreMemstoreTS > maxMemstoreTS) {
954             maxMemstoreTS = maxStoreMemstoreTS;
955           }
956         }
957         allStoresOpened = true;
958       } catch (InterruptedException e) {
959         throw (InterruptedIOException)new InterruptedIOException().initCause(e);
960       } catch (ExecutionException e) {
961         throw new IOException(e.getCause());
962       } finally {
963         storeOpenerThreadPool.shutdownNow();
964         if (!allStoresOpened) {
965           // something went wrong, close all opened stores
966           LOG.error("Could not initialize all stores for the region=" + this);
967           for (Store store : this.stores.values()) {
968             try {
969               store.close();
970             } catch (IOException e) {
971               LOG.warn(e.getMessage());
972             }
973           }
974         }
975       }
976     }
977     if (ServerRegionReplicaUtil.shouldReplayRecoveredEdits(this)) {
978       // Recover any edits if available.
979       maxSeqId = Math.max(maxSeqId, replayRecoveredEditsIfAny(
980           this.fs.getRegionDir(), maxSeqIdInStores, reporter, status));
981     }
982     maxSeqId = Math.max(maxSeqId, maxMemstoreTS + 1);
983     mvcc.initialize(maxSeqId);
984     return maxSeqId;
985   }
986 
987   private void writeRegionOpenMarker(WAL wal, long openSeqId) throws IOException {
988     Map<byte[], List<Path>> storeFiles =
989         new TreeMap<byte[], List<Path>>(Bytes.BYTES_COMPARATOR);
990     for (Map.Entry<byte[], Store> entry : getStores().entrySet()) {
991       Store store = entry.getValue();
992       ArrayList<Path> storeFileNames = new ArrayList<Path>();
993       for (StoreFile storeFile: store.getStorefiles()) {
994         storeFileNames.add(storeFile.getPath());
995       }
996       storeFiles.put(entry.getKey(), storeFileNames);
997     }
998 
999     RegionEventDescriptor regionOpenDesc = ProtobufUtil.toRegionEventDescriptor(
1000       RegionEventDescriptor.EventType.REGION_OPEN, getRegionInfo(), openSeqId,
1001       getRegionServerServices().getServerName(), storeFiles);
1002     WALUtil.writeRegionEventMarker(wal, getTableDesc(), getRegionInfo(), regionOpenDesc,
1003       getSequenceId());
1004   }
1005 
1006   private void writeRegionCloseMarker(WAL wal) throws IOException {
1007     Map<byte[], List<Path>> storeFiles =
1008         new TreeMap<byte[], List<Path>>(Bytes.BYTES_COMPARATOR);
1009     for (Map.Entry<byte[], Store> entry : getStores().entrySet()) {
1010       Store store = entry.getValue();
1011       ArrayList<Path> storeFileNames = new ArrayList<Path>();
1012       for (StoreFile storeFile: store.getStorefiles()) {
1013         storeFileNames.add(storeFile.getPath());
1014       }
1015       storeFiles.put(entry.getKey(), storeFileNames);
1016     }
1017 
1018     RegionEventDescriptor regionEventDesc = ProtobufUtil.toRegionEventDescriptor(
1019       RegionEventDescriptor.EventType.REGION_CLOSE, getRegionInfo(), getSequenceId().get(),
1020       getRegionServerServices().getServerName(), storeFiles);
1021     WALUtil.writeRegionEventMarker(wal, getTableDesc(), getRegionInfo(), regionEventDesc,
1022       getSequenceId());
1023 
1024     // Store SeqId in HDFS when a region closes
1025     // The check that the region folder exists is there because many tests delete the table folder
1026     // while a table is still online
1027     if (this.fs.getFileSystem().exists(this.fs.getRegionDir())) {
1028       WALSplitter.writeRegionSequenceIdFile(this.fs.getFileSystem(), this.fs.getRegionDir(),
1029         getSequenceId().get(), 0);
1030     }
1031   }
1032 
1033   /**
1034    * @return True if this region has references.
1035    */
1036   public boolean hasReferences() {
1037     for (Store store : this.stores.values()) {
1038       if (store.hasReferences()) return true;
1039     }
1040     return false;
1041   }
1042 
1043   /**
1044    * This function will return the HDFS blocks distribution based on the data
1045    * captured when the HFiles were created.
1046    * @return The HDFS blocks distribution for the region.
1047    */
1048   public HDFSBlocksDistribution getHDFSBlocksDistribution() {
1049     HDFSBlocksDistribution hdfsBlocksDistribution =
1050       new HDFSBlocksDistribution();
1051     synchronized (this.stores) {
1052       for (Store store : this.stores.values()) {
1053         for (StoreFile sf : store.getStorefiles()) {
1054           HDFSBlocksDistribution storeFileBlocksDistribution =
1055             sf.getHDFSBlockDistribution();
1056           hdfsBlocksDistribution.add(storeFileBlocksDistribution);
1057         }
1058       }
1059     }
1060     return hdfsBlocksDistribution;
1061   }
1062 
1063   /**
1064    * This is a helper function to compute HDFS block distribution on demand
1065    * @param conf configuration
1066    * @param tableDescriptor HTableDescriptor of the table
1067    * @param regionInfo the HRegionInfo describing the region
1068    * @return The HDFS blocks distribution for the given region.
1069    * @throws IOException
1070    */
1071   public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf,
1072       final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo) throws IOException {
1073     Path tablePath = FSUtils.getTableDir(FSUtils.getRootDir(conf), tableDescriptor.getTableName());
1074     return computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo, tablePath);
1075   }
1076 
1077   /**
1078    * This is a helper function to compute HDFS block distribution on demand
1079    * @param conf configuration
1080    * @param tableDescriptor HTableDescriptor of the table
1081    * @param regionInfo the HRegionInfo describing the region
1082    * @param tablePath the table directory
1083    * @return The HDFS blocks distribution for the given region.
1084    * @throws IOException
1085    */
1086   public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf,
1087       final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo,  Path tablePath)
1088       throws IOException {
1089     HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution();
1090     FileSystem fs = tablePath.getFileSystem(conf);
1091 
1092     HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tablePath, regionInfo);
1093     for (HColumnDescriptor family: tableDescriptor.getFamilies()) {
1094       Collection<StoreFileInfo> storeFiles = regionFs.getStoreFiles(family.getNameAsString());
1095       if (storeFiles == null) continue;
1096 
1097       for (StoreFileInfo storeFileInfo : storeFiles) {
1098         hdfsBlocksDistribution.add(storeFileInfo.computeHDFSBlocksDistribution(fs));
1099       }
1100     }
1101     return hdfsBlocksDistribution;
1102   }
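
  // Illustrative usage sketch (the getBlockLocalityIndex accessor on HDFSBlocksDistribution is
  // assumed): the distribution computed above is typically used to estimate how local a region's
  // data is to a given host.
  //
  //   HDFSBlocksDistribution dist =
  //       HRegion.computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo);
  //   float locality = dist.getBlockLocalityIndex("regionserver-1.example.com");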
1103 
1104   public AtomicLong getMemstoreSize() {
1105     return memstoreSize;
1106   }
1107 
1108   /**
1109    * Increase the size of the memstore in this region and the size of the global
1110    * memstore.
1111    * @return the new size of the memstore in this region
1112    */
1113   public long addAndGetGlobalMemstoreSize(long memStoreSize) {
1114     if (this.rsAccounting != null) {
1115       rsAccounting.addAndGetGlobalMemstoreSize(memStoreSize);
1116     }
1117     return this.memstoreSize.addAndGet(memStoreSize);
1118   }
1119 
1120   /** @return a HRegionInfo object for this region */
1121   public HRegionInfo getRegionInfo() {
1122     return this.fs.getRegionInfo();
1123   }
1124 
1125   /**
1126    * @return Instance of {@link RegionServerServices} used by this HRegion.
1127    * Can be null.
1128    */
1129   RegionServerServices getRegionServerServices() {
1130     return this.rsServices;
1131   }
1132 
1133   /** @return readRequestsCount for this region */
1134   long getReadRequestsCount() {
1135     return this.readRequestsCount.get();
1136   }
1137 
1138   /** @return writeRequestsCount for this region */
1139   long getWriteRequestsCount() {
1140     return this.writeRequestsCount.get();
1141   }
1142 
1143   public MetricsRegion getMetrics() {
1144     return metricsRegion;
1145   }
1146 
1147   /** @return true if region is closed */
1148   public boolean isClosed() {
1149     return this.closed.get();
1150   }
1151 
1152   /**
1153    * @return True if closing process has started.
1154    */
1155   public boolean isClosing() {
1156     return this.closing.get();
1157   }
1158 
1159   /**
1160    * Set the recovering state of the current region.
1161    */
1162   public void setRecovering(boolean newState) {
1163     boolean wasRecovering = this.isRecovering;
1164     // before we flip the recovering switch (enabling reads) we should write the region open
1165     // event to WAL if needed
1166     if (wal != null && getRegionServerServices() != null && !writestate.readOnly
1167         && wasRecovering && !newState) {
1168 
1169       // force a flush only if region replication is set up for this region. Otherwise no need.
1170       boolean forceFlush = getTableDesc().getRegionReplication() > 1;
1171 
1172       // force a flush first
1173       MonitoredTask status = TaskMonitor.get().createStatus(
1174         "Flushing region " + this + " because recovery is finished");
1175       try {
1176         if (forceFlush) {
1177           internalFlushcache(status);
1178         }
1179 
1180         status.setStatus("Writing region open event marker to WAL because recovery is finished");
1181         try {
1182           long seqId = openSeqNum;
1183           // obtain a new seqId because we possibly have writes and flushes on top of openSeqNum
1184           if (wal != null) {
1185             seqId = getNextSequenceId(wal);
1186           }
1187           writeRegionOpenMarker(wal, seqId);
1188         } catch (IOException e) {
1189           // We cannot rethrow this exception since we are being called from the zk thread. The
1190           // region has already opened. In this case we log the error, but continue
1191           LOG.warn(getRegionInfo().getEncodedName() + " : was not able to write region opening "
1192               + "event to WAL, continuing", e);
1193         }
1194       } catch (IOException ioe) {
1195         // Distributed log replay semantics does not necessarily require a flush, since the replayed
1196         // data is already written again in the WAL. So failed flush should be fine.
1197         LOG.warn(getRegionInfo().getEncodedName() + " : was not able to flush "
1198             + "event to WAL, continuing", ioe);
1199       } finally {
1200         status.cleanup();
1201       }
1202     }
1203 
1204     this.isRecovering = newState;
1205     if (wasRecovering && !isRecovering) {
1206       // Call only when wal replay is over.
1207       coprocessorHost.postLogReplay();
1208     }
1209   }
1210 
1211   /**
1212    * @return True if the current region is in recovering state
1213    */
1214   public boolean isRecovering() {
1215     return this.isRecovering;
1216   }
1217 
1218   /** @return true if region is available (not closed and not closing) */
1219   public boolean isAvailable() {
1220     return !isClosed() && !isClosing();
1221   }
1222 
1223   /** @return true if region is splittable */
1224   public boolean isSplittable() {
1225     return isAvailable() && !hasReferences();
1226   }
1227 
1228   /**
1229    * @return true if region is mergeable
1230    */
1231   public boolean isMergeable() {
1232     if (!isAvailable()) {
1233       LOG.debug("Region " + this.getRegionNameAsString()
1234           + " is not mergeable because it is closing or closed");
1235       return false;
1236     }
1237     if (hasReferences()) {
1238       LOG.debug("Region " + this.getRegionNameAsString()
1239           + " is not mergeable because it has references");
1240       return false;
1241     }
1242 
1243     return true;
1244   }
1245 
1246   public boolean areWritesEnabled() {
1247     synchronized(this.writestate) {
1248       return this.writestate.writesEnabled;
1249     }
1250   }
1251 
1252   public MultiVersionConsistencyControl getMVCC() {
1253     return mvcc;
1254   }
1255 
1256   /*
1257    * Returns readpoint considering given IsolationLevel
1258    */
1259   public long getReadpoint(IsolationLevel isolationLevel) {
1260     if (isolationLevel == IsolationLevel.READ_UNCOMMITTED) {
1261       // This scan can read even uncommitted transactions
1262       return Long.MAX_VALUE;
1263     }
1264     return mvcc.memstoreReadPoint();
1265   }
1266 
1267   public boolean isLoadingCfsOnDemandDefault() {
1268     return this.isLoadingCfsOnDemandDefault;
1269   }
1270 
1271   /**
1272    * Close down this HRegion.  Flush the cache, shut down each HStore, don't
1273    * service any more calls.
1274    *
1275    * <p>This method could take some time to execute, so don't call it from a
1276    * time-sensitive thread.
1277    *
1278    * @return Map of all the storage files that the HRegion's component
1279    * HStores make use of, keyed by column family.  Can be null if we are not
1280    * to close at this time or we are already closed.
1281    *
1282    * @throws IOException e
1283    */
1284   public Map<byte[], List<StoreFile>> close() throws IOException {
1285     return close(false);
1286   }
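       // Usage sketch (illustrative, not part of the original source; 'region' is an
       // assumed reference to an open HRegion):
       //   Map<byte[], List<StoreFile>> storeFiles = region.close();
       //   if (storeFiles == null) {
       //     // the region was already closed, or it was judged that it should not close now
       //   }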
1287 
1288   private final Object closeLock = new Object();
1289 
1290   /** Conf key for the periodic flush interval */
1291   public static final String MEMSTORE_PERIODIC_FLUSH_INTERVAL =
1292       "hbase.regionserver.optionalcacheflushinterval";
1293   /** Default interval for the memstore flush, in milliseconds (one hour) */
1294   public static final int DEFAULT_CACHE_FLUSH_INTERVAL = 3600000;
1295   public static final int META_CACHE_FLUSH_INTERVAL = 300000; // 5 minutes
1296 
1297   /** Conf key to force a flush if there are already enough changes for one region in memstore */
1298   public static final String MEMSTORE_FLUSH_PER_CHANGES =
1299       "hbase.regionserver.flush.per.changes";
1300   public static final long DEFAULT_FLUSH_PER_CHANGES = 30000000; // 30 million
1301   /**
1302    * MAX_FLUSH_PER_CHANGES is large enough because each KeyValue has 20+ bytes of
1303    * overhead, so even 1G of empty KVs would occupy at least 20GB of memstore for a single region
1304    */
1305   public static final long MAX_FLUSH_PER_CHANGES = 1000000000; // 1G
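       // Configuration sketch (assumes the standard Hadoop Configuration API): these keys are
       // normally set in hbase-site.xml or on the Configuration before the region server
       // starts, for example:
       //   conf.setInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL, DEFAULT_CACHE_FLUSH_INTERVAL);
       //   conf.setLong(MEMSTORE_FLUSH_PER_CHANGES, DEFAULT_FLUSH_PER_CHANGES);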
1306 
1307   /**
1308    * Close down this HRegion.  Flush the cache unless the abort parameter is true,
1309    * shut down each HStore, and don't service any more calls.
1310    *
1311    * This method could take some time to execute, so don't call it from a
1312    * time-sensitive thread.
1313    *
1314    * @param abort true if server is aborting (only during testing)
1315    * @return Map of all the storage files that the HRegion's component
1316    * HStores make use of, keyed by column family.  Can be null if
1317    * we are not to close at this time or we are already closed.
1318    *
1319    * @throws IOException e
1320    */
1321   public Map<byte[], List<StoreFile>> close(final boolean abort) throws IOException {
1322     // Only allow one thread to close at a time. Serialize closes so that
1323     // concurrent close attempts block on each other.
1324     MonitoredTask status = TaskMonitor.get().createStatus(
1325         "Closing region " + this +
1326         (abort ? " due to abort" : ""));
1327 
1328     status.setStatus("Waiting for close lock");
1329     try {
1330       synchronized (closeLock) {
1331         return doClose(abort, status);
1332       }
1333     } finally {
1334       status.cleanup();
1335     }
1336   }
1337 
1338   private Map<byte[], List<StoreFile>> doClose(final boolean abort, MonitoredTask status)
1339       throws IOException {
1340     if (isClosed()) {
1341       LOG.warn("Region " + this + " already closed");
1342       return null;
1343     }
1344 
1345     if (coprocessorHost != null) {
1346       status.setStatus("Running coprocessor pre-close hooks");
1347       this.coprocessorHost.preClose(abort);
1348     }
1349 
1350     status.setStatus("Disabling compactions and flushes for region");
1351     boolean canFlush = true;
1352     synchronized (writestate) {
1353       // Disable compacting and flushing by background threads for this
1354       // region.
1355       canFlush = !writestate.readOnly;
1356       writestate.writesEnabled = false;
1357       LOG.debug("Closing " + this + ": disabling compactions & flushes");
1358       waitForFlushesAndCompactions();
1359     }
1360     // If we were not just flushing, is it worth doing a preflush...one
1361     // that will clear out the bulk of the memstore before we put up
1362     // the close flag?
1363     if (!abort && worthPreFlushing() && canFlush) {
1364       status.setStatus("Pre-flushing region before close");
1365       LOG.info("Running close preflush of " + this.getRegionNameAsString());
1366       try {
1367         internalFlushcache(status);
1368       } catch (IOException ioe) {
1369         // Failed to flush the region. Keep going.
1370         status.setStatus("Failed pre-flush " + this + "; " + ioe.getMessage());
1371       }
1372     }
1373 
1374     this.closing.set(true);
1375     status.setStatus("Disabling writes for close");
1376     // block waiting for the lock for closing
1377     lock.writeLock().lock();
1378     try {
1379       if (this.isClosed()) {
1380         status.abort("Already got closed by another process");
1381         // SplitTransaction handles the null
1382         return null;
1383       }
1384       LOG.debug("Updates disabled for region " + this);
1385       // Don't flush the cache if we are aborting
1386       if (!abort && canFlush) {
1387         int flushCount = 0;
1388         while (this.getMemstoreSize().get() > 0) {
1389           try {
1390             if (flushCount++ > 0) {
1391               int actualFlushes = flushCount - 1;
1392               if (actualFlushes > 5) {
1393                 // If we tried 5 times and are unable to clear memory, abort
1394                 // so we do not lose data
1395                 throw new DroppedSnapshotException("Failed clearing memory after " +
1396                   actualFlushes + " attempts on region: " + Bytes.toStringBinary(getRegionName()));
1397               }
1398               LOG.info("Running extra flush, " + actualFlushes +
1399                 " (carrying snapshot?) " + this);
1400             }
1401             internalFlushcache(status);
1402           } catch (IOException ioe) {
1403             status.setStatus("Failed flush " + this + ", putting online again");
1404             synchronized (writestate) {
1405               writestate.writesEnabled = true;
1406             }
1407             // Have to throw to upper layers.  I can't abort server from here.
1408             throw ioe;
1409           }
1410         }
1411       }
1412 
1413       Map<byte[], List<StoreFile>> result =
1414         new TreeMap<byte[], List<StoreFile>>(Bytes.BYTES_COMPARATOR);
1415       if (!stores.isEmpty()) {
1416         // initialize the thread pool for closing stores in parallel.
1417         ThreadPoolExecutor storeCloserThreadPool =
1418           getStoreOpenAndCloseThreadPool("StoreCloserThread-" + this.getRegionNameAsString());
1419         CompletionService<Pair<byte[], Collection<StoreFile>>> completionService =
1420           new ExecutorCompletionService<Pair<byte[], Collection<StoreFile>>>(storeCloserThreadPool);
1421 
1422         // close each store in parallel
1423         for (final Store store : stores.values()) {
1424           assert abort || store.getFlushableSize() == 0 || writestate.readOnly;
1425           completionService
1426               .submit(new Callable<Pair<byte[], Collection<StoreFile>>>() {
1427                 @Override
1428                 public Pair<byte[], Collection<StoreFile>> call() throws IOException {
1429                   return new Pair<byte[], Collection<StoreFile>>(
1430                     store.getFamily().getName(), store.close());
1431                 }
1432               });
1433         }
1434         try {
1435           for (int i = 0; i < stores.size(); i++) {
1436             Future<Pair<byte[], Collection<StoreFile>>> future = completionService.take();
1437             Pair<byte[], Collection<StoreFile>> storeFiles = future.get();
1438             List<StoreFile> familyFiles = result.get(storeFiles.getFirst());
1439             if (familyFiles == null) {
1440               familyFiles = new ArrayList<StoreFile>();
1441               result.put(storeFiles.getFirst(), familyFiles);
1442             }
1443             familyFiles.addAll(storeFiles.getSecond());
1444           }
1445         } catch (InterruptedException e) {
1446           throw (InterruptedIOException)new InterruptedIOException().initCause(e);
1447         } catch (ExecutionException e) {
1448           throw new IOException(e.getCause());
1449         } finally {
1450           storeCloserThreadPool.shutdownNow();
1451         }
1452       }
1453 
1454       status.setStatus("Writing region close event to WAL");
1455       if (!abort && wal != null && getRegionServerServices() != null && !writestate.readOnly) {
1456         writeRegionCloseMarker(wal);
1457       }
1458 
1459       this.closed.set(true);
1460       if (!canFlush) {
1461         addAndGetGlobalMemstoreSize(-memstoreSize.get());
1462       } else if (memstoreSize.get() != 0) {
1463         LOG.error("Memstore size is " + memstoreSize.get());
1464       }
1465       if (coprocessorHost != null) {
1466         status.setStatus("Running coprocessor post-close hooks");
1467         this.coprocessorHost.postClose(abort);
1468       }
1469       if (this.metricsRegion != null) {
1470         this.metricsRegion.close();
1471       }
1472       if (this.metricsRegionWrapper != null) {
1473         Closeables.closeQuietly(this.metricsRegionWrapper);
1474       }
1475       status.markComplete("Closed");
1476       LOG.info("Closed " + this);
1477       return result;
1478     } finally {
1479       lock.writeLock().unlock();
1480     }
1481   }
1482 
1483   /**
1484    * Wait for all current flushes and compactions of the region to complete.
1485    * <p>
1486    * Exposed for TESTING.
1487    */
1488   public void waitForFlushesAndCompactions() {
1489     synchronized (writestate) {
1490       if (this.writestate.readOnly) {
1491         // we should not wait for flushes if we are read only (for example in case the
1492         // region is a secondary replica).
1493         return;
1494       }
1495       boolean interrupted = false;
1496       try {
1497         while (writestate.compacting > 0 || writestate.flushing) {
1498           LOG.debug("waiting for " + writestate.compacting + " compactions"
1499             + (writestate.flushing ? " & cache flush" : "") + " to complete for region " + this);
1500           try {
1501             writestate.wait();
1502           } catch (InterruptedException iex) {
1503             // essentially ignore and propagate the interrupt back up
1504             LOG.warn("Interrupted while waiting");
1505             interrupted = true;
1506           }
1507         }
1508       } finally {
1509         if (interrupted) {
1510           Thread.currentThread().interrupt();
1511         }
1512       }
1513     }
1514   }
1515 
1516   protected ThreadPoolExecutor getStoreOpenAndCloseThreadPool(
1517       final String threadNamePrefix) {
1518     int numStores = Math.max(1, this.htableDescriptor.getFamilies().size());
1519     int maxThreads = Math.min(numStores,
1520         conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1521             HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX));
1522     return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1523   }
1524 
1525   protected ThreadPoolExecutor getStoreFileOpenAndCloseThreadPool(
1526       final String threadNamePrefix) {
1527     int numStores = Math.max(1, this.htableDescriptor.getFamilies().size());
1528     int maxThreads = Math.max(1,
1529         conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1530             HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX)
1531             / numStores);
1532     return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1533   }
1534 
1535   static ThreadPoolExecutor getOpenAndCloseThreadPool(int maxThreads,
1536       final String threadNamePrefix) {
1537     return Threads.getBoundedCachedThreadPool(maxThreads, 30L, TimeUnit.SECONDS,
1538       new ThreadFactory() {
1539         private int count = 1;
1540 
1541         @Override
1542         public Thread newThread(Runnable r) {
1543           return new Thread(r, threadNamePrefix + "-" + count++);
1544         }
1545       });
1546   }
1547 
1548   /**
1549    * @return True if it's worth doing a flush before we put up the close flag.
1550    */
1551   private boolean worthPreFlushing() {
1552     return this.memstoreSize.get() >
1553       this.conf.getLong("hbase.hregion.preclose.flush.size", 1024 * 1024 * 5);
1554   }
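       // Note: the pre-close flush threshold above defaults to 5 MB (1024 * 1024 * 5) and can
       // be overridden with the "hbase.hregion.preclose.flush.size" configuration key.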
1555 
1556   //////////////////////////////////////////////////////////////////////////////
1557   // HRegion accessors
1558   //////////////////////////////////////////////////////////////////////////////
1559 
1560   /** @return start key for region */
1561   public byte [] getStartKey() {
1562     return this.getRegionInfo().getStartKey();
1563   }
1564 
1565   /** @return end key for region */
1566   public byte [] getEndKey() {
1567     return this.getRegionInfo().getEndKey();
1568   }
1569 
1570   /** @return region id */
1571   public long getRegionId() {
1572     return this.getRegionInfo().getRegionId();
1573   }
1574 
1575   /** @return region name */
1576   public byte [] getRegionName() {
1577     return this.getRegionInfo().getRegionName();
1578   }
1579 
1580   /** @return region name as string for logging */
1581   public String getRegionNameAsString() {
1582     return this.getRegionInfo().getRegionNameAsString();
1583   }
1584 
1585   /** @return HTableDescriptor for this region */
1586   public HTableDescriptor getTableDesc() {
1587     return this.htableDescriptor;
1588   }
1589 
1590   /** @return WAL in use for this region */
1591   public WAL getWAL() {
1592     return this.wal;
1593   }
1594 
1595   /**
1596    * @return split policy for this region.
1597    */
1598   public RegionSplitPolicy getSplitPolicy() {
1599     return this.splitPolicy;
1600   }
1601 
1602   /**
1603    * A split takes the config from the parent region & passes it to the daughter
1604    * region's constructor. If 'conf' was passed, you would end up using the HTD
1605    * of the parent region in addition to the new daughter HTD. Pass 'baseConf'
1606    * to the daughter regions to avoid this tricky dedupe problem.
1607    * @return Configuration object
1608    */
1609   Configuration getBaseConf() {
1610     return this.baseConf;
1611   }
1612 
1613   /** @return {@link FileSystem} being used by this region */
1614   public FileSystem getFilesystem() {
1615     return fs.getFileSystem();
1616   }
1617 
1618   /** @return the {@link HRegionFileSystem} used by this region */
1619   public HRegionFileSystem getRegionFileSystem() {
1620     return this.fs;
1621   }
1622 
1623   /**
1624    * @return Returns the earliest time a store in the region was flushed. All
1625    *         other stores in the region would have been flushed either at, or
1626    *         after this time.
1627    */
1628   @VisibleForTesting
1629   public long getEarliestFlushTimeForAllStores() {
1630     return lastStoreFlushTimeMap.isEmpty() ? Long.MAX_VALUE : Collections.min(lastStoreFlushTimeMap
1631         .values());
1632   }
1633 
1634   /**
1635    * This can be used to determine the last time all files of this region were major compacted.
1636    * @param majorCompactionOnly Only consider HFiles that are the result of a major compaction
1637    * @return the timestamp of the oldest HFile for all stores of this region
1638    */
1639   public long getOldestHfileTs(boolean majorCompactionOnly) throws IOException {
1640     long result = Long.MAX_VALUE;
1641     for (Store store : getStores().values()) {
1642       for (StoreFile file : store.getStorefiles()) {
1643         HFile.Reader reader = file.getReader().getHFileReader();
1644         if (majorCompactionOnly) {
1645           byte[] val = reader.loadFileInfo().get(StoreFile.MAJOR_COMPACTION_KEY);
1646           if (val == null || !Bytes.toBoolean(val)) {
1647             continue;
1648           }
1649         }
1650         result = Math.min(result, reader.getFileContext().getFileCreateTime());
1651       }
1652     }
1653     return result == Long.MAX_VALUE ? 0 : result;
1654   }
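       // Sketch of a possible caller (illustrative only): compare the returned timestamp against
       // a desired maximum HFile age to decide whether to ask for a major compaction:
       //   if (now - region.getOldestHfileTs(true) > maxHFileAge) {
       //     region.triggerMajorCompaction();
       //   }
       // 'region', 'now' and 'maxHFileAge' are assumed names, not part of this class.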
1655 
1656   RegionLoad.Builder setCompleteSequenceId(RegionLoad.Builder regionLoadBldr) {
1657     long lastFlushOpSeqIdLocal = this.lastFlushOpSeqId;
1658     byte[] encodedRegionName = this.getRegionInfo().getEncodedNameAsBytes();
1659     regionLoadBldr.clearStoreCompleteSequenceId();
1660     for (byte[] familyName : this.stores.keySet()) {
1661       long oldestUnflushedSeqId = this.wal.getEarliestMemstoreSeqNum(encodedRegionName, familyName);
1662       // no oldestUnflushedSeqId means no data has been written to the store after the last flush, so we use
1663       // lastFlushOpSeqId as the complete sequence id for the store.
1664       regionLoadBldr.addStoreCompleteSequenceId(StoreSequenceId
1665           .newBuilder()
1666           .setFamilyName(ByteString.copyFrom(familyName))
1667           .setSequenceId(
1668             oldestUnflushedSeqId < 0 ? lastFlushOpSeqIdLocal : oldestUnflushedSeqId - 1).build());
1669     }
1670     return regionLoadBldr.setCompleteSequenceId(this.maxFlushedSeqId);
1671   }
1672 
1673   //////////////////////////////////////////////////////////////////////////////
1674   // HRegion maintenance.
1675   //
1676   // These methods are meant to be called periodically by the HRegionServer for
1677   // upkeep.
1678   //////////////////////////////////////////////////////////////////////////////
1679 
1680   /** @return size of the largest HStore. */
1681   public long getLargestHStoreSize() {
1682     long size = 0;
1683     for (Store h : stores.values()) {
1684       long storeSize = h.getSize();
1685       if (storeSize > size) {
1686         size = storeSize;
1687       }
1688     }
1689     return size;
1690   }
1691 
1692   /**
1693    * @return KeyValue Comparator
1694    */
1695   public KeyValue.KVComparator getComparator() {
1696     return this.comparator;
1697   }
1698 
1699   /*
1700    * Do preparation for pending compaction.
1701    * @throws IOException
1702    */
1703   protected void doRegionCompactionPrep() throws IOException {
1704   }
1705 
1706   void triggerMajorCompaction() {
1707     for (Store h : stores.values()) {
1708       h.triggerMajorCompaction();
1709     }
1710   }
1711 
1712   /**
1713    * This is a helper function that compacts all the stores synchronously.
1714    * It is used by utilities and testing.
1715    *
1716    * @param majorCompaction True to force a major compaction regardless of thresholds
1717    * @throws IOException e
1718    */
1719   public void compactStores(final boolean majorCompaction)
1720   throws IOException {
1721     if (majorCompaction) {
1722       this.triggerMajorCompaction();
1723     }
1724     compactStores();
1725   }
1726 
1727   /**
1728    * This is a helper function that compacts all the stores synchronously.
1729    * It is used by utilities and testing.
1730    *
1731    * @throws IOException e
1732    */
1733   public void compactStores() throws IOException {
1734     for (Store s : getStores().values()) {
1735       CompactionContext compaction = s.requestCompaction();
1736       if (compaction != null) {
1737         compact(compaction, s, NoLimitCompactionThroughputController.INSTANCE);
1738       }
1739     }
1740   }
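       // Usage sketch for tests and utilities (illustrative; 'region' is an assumed HRegion):
       //   region.compactStores(true);   // force a major compaction of every store
       //   region.compactStores();       // or let each store decide what, if anything, to compact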
1741 
1742   /**
1743    * This is a helper function that compacts the given store.
1744    * It is used by utilities and testing.
1745    *
1746    * @throws IOException e
1747    */
1748   @VisibleForTesting
1749   void compactStore(byte[] family, CompactionThroughputController throughputController)
1750       throws IOException {
1751     Store s = getStore(family);
1752     CompactionContext compaction = s.requestCompaction();
1753     if (compaction != null) {
1754       compact(compaction, s, throughputController);
1755     }
1756   }
1757 
1758   /*
1759    * Called by compaction thread and after region is opened to compact the
1760    * HStores if necessary.
1761    *
1762    * <p>This operation could block for a long time, so don't call it from a
1763    * time-sensitive thread.
1764    *
1765    * Note that no locking is necessary at this level because compaction only
1766    * conflicts with a region split, and that cannot happen because the region
1767    * server does them sequentially and not in parallel.
1768    *
1769    * @param compaction Compaction details, obtained by requestCompaction()
1770    * @return whether the compaction completed
1771    */
1772   public boolean compact(CompactionContext compaction, Store store,
1773       CompactionThroughputController throughputController) throws IOException {
1774     assert compaction != null && compaction.hasSelection();
1775     assert !compaction.getRequest().getFiles().isEmpty();
1776     if (this.closing.get() || this.closed.get()) {
1777       LOG.debug("Skipping compaction on " + this + " because closing/closed");
1778       store.cancelRequestedCompaction(compaction);
1779       return false;
1780     }
1781     MonitoredTask status = null;
1782     boolean requestNeedsCancellation = true;
1783     // block waiting for the lock for compaction
1784     lock.readLock().lock();
1785     try {
1786       byte[] cf = Bytes.toBytes(store.getColumnFamilyName());
1787       if (stores.get(cf) != store) {
1788         LOG.warn("Store " + store.getColumnFamilyName() + " on region " + this
1789             + " has been re-instantiated, cancelling this compaction request."
1790             + " It may be caused by the rollback of a split transaction"
1791         return false;
1792       }
1793 
1794       status = TaskMonitor.get().createStatus("Compacting " + store + " in " + this);
1795       if (this.closed.get()) {
1796         String msg = "Skipping compaction on " + this + " because closed";
1797         LOG.debug(msg);
1798         status.abort(msg);
1799         return false;
1800       }
1801       boolean wasStateSet = false;
1802       try {
1803         synchronized (writestate) {
1804           if (writestate.writesEnabled) {
1805             wasStateSet = true;
1806             ++writestate.compacting;
1807           } else {
1808             String msg = "NOT compacting region " + this + ". Writes disabled.";
1809             LOG.info(msg);
1810             status.abort(msg);
1811             return false;
1812           }
1813         }
1814         LOG.info("Starting compaction on " + store + " in region " + this
1815             + (compaction.getRequest().isOffPeak()?" as an off-peak compaction":""));
1816         doRegionCompactionPrep();
1817         try {
1818           status.setStatus("Compacting store " + store);
1819           // We no longer need to cancel the request on the way out of this
1820           // method because Store#compact will clean up unconditionally
1821           requestNeedsCancellation = false;
1822           store.compact(compaction, throughputController);
1823         } catch (InterruptedIOException iioe) {
1824           String msg = "compaction interrupted";
1825           LOG.info(msg, iioe);
1826           status.abort(msg);
1827           return false;
1828         }
1829       } finally {
1830         if (wasStateSet) {
1831           synchronized (writestate) {
1832             --writestate.compacting;
1833             if (writestate.compacting <= 0) {
1834               writestate.notifyAll();
1835             }
1836           }
1837         }
1838       }
1839       status.markComplete("Compaction complete");
1840       return true;
1841     } finally {
1842       try {
1843         if (requestNeedsCancellation) store.cancelRequestedCompaction(compaction);
1844         if (status != null) status.cleanup();
1845       } finally {
1846         lock.readLock().unlock();
1847       }
1848     }
1849   }
1850 
1851   /**
1852    * Flush all stores.
1853    * <p>
1854    * See {@link #flushcache(boolean)}.
1855    *
1856    * @return whether the flush succeeded and whether the region needs compacting
1857    * @throws IOException
1858    */
1859   public FlushResult flushcache() throws IOException {
1860     return flushcache(true, false);
1861   }
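       // Usage sketch (illustrative; 'region' is an assumed open HRegion and the exact accessor
       // on FlushResult may differ). The Result enum on the returned FlushResult distinguishes
       // FLUSHED_COMPACTION_NEEDED from FLUSHED_NO_COMPACTION_NEEDED, so a caller can do roughly:
       //   FlushResult res = region.flushcache();
       //   if (res.result == FlushResult.Result.FLUSHED_COMPACTION_NEEDED) {
       //     // consider requesting a compaction
       //   }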
1862 
1863   /**
1864    * Flush the cache.
1865    *
1866    * When this method is called the cache will be flushed unless:
1867    * <ol>
1868    *   <li>the cache is empty</li>
1869    *   <li>the region is closed.</li>
1870    *   <li>a flush is already in progress</li>
1871    *   <li>writes are disabled</li>
1872    * </ol>
1873    *
1874    * <p>This method may block for some time, so it should not be called from a
1875    * time-sensitive thread.
1876    * @param forceFlushAllStores whether we want to flush all stores
1877    * @return whether the flush succeeded and whether the region needs compacting
1878    *
1879    * @throws IOException general io exceptions
1880    * @throws DroppedSnapshotException Thrown when replay of wal is required
1881    * because a Snapshot was not properly persisted.
1882    */
1883   public FlushResult flushcache(boolean forceFlushAllStores) throws IOException {
1884     return flushcache(forceFlushAllStores, false);
1885   }
1886 
1887 
1888   /**
1889    * Flush the cache.
1890    *
1891    * When this method is called the cache will be flushed unless:
1892    * <ol>
1893    *   <li>the cache is empty</li>
1894    *   <li>the region is closed.</li>
1895    *   <li>a flush is already in progress</li>
1896    *   <li>writes are disabled</li>
1897    * </ol>
1898    *
1899    * <p>This method may block for some time, so it should not be called from a
1900    * time-sensitive thread.
1901    * @param forceFlushAllStores whether we want to flush all stores
1902    * @param writeFlushRequestWalMarker whether to write the flush request marker to WAL
1903    * @return whether the flush succeeded and whether the region needs compacting
1904    *
1905    * @throws IOException general io exceptions
1906    * @throws DroppedSnapshotException Thrown when replay of wal is required
1907    * because a Snapshot was not properly persisted.
1908    */
1909   public FlushResult flushcache(boolean forceFlushAllStores, boolean writeFlushRequestWalMarker)
1910       throws IOException {
1911     // fail-fast instead of waiting on the lock
1912     if (this.closing.get()) {
1913       String msg = "Skipping flush on " + this + " because closing";
1914       LOG.debug(msg);
1915       return new FlushResult(FlushResult.Result.CANNOT_FLUSH, msg, false);
1916     }
1917     MonitoredTask status = TaskMonitor.get().createStatus("Flushing " + this);
1918     status.setStatus("Acquiring readlock on region");
1919     // block waiting for the lock for flushing cache
1920     lock.readLock().lock();
1921     try {
1922       if (this.closed.get()) {
1923         String msg = "Skipping flush on " + this + " because closed";
1924         LOG.debug(msg);
1925         status.abort(msg);
1926         return new FlushResult(FlushResult.Result.CANNOT_FLUSH, msg, false);
1927       }
1928       if (coprocessorHost != null) {
1929         status.setStatus("Running coprocessor pre-flush hooks");
1930         coprocessorHost.preFlush();
1931       }
1932       // TODO: this should be managed within memstore with the snapshot, updated only after flush
1933       // successful
1934       if (numMutationsWithoutWAL.get() > 0) {
1935         numMutationsWithoutWAL.set(0);
1936         dataInMemoryWithoutWAL.set(0);
1937       }
1938       synchronized (writestate) {
1939         if (!writestate.flushing && writestate.writesEnabled) {
1940           this.writestate.flushing = true;
1941         } else {
1942           if (LOG.isDebugEnabled()) {
1943             LOG.debug("NOT flushing memstore for region " + this
1944                 + ", flushing=" + writestate.flushing + ", writesEnabled="
1945                 + writestate.writesEnabled);
1946           }
1947           String msg = "Not flushing since "
1948               + (writestate.flushing ? "already flushing"
1949               : "writes not enabled");
1950           status.abort(msg);
1951           return new FlushResult(FlushResult.Result.CANNOT_FLUSH, msg, false);
1952         }
1953       }
1954 
1955       try {
1956         Collection<Store> specificStoresToFlush =
1957             forceFlushAllStores ? stores.values() : flushPolicy.selectStoresToFlush();
1958         FlushResult fs = internalFlushcache(specificStoresToFlush,
1959           status, writeFlushRequestWalMarker);
1960 
1961         if (coprocessorHost != null) {
1962           status.setStatus("Running post-flush coprocessor hooks");
1963           coprocessorHost.postFlush();
1964         }
1965 
1966         status.markComplete("Flush successful");
1967         return fs;
1968       } finally {
1969         synchronized (writestate) {
1970           writestate.flushing = false;
1971           this.writestate.flushRequested = false;
1972           writestate.notifyAll();
1973         }
1974       }
1975     } finally {
1976       lock.readLock().unlock();
1977       status.cleanup();
1978     }
1979   }
1980 
1981   /**
1982    * Should the store be flushed because it is old enough?
1983    * <p>
1984    * Every FlushPolicy should call this to determine whether a store is old enough to flush (unless
1985    * it always flushes all stores). Otherwise the {@link #shouldFlush()} method would always
1986    * return true, which would cause a lot of flush requests.
1987    */
1988   boolean shouldFlushStore(Store store) {
1989     long maxFlushedSeqId =
1990         this.wal.getEarliestMemstoreSeqNum(getRegionInfo().getEncodedNameAsBytes(), store
1991             .getFamily().getName()) - 1;
1992     if (maxFlushedSeqId > 0 && maxFlushedSeqId + flushPerChanges < sequenceId.get()) {
1993       if (LOG.isDebugEnabled()) {
1994         LOG.debug("Column Family: " + store.getColumnFamilyName() + " of region " + this
1995             + " will be flushed because its max flushed seqId(" + maxFlushedSeqId
1996             + ") is far away from current(" + sequenceId.get() + "), max allowed is "
1997             + flushPerChanges);
1998       }
1999       return true;
2000     }
2001     if (flushCheckInterval <= 0) {
2002       return false;
2003     }
2004     long now = EnvironmentEdgeManager.currentTime();
2005     if (store.timeOfOldestEdit() < now - flushCheckInterval) {
2006       if (LOG.isDebugEnabled()) {
2007         LOG.debug("Column Family: " + store.getColumnFamilyName() + " of region " + this
2008             + " will be flushed because time of its oldest edit (" + store.timeOfOldestEdit()
2009             + ") is far away from now(" + now + "), max allowed is " + flushCheckInterval);
2010       }
2011       return true;
2012     }
2013     return false;
2014   }
2015 
2016   /**
2017    * Should the memstore be flushed now?
2018    */
2019   boolean shouldFlush() {
2020     // This is a rough measure.
2021     if (this.maxFlushedSeqId > 0
2022           && (this.maxFlushedSeqId + this.flushPerChanges < this.sequenceId.get())) {
2023       return true;
2024     }
2025     long modifiedFlushCheckInterval = flushCheckInterval;
2026     if (getRegionInfo().isMetaRegion() &&
2027         getRegionInfo().getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
2028       modifiedFlushCheckInterval = META_CACHE_FLUSH_INTERVAL;
2029     }
2030     if (modifiedFlushCheckInterval <= 0) { //disabled
2031       return false;
2032     }
2033     long now = EnvironmentEdgeManager.currentTime();
2034     //if we flushed in the recent past, we don't need to do again now
2035     if ((now - getEarliestFlushTimeForAllStores() < modifiedFlushCheckInterval)) {
2036       return false;
2037     }
2038     //since we didn't flush in the recent past, flush now if certain conditions
2039     //are met. Return true on first such memstore hit.
2040     for (Store s : this.getStores().values()) {
2041       if (s.timeOfOldestEdit() < now - modifiedFlushCheckInterval) {
2042         // we have an old enough edit in the memstore, flush
2043         return true;
2044       }
2045     }
2046     return false;
2047   }
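       // In short: a flush is requested either when
       //   maxFlushedSeqId + flushPerChanges < current sequence id   (too many edits since the last flush)
       // or when some store's oldest edit is older than the flush check interval
       // (META_CACHE_FLUSH_INTERVAL for the default replica of the meta region).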
2048 
2049   /**
2050    * Flushing all stores.
2051    *
2052    * @see #internalFlushcache(Collection, MonitoredTask, boolean)
2053    */
2054   private FlushResult internalFlushcache(MonitoredTask status)
2055       throws IOException {
2056     return internalFlushcache(stores.values(), status, false);
2057   }
2058 
2059   /**
2060    * Flushing given stores.
2061    *
2062    * @see #internalFlushcache(WAL, long, Collection, MonitoredTask, boolean)
2063    */
2064   private FlushResult internalFlushcache(final Collection<Store> storesToFlush,
2065       MonitoredTask status, boolean writeFlushWalMarker) throws IOException {
2066     return internalFlushcache(this.wal, HConstants.NO_SEQNUM, storesToFlush,
2067         status, writeFlushWalMarker);
2068   }
2069 
2070   /**
2071    * Flush the memstore. Flushing the memstore is a little tricky. We have a lot
2072    * of updates in the memstore, all of which have also been written to the wal.
2073    * We need to write those updates in the memstore out to disk, while being
2074    * able to process reads/writes as much as possible during the flush
2075    * operation.
2076    * <p>
2077    * This method may block for some time. Every time you call it, we up the
2078    * region's sequence id even if we don't flush; i.e. the returned sequence id
2079    * will be at least one larger than the last edit applied to this region. The
2080    * returned id does not refer to an actual edit. The returned id can be used,
2081    * for example, to install a bulk loaded file just ahead of the last hfile that
2082    * was the result of this flush.
2083    *
2084    * @param wal
2085    *          Null if we're NOT to go via wal.
2086    * @param myseqid
2087    *          The seqid to use if <code>wal</code> is null writing out flush
2088    *          file.
2089    * @param storesToFlush
2090    *          The list of stores to flush.
2091    * @return object describing the flush's state
2092    * @throws IOException
2093    *           general io exceptions
2094    * @throws DroppedSnapshotException
2095    *           Thrown when replay of wal is required because a Snapshot was not
2096    *           properly persisted.
2097    */
2098   protected FlushResult internalFlushcache(final WAL wal, final long myseqid,
2099       final Collection<Store> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker)
2100           throws IOException {
2101     PrepareFlushResult result
2102       = internalPrepareFlushCache(wal, myseqid, storesToFlush, status, writeFlushWalMarker);
2103     if (result.result == null) {
2104       return internalFlushCacheAndCommit(wal, status, result, storesToFlush);
2105     } else {
2106       return result.result; // early exit due to failure from prepare stage
2107     }
2108   }
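       // The flush is therefore split into two phases:
       //   1. internalPrepareFlushCache: under the updates write lock, obtain the flush sequence id,
       //      write a START_FLUSH marker and snapshot the memstore of each selected store.
       //   2. internalFlushCacheAndCommit: with writes re-enabled, persist the snapshots to new
       //      store files and write a COMMIT_FLUSH marker (or ABORT_FLUSH on failure).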
2109 
2110   protected PrepareFlushResult internalPrepareFlushCache(
2111       final WAL wal, final long myseqid, final Collection<Store> storesToFlush,
2112       MonitoredTask status, boolean writeFlushWalMarker)
2113           throws IOException {
2114 
2115     if (this.rsServices != null && this.rsServices.isAborted()) {
2116       // Don't flush when server aborting, it's unsafe
2117       throw new IOException("Aborting flush because server is aborted...");
2118     }
2119     final long startTime = EnvironmentEdgeManager.currentTime();
2120     // If nothing to flush, return, but we need to safely update the region sequence id
2121     if (this.memstoreSize.get() <= 0) {
2122       // Take an update lock because we are about to change the sequence id and we want the sequence id
2123       // to be at the border of the empty memstore.
2124       MultiVersionConsistencyControl.WriteEntry w = null;
2125       this.updatesLock.writeLock().lock();
2126       try {
2127         if (this.memstoreSize.get() <= 0) {
2128           // Presume that if there are still no edits in the memstore, then there are no edits for
2129           // this region out in the WAL subsystem so no need to do any trickery clearing out
2130           // edits in the WAL system. Up the sequence number so the resulting flush id is for
2131           // sure just beyond the last appended region edit (useful as a marker when bulk loading,
2132           // etc.)
2133           // wal can be null replaying edits.
2134           if (wal != null) {
2135             w = mvcc.beginMemstoreInsert();
2136             long flushOpSeqId = getNextSequenceId(wal);
2137             FlushResult flushResult = new FlushResult(
2138               FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, flushOpSeqId, "Nothing to flush",
2139               writeFlushRequestMarkerToWAL(wal, writeFlushWalMarker));
2140             w.setWriteNumber(flushOpSeqId);
2141             mvcc.waitForPreviousTransactionsComplete(w);
2142             w = null;
2143             return new PrepareFlushResult(flushResult, myseqid);
2144           } else {
2145             return new PrepareFlushResult(
2146               new FlushResult(FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, "Nothing to flush",
2147                 false),
2148               myseqid);
2149           }
2150         }
2151       } finally {
2152         this.updatesLock.writeLock().unlock();
2153         if (w != null) {
2154           mvcc.advanceMemstore(w);
2155         }
2156       }
2157     }
2158 
2159     if (LOG.isInfoEnabled()) {
2160       LOG.info("Started memstore flush for " + this + ", current region memstore size "
2161           + StringUtils.byteDesc(this.memstoreSize.get()) + ", and " + storesToFlush.size() + "/"
2162           + stores.size() + " column families' memstores are being flushed."
2163           + ((wal != null) ? "" : "; wal is null, using passed sequenceid=" + myseqid));
2164       // only log when we are not flushing all stores.
2165       if (this.stores.size() > storesToFlush.size()) {
2166         for (Store store: storesToFlush) {
2167           LOG.info("Flushing Column Family: " + store.getColumnFamilyName()
2168               + " which was occupying "
2169               + StringUtils.byteDesc(store.getMemStoreSize()) + " of memstore.");
2170         }
2171       }
2172     }
2173     // Stop updates while we snapshot the memstores of this region's stores. We only have
2174     // to do this for a moment.  It is quick. We also set the memstore size to zero here before we
2175     // allow updates again so its value will represent the size of the updates received
2176     // during flush
2177     MultiVersionConsistencyControl.WriteEntry w = null;
2178     // We have to take an update lock during the snapshot, or else a write could end up in both the
2179     // snapshot and the memstore (which would make atomic rows difficult)
2180     status.setStatus("Obtaining lock to block concurrent updates");
2181     // block waiting for the lock for internal flush
2182     this.updatesLock.writeLock().lock();
2183     status.setStatus("Preparing to flush by snapshotting stores in " +
2184       getRegionInfo().getEncodedName());
2185     long totalFlushableSizeOfFlushableStores = 0;
2186 
2187     Set<byte[]> flushedFamilyNames = new HashSet<byte[]>();
2188     for (Store store: storesToFlush) {
2189       flushedFamilyNames.add(store.getFamily().getName());
2190     }
2191 
2192     TreeMap<byte[], StoreFlushContext> storeFlushCtxs
2193       = new TreeMap<byte[], StoreFlushContext>(Bytes.BYTES_COMPARATOR);
2194     TreeMap<byte[], List<Path>> committedFiles = new TreeMap<byte[], List<Path>>(
2195         Bytes.BYTES_COMPARATOR);
2196     // The sequence id of this flush operation which is used to log FlushMarker and pass to
2197     // createFlushContext to use as the store file's sequence id.
2198     long flushOpSeqId = HConstants.NO_SEQNUM;
2199     // The max flushed sequence id after this flush operation. Used as completeSequenceId which is
2200     // passed to HMaster.
2201     long flushedSeqId = HConstants.NO_SEQNUM;
2202     byte[] encodedRegionName = getRegionInfo().getEncodedNameAsBytes();
2203 
2204     long trxId = 0;
2205     try {
2206       try {
2207         w = mvcc.beginMemstoreInsert();
2208         if (wal != null) {
2209           if (!wal.startCacheFlush(encodedRegionName, flushedFamilyNames)) {
2210             // This should never happen.
2211             String msg = "Flush will not be started for ["
2212                 + this.getRegionInfo().getEncodedName() + "] - because the WAL is closing.";
2213             status.setStatus(msg);
2214             return new PrepareFlushResult(
2215               new FlushResult(FlushResult.Result.CANNOT_FLUSH, msg, false),
2216               myseqid);
2217           }
2218           flushOpSeqId = getNextSequenceId(wal);
2219           long oldestUnflushedSeqId = wal.getEarliestMemstoreSeqNum(encodedRegionName);
2220           // no oldestUnflushedSeqId means we flushed all stores.
2221           // or the unflushed stores are all empty.
2222           flushedSeqId = (oldestUnflushedSeqId == HConstants.NO_SEQNUM) ? flushOpSeqId
2223               : oldestUnflushedSeqId - 1;
2224         } else {
2225           // use the provided sequence Id as WAL is not being used for this flush.
2226           flushedSeqId = flushOpSeqId = myseqid;
2227         }
2228 
2229         for (Store s : storesToFlush) {
2230           totalFlushableSizeOfFlushableStores += s.getFlushableSize();
2231           storeFlushCtxs.put(s.getFamily().getName(), s.createFlushContext(flushOpSeqId));
2232           committedFiles.put(s.getFamily().getName(), null); // for writing stores to WAL
2233         }
2234 
2235         // write the snapshot start to WAL
2236         if (wal != null && !writestate.readOnly) {
2237           FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.START_FLUSH,
2238             getRegionInfo(), flushOpSeqId, committedFiles);
2239           // no sync. Sync is below where we do not hold the updates lock
2240           trxId = WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
2241             desc, sequenceId, false);
2242         }
2243 
2244         // Prepare flush (take a snapshot)
2245         for (StoreFlushContext flush : storeFlushCtxs.values()) {
2246           flush.prepare();
2247         }
2248       } catch (IOException ex) {
2249         if (wal != null) {
2250           if (trxId > 0) { // check whether we have already written START_FLUSH to WAL
2251             try {
2252               FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH,
2253                 getRegionInfo(), flushOpSeqId, committedFiles);
2254               WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
2255                 desc, sequenceId, false);
2256             } catch (Throwable t) {
2257               LOG.warn("Received unexpected exception trying to write ABORT_FLUSH marker to WAL:" +
2258                   StringUtils.stringifyException(t));
2259               // ignore this since we will be aborting the RS with DSE.
2260             }
2261           }
2262           // we have called wal.startCacheFlush(), now we have to abort it
2263           wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2264           throw ex; // let upper layers deal with it.
2265         }
2266       } finally {
2267         this.updatesLock.writeLock().unlock();
2268       }
2269       String s = "Finished memstore snapshotting " + this +
2270         ", syncing WAL and waiting on mvcc, flushsize=" + totalFlushableSizeOfFlushableStores;
2271       status.setStatus(s);
2272       if (LOG.isTraceEnabled()) LOG.trace(s);
2273       // sync unflushed WAL changes
2274       // see HBASE-8208 for details
2275       if (wal != null) {
2276         try {
2277           wal.sync(); // ensure that flush marker is sync'ed
2278         } catch (IOException ioe) {
2279           LOG.warn("Unexpected exception while wal.sync(), ignoring. Exception: "
2280               + StringUtils.stringifyException(ioe));
2281         }
2282       }
2283 
2284       // wait for all in-progress transactions to commit to WAL before
2285       // we can start the flush. This prevents
2286       // uncommitted transactions from being written into HFiles.
2287       // We have to block before we start the flush, otherwise keys that
2288       // were removed via a rollbackMemstore could be written to Hfiles.
2289       w.setWriteNumber(flushOpSeqId);
2290       mvcc.waitForPreviousTransactionsComplete(w);
2291       // set w to null to prevent mvcc.advanceMemstore from being called again inside finally block
2292       w = null;
2293     } finally {
2294       if (w != null) {
2295         // in case of failure just mark current w as complete
2296         mvcc.advanceMemstore(w);
2297       }
2298     }
2299     return new PrepareFlushResult(storeFlushCtxs, committedFiles, startTime, flushOpSeqId,
2300       flushedSeqId, totalFlushableSizeOfFlushableStores);
2301   }
2302 
2303   /**
2304    * Writes a marker to the WAL indicating that a flush is requested but cannot be completed due
2305    * to various reasons. Ignores exceptions from the WAL. Returns whether the write succeeded.
2306    * @param wal the WAL to write the marker to
2307    * @return whether the WAL write was successful
2308    */
2309   private boolean writeFlushRequestMarkerToWAL(WAL wal, boolean writeFlushWalMarker) {
2310     if (writeFlushWalMarker && wal != null && !writestate.readOnly) {
2311       FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.CANNOT_FLUSH,
2312         getRegionInfo(), -1, new TreeMap<byte[], List<Path>>());
2313       try {
2314         WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
2315           desc, sequenceId, true);
2316         return true;
2317       } catch (IOException e) {
2318         LOG.warn(getRegionInfo().getEncodedName() + " : "
2319             + "Received exception while trying to write the flush request to wal", e);
2320       }
2321     }
2322     return false;
2323   }
2324 
2325   protected FlushResult internalFlushCacheAndCommit(
2326         final WAL wal, MonitoredTask status, final PrepareFlushResult prepareResult,
2327         final Collection<Store> storesToFlush)
2328     throws IOException {
2329 
2330     // prepare flush context is carried via PrepareFlushResult
2331     TreeMap<byte[], StoreFlushContext> storeFlushCtxs = prepareResult.storeFlushCtxs;
2332     TreeMap<byte[], List<Path>> committedFiles = prepareResult.committedFiles;
2333     long startTime = prepareResult.startTime;
2334     long flushOpSeqId = prepareResult.flushOpSeqId;
2335     long flushedSeqId = prepareResult.flushedSeqId;
2336     long totalFlushableSizeOfFlushableStores = prepareResult.totalFlushableSize;
2337 
2338     String s = "Flushing stores of " + this;
2339     status.setStatus(s);
2340     if (LOG.isTraceEnabled()) LOG.trace(s);
2341 
2342     // Any failure from here on out will be catastrophic, requiring a server
2343     // restart so the wal content can be replayed and put back into the memstore.
2344     // Otherwise, the snapshot content, while backed up in the wal, will not
2345     // be part of the currently running server's state.
2346     boolean compactionRequested = false;
2347     try {
2348       // A.  Flush memstore to all the HStores.
2349       // Keep running vector of all store files that includes both old and the
2350       // just-made new flush store file. The new flushed file is still in the
2351       // tmp directory.
2352 
2353       for (StoreFlushContext flush : storeFlushCtxs.values()) {
2354         flush.flushCache(status);
2355       }
2356 
2357       // Switch snapshot (in memstore) -> new hfile (thus causing
2358       // all the store scanners to reset/reseek).
2359       Iterator<Store> it = storesToFlush.iterator();
2360       // stores.values() and storeFlushCtxs have same order
2361       for (StoreFlushContext flush : storeFlushCtxs.values()) {
2362         boolean needsCompaction = flush.commit(status);
2363         if (needsCompaction) {
2364           compactionRequested = true;
2365         }
2366         committedFiles.put(it.next().getFamily().getName(), flush.getCommittedFiles());
2367       }
2368       storeFlushCtxs.clear();
2369 
2370       // Set down the memstore size by amount of flush.
2371       this.addAndGetGlobalMemstoreSize(-totalFlushableSizeOfFlushableStores);
2372 
2373       if (wal != null) {
2374         // write flush marker to WAL. If fail, we should throw DroppedSnapshotException
2375         FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.COMMIT_FLUSH,
2376           getRegionInfo(), flushOpSeqId, committedFiles);
2377         WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
2378           desc, sequenceId, true);
2379       }
2380     } catch (Throwable t) {
2381       // An exception here means that the snapshot was not persisted.
2382       // The wal needs to be replayed so its content is restored to memstore.
2383       // Currently, only a server restart will do this.
2384       // We used to only catch IOEs but its possible that we'd get other
2385       // exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch
2386       // all and sundry.
2387       if (wal != null) {
2388         try {
2389           FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH,
2390             getRegionInfo(), flushOpSeqId, committedFiles);
2391           WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
2392             desc, sequenceId, false);
2393         } catch (Throwable ex) {
2394           LOG.warn(getRegionInfo().getEncodedName() + " : "
2395               + "Received unexpected exception trying to write ABORT_FLUSH marker to WAL:"
2396               + StringUtils.stringifyException(ex));
2397           // ignore this since we will be aborting the RS with DSE.
2398         }
2399         wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2400       }
2401       DroppedSnapshotException dse = new DroppedSnapshotException("region: " +
2402           Bytes.toStringBinary(getRegionName()));
2403       dse.initCause(t);
2404       status.abort("Flush failed: " + StringUtils.stringifyException(t));
2405       throw dse;
2406     }
2407 
2408     // If we get to here, the HStores have been written.
2409     if (wal != null) {
2410       wal.completeCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2411     }
2412 
2413     // Record latest flush time
2414     for (Store store: storesToFlush) {
2415       this.lastStoreFlushTimeMap.put(store, startTime);
2416     }
2417 
2418     // Update the oldest unflushed sequence id for region.
2419     this.maxFlushedSeqId = flushedSeqId;
2420 
2421     // Record flush operation sequence id.
2422     this.lastFlushOpSeqId = flushOpSeqId;
2423 
2424     // C. Finally notify anyone waiting on memstore to clear:
2425     // e.g. checkResources().
2426     synchronized (this) {
2427       notifyAll(); // FindBugs NN_NAKED_NOTIFY
2428     }
2429 
2430     long time = EnvironmentEdgeManager.currentTime() - startTime;
2431     long memstoresize = this.memstoreSize.get();
2432     String msg = "Finished memstore flush of ~"
2433         + StringUtils.byteDesc(totalFlushableSizeOfFlushableStores) + "/"
2434         + totalFlushableSizeOfFlushableStores + ", currentsize="
2435         + StringUtils.byteDesc(memstoresize) + "/" + memstoresize
2436         + " for region " + this + " in " + time + "ms, sequenceid="
2437         + flushOpSeqId +  ", compaction requested=" + compactionRequested
2438         + ((wal == null) ? "; wal=null" : "");
2439     LOG.info(msg);
2440     status.setStatus(msg);
2441 
2442     return new FlushResult(compactionRequested ? FlushResult.Result.FLUSHED_COMPACTION_NEEDED :
2443         FlushResult.Result.FLUSHED_NO_COMPACTION_NEEDED, flushOpSeqId);
2444   }
2445 
2446   /**
2447    * Method to safely get the next sequence number.
2448    * @return Next sequence number unassociated with any actual edit.
2449    * @throws IOException
2450    */
2451   @VisibleForTesting
2452   protected long getNextSequenceId(final WAL wal) throws IOException {
2453     WALKey key = this.appendEmptyEdit(wal, null);
2454     return key.getSequenceId();
2455   }
2456 
2457   //////////////////////////////////////////////////////////////////////////////
2458   // get() methods for client use.
2459   //////////////////////////////////////////////////////////////////////////////
2460   /**
2461    * Return all the data for the row that matches <i>row</i> exactly,
2462    * or the one that immediately precedes it, at or immediately before
2463    * <i>ts</i>.
2464    *
2465    * @param row row key
2466    * @return map of values
2467    * @throws IOException
2468    */
2469   Result getClosestRowBefore(final byte [] row)
2470   throws IOException{
2471     return getClosestRowBefore(row, HConstants.CATALOG_FAMILY);
2472   }
2473 
2474   /**
2475    * Return all the data for the row that matches <i>row</i> exactly,
2476    * or the one that immediately precedes it, at or immediately before
2477    * <i>ts</i>.
2478    *
2479    * @param row row key
2480    * @param family column family to find on
2481    * @return map of values
2482    * @throws IOException read exceptions
2483    */
2484   public Result getClosestRowBefore(final byte [] row, final byte [] family)
2485   throws IOException {
2486     if (coprocessorHost != null) {
2487       Result result = new Result();
2488       if (coprocessorHost.preGetClosestRowBefore(row, family, result)) {
2489         return result;
2490       }
2491     }
2492     // look across all the HStores for this region and determine what the
2493     // closest key is across all column families, since the data may be sparse
2494     checkRow(row, "getClosestRowBefore");
2495     startRegionOperation(Operation.GET);
2496     this.readRequestsCount.increment();
2497     try {
2498       Store store = getStore(family);
2499       // get the closest key. (HStore.getRowKeyAtOrBefore can return null)
2500       Cell key = store.getRowKeyAtOrBefore(row);
2501       Result result = null;
2502       if (key != null) {
2503         Get get = new Get(CellUtil.cloneRow(key));
2504         get.addFamily(family);
2505         result = get(get);
2506       }
2507       if (coprocessorHost != null) {
2508         coprocessorHost.postGetClosestRowBefore(row, family, result);
2509       }
2510       return result;
2511     } finally {
2512       closeRegionOperation(Operation.GET);
2513     }
2514   }
2515 
2516   /**
2517    * Return an iterator that scans over the HRegion, returning the indicated
2518    * columns and rows specified by the {@link Scan}.
2519    * <p>
2520    * This Iterator must be closed by the caller.
2521    *
2522    * @param scan configured {@link Scan}
2523    * @return RegionScanner
2524    * @throws IOException read exceptions
2525    */
2526   public RegionScanner getScanner(Scan scan) throws IOException {
2527    return getScanner(scan, null);
2528   }
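       // Usage sketch (illustrative; 'region' is an assumed open HRegion and the caller must
       // close the scanner):
       //   RegionScanner scanner = region.getScanner(new Scan());
       //   try {
       //     List<Cell> cells = new ArrayList<Cell>();
       //     boolean more;
       //     do {
       //       more = scanner.next(cells);
       //       // ... process cells ...
       //       cells.clear();
       //     } while (more);
       //   } finally {
       //     scanner.close();
       //   }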
2529 
2530   void prepareScanner(Scan scan) {
2531     if(!scan.hasFamilies()) {
2532       // Adding all families to scanner
2533       for(byte[] family: this.htableDescriptor.getFamiliesKeys()){
2534         scan.addFamily(family);
2535       }
2536     }
2537   }
2538 
2539   protected RegionScanner getScanner(Scan scan,
2540       List<KeyValueScanner> additionalScanners) throws IOException {
2541     startRegionOperation(Operation.SCAN);
2542     try {
2543       // Verify families are all valid
2544       prepareScanner(scan);
2545       if(scan.hasFamilies()) {
2546         for(byte [] family : scan.getFamilyMap().keySet()) {
2547           checkFamily(family);
2548         }
2549       }
2550       return instantiateRegionScanner(scan, additionalScanners);
2551     } finally {
2552       closeRegionOperation(Operation.SCAN);
2553     }
2554   }
2555 
2556   protected RegionScanner instantiateRegionScanner(Scan scan,
2557       List<KeyValueScanner> additionalScanners) throws IOException {
2558     if (scan.isReversed()) {
2559       if (scan.getFilter() != null) {
2560         scan.getFilter().setReversed(true);
2561       }
2562       return new ReversedRegionScannerImpl(scan, additionalScanners, this);
2563     }
2564     return new RegionScannerImpl(scan, additionalScanners, this);
2565   }
2566 
2567   /*
2568    * @param delete The passed delete is modified by this method. WARNING!
2569    */
2570   void prepareDelete(Delete delete) throws IOException {
2571     // Check to see if this delete targets the entire row (no families specified)
2572     if(delete.getFamilyCellMap().isEmpty()){
2573       for(byte [] family : this.htableDescriptor.getFamiliesKeys()){
2574         // Don't eat the timestamp
2575         delete.addFamily(family, delete.getTimeStamp());
2576       }
2577     } else {
2578       for(byte [] family : delete.getFamilyCellMap().keySet()) {
2579         if(family == null) {
2580           throw new NoSuchColumnFamilyException("Empty family is invalid");
2581         }
2582         checkFamily(family);
2583       }
2584     }
2585   }
2586 
2587   //////////////////////////////////////////////////////////////////////////////
2588   // set() methods for client use.
2589   //////////////////////////////////////////////////////////////////////////////
2590   /**
2591    * @param delete delete object
2592    * @throws IOException read exceptions
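        * <p>A minimal caller-side sketch (illustrative only):
        * <pre>{@code
        * Delete d = new Delete(Bytes.toBytes("rowkey"));
        * d.addFamily(Bytes.toBytes("info"));   // omit addFamily calls to delete the whole row
        * region.delete(d);
        * }</pre>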
2593    */
2594   public void delete(Delete delete)
2595   throws IOException {
2596     checkReadOnly();
2597     checkResources();
2598     startRegionOperation(Operation.DELETE);
2599     try {
2600       delete.getRow();
2601       // All edits for the given row (across all column families) must happen atomically.
2602       doBatchMutate(delete);
2603     } finally {
2604       closeRegionOperation(Operation.DELETE);
2605     }
2606   }
2607 
2608   /**
2609    * Row key used by the unit-test-only delete method below.
2610    */
2611   private static final byte [] FOR_UNIT_TESTS_ONLY = Bytes.toBytes("ForUnitTestsOnly");
2612   /**
2613    * This is used only by unit tests. Not required to be a public API.
2614    * @param familyMap map of family to edits for the given family.
2615    * @throws IOException
2616    */
2617   void delete(NavigableMap<byte[], List<Cell>> familyMap,
2618       Durability durability) throws IOException {
2619     Delete delete = new Delete(FOR_UNIT_TESTS_ONLY);
2620     delete.setFamilyCellMap(familyMap);
2621     delete.setDurability(durability);
2622     doBatchMutate(delete);
2623   }
2624 
2625   /**
2626    * Set up correct timestamps in the KVs of the Delete object.
2627    * Caller should have the row and region locks.
2628    * @param mutation
2629    * @param familyMap
2630    * @param byteNow
2631    * @throws IOException
2632    */
2633   void prepareDeleteTimestamps(Mutation mutation, Map<byte[], List<Cell>> familyMap,
2634       byte[] byteNow) throws IOException {
2635     for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
2636 
2637       byte[] family = e.getKey();
2638       List<Cell> cells = e.getValue();
2639       assert cells instanceof RandomAccess;
2640 
2641       Map<byte[], Integer> kvCount = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
2642       int listSize = cells.size();
2643       for (int i=0; i < listSize; i++) {
2644         Cell cell = cells.get(i);
2645         //  Check if time is LATEST, change to time of most recent addition if so
2646         //  This is expensive.
2647         if (cell.getTimestamp() == HConstants.LATEST_TIMESTAMP && CellUtil.isDeleteType(cell)) {
2648           byte[] qual = CellUtil.cloneQualifier(cell);
2649           if (qual == null) qual = HConstants.EMPTY_BYTE_ARRAY;
2650 
2651           Integer count = kvCount.get(qual);
2652           if (count == null) {
2653             kvCount.put(qual, 1);
2654           } else {
2655             kvCount.put(qual, count + 1);
2656           }
2657           count = kvCount.get(qual);
2658 
2659           Get get = new Get(CellUtil.cloneRow(cell));
2660           get.setMaxVersions(count);
2661           get.addColumn(family, qual);
2662           if (coprocessorHost != null) {
2663             if (!coprocessorHost.prePrepareTimeStampForDeleteVersion(mutation, cell,
2664                 byteNow, get)) {
2665               updateDeleteLatestVersionTimeStamp(cell, get, count, byteNow);
2666             }
2667           } else {
2668             updateDeleteLatestVersionTimeStamp(cell, get, count, byteNow);
2669           }
2670         } else {
2671           CellUtil.updateLatestStamp(cell, byteNow, 0);
2672         }
2673       }
2674     }
2675   }
2676 
2677   void updateDeleteLatestVersionTimeStamp(Cell cell, Get get, int count, byte[] byteNow)
2678       throws IOException {
2679     List<Cell> result = get(get, false);
2680 
2681     if (result.size() < count) {
2682       // Nothing to delete
2683       CellUtil.updateLatestStamp(cell, byteNow, 0);
2684       return;
2685     }
2686     if (result.size() > count) {
2687       throw new RuntimeException("Unexpected size: " + result.size());
2688     }
2689     Cell getCell = result.get(count - 1);
2690     CellUtil.setTimestamp(cell, getCell.getTimestamp());
2691   }
2692 
2693   /**
2694    * @throws IOException
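        * <p>Applies the given {@link Put} to this region; all edits for the row are applied
        * atomically. A minimal caller-side sketch (illustrative only, assuming the branch-1
        * client API):
        * <pre>{@code
        * Put p = new Put(Bytes.toBytes("rowkey"));
        * p.addColumn(Bytes.toBytes("info"), Bytes.toBytes("qual"), Bytes.toBytes("value"));
        * region.put(p);
        * }</pre>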
2695    */
2696   public void put(Put put)
2697   throws IOException {
2698     checkReadOnly();
2699 
2700     // Do a rough check that we have resources to accept a write.  The check is
2701     // 'rough' in that between the resource check and the call to obtain a
2702     // read lock, resources may run out.  For now, the thought is that this
2703     // will be extremely rare; we'll deal with it when it happens.
2704     checkResources();
2705     startRegionOperation(Operation.PUT);
2706     try {
2707       // All edits for the given row (across all column families) must happen atomically.
2708       doBatchMutate(put);
2709     } finally {
2710       closeRegionOperation(Operation.PUT);
2711     }
2712   }
2713 
2714   /**
2715    * Struct-like class that tracks the progress of a batch operation,
2716    * accumulating status codes and tracking the index at which processing
2717    * is proceeding.
2718    */
2719   private abstract static class BatchOperationInProgress<T> {
2720     T[] operations;
2721     int nextIndexToProcess = 0;
2722     OperationStatus[] retCodeDetails;
2723     WALEdit[] walEditsFromCoprocessors;
2724 
2725     public BatchOperationInProgress(T[] operations) {
2726       this.operations = operations;
2727       this.retCodeDetails = new OperationStatus[operations.length];
2728       this.walEditsFromCoprocessors = new WALEdit[operations.length];
2729       Arrays.fill(this.retCodeDetails, OperationStatus.NOT_RUN);
2730     }
2731 
2732     public abstract Mutation getMutation(int index);
2733     public abstract long getNonceGroup(int index);
2734     public abstract long getNonce(int index);
2735     /** This method is potentially expensive and should only be used for non-replay CP path. */
2736     public abstract Mutation[] getMutationsForCoprocs();
2737     public abstract boolean isInReplay();
2738     public abstract long getReplaySequenceId();
2739 
2740     public boolean isDone() {
2741       return nextIndexToProcess == operations.length;
2742     }
2743   }
2744 
2745   private static class MutationBatch extends BatchOperationInProgress<Mutation> {
2746     private long nonceGroup;
2747     private long nonce;
2748     public MutationBatch(Mutation[] operations, long nonceGroup, long nonce) {
2749       super(operations);
2750       this.nonceGroup = nonceGroup;
2751       this.nonce = nonce;
2752     }
2753 
2754     @Override
2755     public Mutation getMutation(int index) {
2756       return this.operations[index];
2757     }
2758 
2759     @Override
2760     public long getNonceGroup(int index) {
2761       return nonceGroup;
2762     }
2763 
2764     @Override
2765     public long getNonce(int index) {
2766       return nonce;
2767     }
2768 
2769     @Override
2770     public Mutation[] getMutationsForCoprocs() {
2771       return this.operations;
2772     }
2773 
2774     @Override
2775     public boolean isInReplay() {
2776       return false;
2777     }
2778 
2779     @Override
2780     public long getReplaySequenceId() {
2781       return 0;
2782     }
2783   }
2784 
2785   private static class ReplayBatch extends BatchOperationInProgress<MutationReplay> {
2786     private long replaySeqId = 0;
2787     public ReplayBatch(MutationReplay[] operations, long seqId) {
2788       super(operations);
2789       this.replaySeqId = seqId;
2790     }
2791 
2792     @Override
2793     public Mutation getMutation(int index) {
2794       return this.operations[index].mutation;
2795     }
2796 
2797     @Override
2798     public long getNonceGroup(int index) {
2799       return this.operations[index].nonceGroup;
2800     }
2801 
2802     @Override
2803     public long getNonce(int index) {
2804       return this.operations[index].nonce;
2805     }
2806 
2807     @Override
2808     public Mutation[] getMutationsForCoprocs() {
2809       assert false;
2810       throw new RuntimeException("Should not be called for replay batch");
2811     }
2812 
2813     @Override
2814     public boolean isInReplay() {
2815       return true;
2816     }
2817 
2818     @Override
2819     public long getReplaySequenceId() {
2820       return this.replaySeqId;
2821     }
2822   }
2823 
2824   /**
2825    * Perform a batch of mutations.
2826    * It supports only Put and Delete mutations and will ignore other types passed.
2827    * @param mutations the list of mutations
2828    * @return an array of OperationStatus which internally contains the
2829    *         OperationStatusCode and the exceptionMessage if any.
2830    * @throws IOException
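        * <p>A minimal caller-side sketch (illustrative only):
        * <pre>{@code
        * Mutation[] batch = new Mutation[] { put1, put2, delete1 };
        * OperationStatus[] statuses = region.batchMutate(batch,
        *     HConstants.NO_NONCE, HConstants.NO_NONCE);
        * for (OperationStatus status : statuses) {
        *   if (status.getOperationStatusCode() != OperationStatusCode.SUCCESS) {
        *     // handle the per-mutation failure, e.g. via status.getExceptionMsg()
        *   }
        * }
        * }</pre>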
2831    */
2832   public OperationStatus[] batchMutate(
2833       Mutation[] mutations, long nonceGroup, long nonce) throws IOException {
2834     // As it stands, this is used for two things:
2835     //  * batchMutate with a single mutation - put/delete, standalone or from checkAndMutate.
2836     //  * coprocessor calls (see e.g. BulkDeleteEndpoint).
2837     // So nonces are not really ever used by HBase. They could be by coprocs, and checkAnd...
2838     return batchMutate(new MutationBatch(mutations, nonceGroup, nonce));
2839   }
2840 
2841   public OperationStatus[] batchMutate(Mutation[] mutations) throws IOException {
2842     return batchMutate(mutations, HConstants.NO_NONCE, HConstants.NO_NONCE);
2843   }
2844 
2845   /**
2846    * Replay a batch of mutations.
2847    * @param mutations mutations to replay.
2848    * @param replaySeqId SeqId for current mutations
2849    * @return an array of OperationStatus which internally contains the
2850    *         OperationStatusCode and the exceptionMessage if any.
2851    * @throws IOException
2852    */
2853   public OperationStatus[] batchReplay(MutationReplay[] mutations, long replaySeqId)
2854       throws IOException {
2855     if (!RegionReplicaUtil.isDefaultReplica(getRegionInfo())
2856         && replaySeqId < lastReplayedOpenRegionSeqId) {
2857       // if it is a secondary replica we should ignore these entries silently
2858       // since they are coming out of order
2859       if (LOG.isTraceEnabled()) {
2860         LOG.trace(getRegionInfo().getEncodedName() + " : "
2861           + "Skipping " + mutations.length + " mutations with replaySeqId=" + replaySeqId
2862           + " which is smaller than lastReplayedOpenRegionSeqId=" + lastReplayedOpenRegionSeqId);
2863         for (MutationReplay mut : mutations) {
2864           LOG.trace(getRegionInfo().getEncodedName() + " : Skipping : " + mut.mutation);
2865         }
2866       }
2867 
2868       OperationStatus[] statuses = new OperationStatus[mutations.length];
2869       for (int i = 0; i < statuses.length; i++) {
2870         statuses[i] = OperationStatus.SUCCESS;
2871       }
2872       return statuses;
2873     }
2874     return batchMutate(new ReplayBatch(mutations, replaySeqId));
2875   }
2876 
2877   /**
2878    * Perform a batch of mutations.
2879    * It supports only Put and Delete mutations and will ignore other types passed.
2880    * @param batchOp contains the list of mutations
2881    * @return an array of OperationStatus which internally contains the
2882    *         OperationStatusCode and the exceptionMessage if any.
2883    * @throws IOException
2884    */
2885   OperationStatus[] batchMutate(BatchOperationInProgress<?> batchOp) throws IOException {
2886     boolean initialized = false;
2887     Operation op = batchOp.isInReplay() ? Operation.REPLAY_BATCH_MUTATE : Operation.BATCH_MUTATE;
2888     startRegionOperation(op);
2889     try {
2890       while (!batchOp.isDone()) {
2891         if (!batchOp.isInReplay()) {
2892           checkReadOnly();
2893         }
2894         checkResources();
2895 
2896         if (!initialized) {
2897           this.writeRequestsCount.add(batchOp.operations.length);
2898           if (!batchOp.isInReplay()) {
2899             doPreMutationHook(batchOp);
2900           }
2901           initialized = true;
2902         }
2903         long addedSize = doMiniBatchMutation(batchOp);
2904         long newSize = this.addAndGetGlobalMemstoreSize(addedSize);
2905         if (isFlushSize(newSize)) {
2906           requestFlush();
2907         }
2908       }
2909     } finally {
2910       closeRegionOperation(op);
2911     }
2912     return batchOp.retCodeDetails;
2913   }
2914 
2915 
2916   private void doPreMutationHook(BatchOperationInProgress<?> batchOp)
2917       throws IOException {
2918     /* Run coprocessor pre hook outside of locks to avoid deadlock */
2919     WALEdit walEdit = new WALEdit();
2920     if (coprocessorHost != null) {
2921       for (int i = 0 ; i < batchOp.operations.length; i++) {
2922         Mutation m = batchOp.getMutation(i);
2923         if (m instanceof Put) {
2924           if (coprocessorHost.prePut((Put) m, walEdit, m.getDurability())) {
2925             // pre hook says skip this Put
2926             // mark as success and skip in doMiniBatchMutation
2927             batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
2928           }
2929         } else if (m instanceof Delete) {
2930           Delete curDel = (Delete) m;
2931           if (curDel.getFamilyCellMap().isEmpty()) {
2932             // handle deleting a row case
2933             prepareDelete(curDel);
2934           }
2935           if (coprocessorHost.preDelete(curDel, walEdit, m.getDurability())) {
2936             // pre hook says skip this Delete
2937             // mark as success and skip in doMiniBatchMutation
2938             batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
2939           }
2940         } else {
2941           // If a mutation other than a Put or Delete (e.g. an Append) is passed to batchMutate,
2942           // mark its return code as failure so that it will not be considered in
2943           // doMiniBatchMutation.
2944           batchOp.retCodeDetails[i] = new OperationStatus(OperationStatusCode.FAILURE,
2945               "Put/Delete mutations only supported in batchMutate() now");
2946         }
2947         if (!walEdit.isEmpty()) {
2948           batchOp.walEditsFromCoprocessors[i] = walEdit;
2949           walEdit = new WALEdit();
2950         }
2951       }
2952     }
2953   }
2954 
2955   @SuppressWarnings("unchecked")
2956   private long doMiniBatchMutation(BatchOperationInProgress<?> batchOp) throws IOException {
2957     boolean isInReplay = batchOp.isInReplay();
2958     // variable to note if all Put items are for the same CF -- metrics related
2959     boolean putsCfSetConsistent = true;
2960     //The set of columnFamilies first seen for Put.
2961     Set<byte[]> putsCfSet = null;
2962     // variable to note if all Delete items are for the same CF -- metrics related
2963     boolean deletesCfSetConsistent = true;
2964     //The set of columnFamilies first seen for Delete.
2965     Set<byte[]> deletesCfSet = null;
2966 
2967     long currentNonceGroup = HConstants.NO_NONCE, currentNonce = HConstants.NO_NONCE;
2968     WALEdit walEdit = new WALEdit(isInReplay);
2969     MultiVersionConsistencyControl.WriteEntry w = null;
2970     long txid = 0;
2971     boolean doRollBackMemstore = false;
2972     boolean locked = false;
2973 
2974     /** Keep track of the locks we hold so we can release them in finally clause */
2975     List<RowLock> acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.operations.length);
2976     // reference family maps directly so coprocessors can mutate them if desired
2977     Map<byte[], List<Cell>>[] familyMaps = new Map[batchOp.operations.length];
2978     List<Cell> memstoreCells = new ArrayList<Cell>();
2979     // We try to set up a batch in the range [firstIndex,lastIndexExclusive)
2980     int firstIndex = batchOp.nextIndexToProcess;
2981     int lastIndexExclusive = firstIndex;
2982     boolean success = false;
2983     int noOfPuts = 0, noOfDeletes = 0;
2984     WALKey walKey = null;
2985     long mvccNum = 0;
2986     try {
2987       // ------------------------------------
2988       // STEP 1. Try to acquire as many locks as we can, and ensure
2989       // we acquire at least one.
2990       // ----------------------------------
2991       int numReadyToWrite = 0;
2992       long now = EnvironmentEdgeManager.currentTime();
2993       while (lastIndexExclusive < batchOp.operations.length) {
2994         Mutation mutation = batchOp.getMutation(lastIndexExclusive);
2995         boolean isPutMutation = mutation instanceof Put;
2996 
2997         Map<byte[], List<Cell>> familyMap = mutation.getFamilyCellMap();
2998         // store the family map reference to allow for mutations
2999         familyMaps[lastIndexExclusive] = familyMap;
3000 
3001         // skip anything that "ran" already
3002         if (batchOp.retCodeDetails[lastIndexExclusive].getOperationStatusCode()
3003             != OperationStatusCode.NOT_RUN) {
3004           lastIndexExclusive++;
3005           continue;
3006         }
3007 
3008         try {
3009           if (isPutMutation) {
3010             // Check the families in the put. If bad, skip this one.
3011             if (isInReplay) {
3012               removeNonExistentColumnFamilyForReplay(familyMap);
3013             } else {
3014               checkFamilies(familyMap.keySet());
3015             }
3016             checkTimestamps(mutation.getFamilyCellMap(), now);
3017           } else {
3018             prepareDelete((Delete) mutation);
3019           }
3020         } catch (NoSuchColumnFamilyException nscf) {
3021           LOG.warn("No such column family in batch mutation", nscf);
3022           batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
3023               OperationStatusCode.BAD_FAMILY, nscf.getMessage());
3024           lastIndexExclusive++;
3025           continue;
3026         } catch (FailedSanityCheckException fsce) {
3027           LOG.warn("Batch Mutation did not pass sanity check", fsce);
3028           batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
3029               OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage());
3030           lastIndexExclusive++;
3031           continue;
3032         }
3033 
3034         // If we haven't got any rows in our batch, we should block to
3035         // get the next one.
3036         boolean shouldBlock = numReadyToWrite == 0;
3037         RowLock rowLock = null;
3038         try {
3039           rowLock = getRowLockInternal(mutation.getRow(), shouldBlock);
3040         } catch (IOException ioe) {
3041           LOG.warn("Failed getting lock in batch put, row="
3042             + Bytes.toStringBinary(mutation.getRow()), ioe);
3043         }
3044         if (rowLock == null) {
3045           // We failed to grab another lock
3046           assert !shouldBlock : "Should never fail to get lock when blocking";
3047           break; // stop acquiring more rows for this batch
3048         } else {
3049           acquiredRowLocks.add(rowLock);
3050         }
3051 
3052         lastIndexExclusive++;
3053         numReadyToWrite++;
3054 
3055         if (isPutMutation) {
3056           // If column families stay consistent throughout all of the
3057           // individual puts then metrics can be reported as a multiput across
3058           // column families in the first put.
3059           if (putsCfSet == null) {
3060             putsCfSet = mutation.getFamilyCellMap().keySet();
3061           } else {
3062             putsCfSetConsistent = putsCfSetConsistent
3063                 && mutation.getFamilyCellMap().keySet().equals(putsCfSet);
3064           }
3065         } else {
3066           if (deletesCfSet == null) {
3067             deletesCfSet = mutation.getFamilyCellMap().keySet();
3068           } else {
3069             deletesCfSetConsistent = deletesCfSetConsistent
3070                 && mutation.getFamilyCellMap().keySet().equals(deletesCfSet);
3071           }
3072         }
3073       }
3074 
3075       // we should record the timestamp only after we have acquired the rowLock,
3076       // otherwise, newer puts/deletes are not guaranteed to have a newer timestamp
3077       now = EnvironmentEdgeManager.currentTime();
3078       byte[] byteNow = Bytes.toBytes(now);
3079 
3080       // Nothing to put/delete -- an exception in the above such as NoSuchColumnFamily?
3081       if (numReadyToWrite <= 0) return 0L;
3082 
3083       // We've now grabbed as many mutations off the list as we can
3084 
3085       // ------------------------------------
3086       // STEP 2. Update any LATEST_TIMESTAMP timestamps
3087       // ----------------------------------
3088       for (int i = firstIndex; !isInReplay && i < lastIndexExclusive; i++) {
3089         // skip invalid
3090         if (batchOp.retCodeDetails[i].getOperationStatusCode()
3091             != OperationStatusCode.NOT_RUN) continue;
3092 
3093         Mutation mutation = batchOp.getMutation(i);
3094         if (mutation instanceof Put) {
3095           updateCellTimestamps(familyMaps[i].values(), byteNow);
3096           noOfPuts++;
3097         } else {
3098           prepareDeleteTimestamps(mutation, familyMaps[i], byteNow);
3099           noOfDeletes++;
3100         }
3101         rewriteCellTags(familyMaps[i], mutation);
3102       }
3103 
3104       lock(this.updatesLock.readLock(), numReadyToWrite);
3105       locked = true;
3106       if(isInReplay) {
3107         mvccNum = batchOp.getReplaySequenceId();
3108       } else {
3109         mvccNum = MultiVersionConsistencyControl.getPreAssignedWriteNumber(this.sequenceId);
3110       }
3111       //
3112       // ------------------------------------
3113       // Acquire the latest mvcc number
3114       // ----------------------------------
3115       w = mvcc.beginMemstoreInsertWithSeqNum(mvccNum);
3116 
3117       // calling the pre CP hook for batch mutation
3118       if (!isInReplay && coprocessorHost != null) {
3119         MiniBatchOperationInProgress<Mutation> miniBatchOp =
3120           new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
3121           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
3122         if (coprocessorHost.preBatchMutate(miniBatchOp)) return 0L;
3123       }
3124 
3125       // ------------------------------------
3126       // STEP 3. Write back to memstore
3127       // Write to memstore. It is ok to write to memstore
3128       // first without updating the WAL because we do not roll
3129       // forward the memstore MVCC. The MVCC will be moved up when
3130       // the complete operation is done. These changes are not yet
3131       // visible to scanners till we update the MVCC. The MVCC is
3132       // moved only when the sync is complete.
3133       // ----------------------------------
3134       long addedSize = 0;
3135       for (int i = firstIndex; i < lastIndexExclusive; i++) {
3136         if (batchOp.retCodeDetails[i].getOperationStatusCode()
3137             != OperationStatusCode.NOT_RUN) {
3138           continue;
3139         }
3140         doRollBackMemstore = true; // If we have a failure, we need to clean what we wrote
3141         addedSize += applyFamilyMapToMemstore(familyMaps[i], mvccNum, memstoreCells, isInReplay);
3142       }
3143 
3144       // ------------------------------------
3145       // STEP 4. Build WAL edit
3146       // ----------------------------------
3147       Durability durability = Durability.USE_DEFAULT;
3148       for (int i = firstIndex; i < lastIndexExclusive; i++) {
3149         // Skip puts that were determined to be invalid during preprocessing
3150         if (batchOp.retCodeDetails[i].getOperationStatusCode()
3151             != OperationStatusCode.NOT_RUN) {
3152           continue;
3153         }
3154         batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
3155 
3156         Mutation m = batchOp.getMutation(i);
3157         Durability tmpDur = getEffectiveDurability(m.getDurability());
3158         if (tmpDur.ordinal() > durability.ordinal()) {
3159           durability = tmpDur;
3160         }
3161         if (tmpDur == Durability.SKIP_WAL) {
3162           recordMutationWithoutWal(m.getFamilyCellMap());
3163           continue;
3164         }
3165 
3166         long nonceGroup = batchOp.getNonceGroup(i), nonce = batchOp.getNonce(i);
3167         // In replay, the batch may contain multiple nonces. If so, write a WALEdit for each.
3168         // Given how nonces are originally written, these should be contiguous.
3169         // They don't have to be; it will still work, we will just write more WALEdits than needed.
3170         if (nonceGroup != currentNonceGroup || nonce != currentNonce) {
3171           if (walEdit.size() > 0) {
3172             assert isInReplay;
3173             if (!isInReplay) {
3174               throw new IOException("Multiple nonces per batch and not in replay");
3175             }
3176             // txid should always increase, so having the one from the last call is ok.
3177             // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
3178             walKey = new ReplayHLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
3179               this.htableDescriptor.getTableName(), now, m.getClusterIds(),
3180               currentNonceGroup, currentNonce);
3181             txid = this.wal.append(this.htableDescriptor,  this.getRegionInfo(),  walKey,
3182               walEdit, getSequenceId(), true, null);
3183             walEdit = new WALEdit(isInReplay);
3184             walKey = null;
3185           }
3186           currentNonceGroup = nonceGroup;
3187           currentNonce = nonce;
3188         }
3189 
3190         // Add WAL edits by CP
3191         WALEdit fromCP = batchOp.walEditsFromCoprocessors[i];
3192         if (fromCP != null) {
3193           for (Cell cell : fromCP.getCells()) {
3194             walEdit.add(cell);
3195           }
3196         }
3197         addFamilyMapToWALEdit(familyMaps[i], walEdit);
3198       }
3199 
3200       // -------------------------
3201       // STEP 5. Append the final edit to WAL. Do not sync wal.
3202       // -------------------------
3203       Mutation mutation = batchOp.getMutation(firstIndex);
3204       if (isInReplay) {
3205         // use wal key from the original
3206         walKey = new ReplayHLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
3207           this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now,
3208           mutation.getClusterIds(), currentNonceGroup, currentNonce);
3209         long replaySeqId = batchOp.getReplaySequenceId();
3210         walKey.setOrigLogSeqNum(replaySeqId);
3211 
3212         // ensure that the sequence id of the region is at least as big as orig log seq id
3213         while (true) {
3214           long seqId = getSequenceId().get();
3215           if (seqId >= replaySeqId) break;
3216           if (getSequenceId().compareAndSet(seqId, replaySeqId)) break;
3217         }
3218       }
3219       if (walEdit.size() > 0) {
3220         if (!isInReplay) {
3221           // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
3222           walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
3223               this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now,
3224               mutation.getClusterIds(), currentNonceGroup, currentNonce);
3225         }
3226 
3227         txid = this.wal.append(this.htableDescriptor, this.getRegionInfo(), walKey, walEdit,
3228           getSequenceId(), true, memstoreCells);
3229       }
3230       if(walKey == null){
3231         // Append a faked WALEdit in order for SKIP_WAL updates to get mvcc assigned
3232         walKey = this.appendEmptyEdit(this.wal, memstoreCells);
3233       }
3234 
3235       // -------------------------------
3236       // STEP 6. Release row locks, etc.
3237       // -------------------------------
3238       if (locked) {
3239         this.updatesLock.readLock().unlock();
3240         locked = false;
3241       }
3242       releaseRowLocks(acquiredRowLocks);
3243 
3244       // -------------------------
3245       // STEP 7. Sync wal.
3246       // -------------------------
3247       if (txid != 0) {
3248         syncOrDefer(txid, durability);
3249       }
3250 
3251       doRollBackMemstore = false;
3252       // calling the post CP hook for batch mutation
3253       if (!isInReplay && coprocessorHost != null) {
3254         MiniBatchOperationInProgress<Mutation> miniBatchOp =
3255           new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
3256           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
3257         coprocessorHost.postBatchMutate(miniBatchOp);
3258       }
3259 
3260 
3261       // ------------------------------------------------------------------
3262       // STEP 8. Advance mvcc. This will make this put visible to scanners and getters.
3263       // ------------------------------------------------------------------
3264       if (w != null) {
3265         mvcc.completeMemstoreInsertWithSeqNum(w, walKey);
3266         w = null;
3267       }
3268 
3269       // ------------------------------------
3270       // STEP 9. Run coprocessor post hooks. This should be done after the wal is
3271       // synced so that the coprocessor contract is adhered to.
3272       // ------------------------------------
3273       if (!isInReplay && coprocessorHost != null) {
3274         for (int i = firstIndex; i < lastIndexExclusive; i++) {
3275           // only for successful puts
3276           if (batchOp.retCodeDetails[i].getOperationStatusCode()
3277               != OperationStatusCode.SUCCESS) {
3278             continue;
3279           }
3280           Mutation m = batchOp.getMutation(i);
3281           if (m instanceof Put) {
3282             coprocessorHost.postPut((Put) m, walEdit, m.getDurability());
3283           } else {
3284             coprocessorHost.postDelete((Delete) m, walEdit, m.getDurability());
3285           }
3286         }
3287       }
3288 
3289       success = true;
3290       return addedSize;
3291     } finally {
3292       // if the wal sync was unsuccessful, remove keys from memstore
3293       if (doRollBackMemstore) {
3294         rollbackMemstore(memstoreCells);
3295       }
3296       if (w != null) {
3297         mvcc.completeMemstoreInsertWithSeqNum(w, walKey);
3298       }
3299 
3300       if (locked) {
3301         this.updatesLock.readLock().unlock();
3302       }
3303       releaseRowLocks(acquiredRowLocks);
3304 
3305       // See if the column families were consistent through the whole thing.
3306       // if they were then keep them. If they were not then pass a null.
3307       // null will be treated as unknown.
3308       // The total time taken might involve both Puts and Deletes.
3309       // Split the time between puts and deletes based on the total number of Puts and Deletes.
3310 
3311       if (noOfPuts > 0) {
3312         // There were some Puts in the batch.
3313         if (this.metricsRegion != null) {
3314           this.metricsRegion.updatePut();
3315         }
3316       }
3317       if (noOfDeletes > 0) {
3318         // There were some Deletes in the batch.
3319         if (this.metricsRegion != null) {
3320           this.metricsRegion.updateDelete();
3321         }
3322       }
3323       if (!success) {
3324         for (int i = firstIndex; i < lastIndexExclusive; i++) {
3325           if (batchOp.retCodeDetails[i].getOperationStatusCode() == OperationStatusCode.NOT_RUN) {
3326             batchOp.retCodeDetails[i] = OperationStatus.FAILURE;
3327           }
3328         }
3329       }
3330       if (coprocessorHost != null && !batchOp.isInReplay()) {
3331         // call the coprocessor hook to do any finalization steps
3332         // after the put is done
3333         MiniBatchOperationInProgress<Mutation> miniBatchOp =
3334             new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
3335                 batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex,
3336                 lastIndexExclusive);
3337         coprocessorHost.postBatchMutateIndispensably(miniBatchOp, success);
3338       }
3339 
3340       batchOp.nextIndexToProcess = lastIndexExclusive;
3341     }
3342   }
3343 
3344   /**
3345    * Returns effective durability from the passed durability and
3346    * the table descriptor.
3347    */
3348   protected Durability getEffectiveDurability(Durability d) {
3349     return d == Durability.USE_DEFAULT ? this.durability : d;
3350   }
3351 
3352   //TODO, Think that gets/puts and deletes should be refactored a bit so that
3353   //the getting of the lock happens before, so that you would just pass it into
3354   //the methods. So in the case of checkAndMutate you could just do lockRow,
3355   //get, put, unlockRow or something
3356   /**
3357    * Atomically checks the value of a cell and, if the check passes, executes the passed mutation.
3358    * @throws IOException
3359    * @return true if the mutation was applied (the check passed), false otherwise
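        * <p>A minimal caller-side sketch (illustrative only): put a new value only if the
        * current value equals an expected one.
        * <pre>{@code
        * Put p = new Put(row);
        * p.addColumn(family, qualifier, newValue);
        * boolean applied = region.checkAndMutate(row, family, qualifier,
        *     CompareOp.EQUAL, new BinaryComparator(expectedValue), p, true);
        * }</pre>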
3360    */
3361   public boolean checkAndMutate(byte [] row, byte [] family, byte [] qualifier,
3362       CompareOp compareOp, ByteArrayComparable comparator, Mutation w,
3363       boolean writeToWAL)
3364   throws IOException{
3365     checkReadOnly();
3366     //TODO, add check for value length or maybe even better move this to the
3367     //client if this becomes a global setting
3368     checkResources();
3369     boolean isPut = w instanceof Put;
3370     if (!isPut && !(w instanceof Delete))
3371       throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action must " +
3372           "be Put or Delete");
3373     if (!Bytes.equals(row, w.getRow())) {
3374       throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action's " +
3375           "getRow must match the passed row");
3376     }
3377 
3378     startRegionOperation();
3379     try {
3380       Get get = new Get(row);
3381       checkFamily(family);
3382       get.addColumn(family, qualifier);
3383 
3384       // Lock row - note that doBatchMutate will relock this row if called
3385       RowLock rowLock = getRowLock(get.getRow());
3386       // wait for all previous transactions to complete (with lock held)
3387       mvcc.waitForPreviousTransactionsComplete();
3388       try {
3389         if (this.getCoprocessorHost() != null) {
3390           Boolean processed = null;
3391           if (w instanceof Put) {
3392             processed = this.getCoprocessorHost().preCheckAndPutAfterRowLock(row, family,
3393                 qualifier, compareOp, comparator, (Put) w);
3394           } else if (w instanceof Delete) {
3395             processed = this.getCoprocessorHost().preCheckAndDeleteAfterRowLock(row, family,
3396                 qualifier, compareOp, comparator, (Delete) w);
3397           }
3398           if (processed != null) {
3399             return processed;
3400           }
3401         }
3402         List<Cell> result = get(get, false);
3403 
3404         boolean valueIsNull = comparator.getValue() == null ||
3405           comparator.getValue().length == 0;
3406         boolean matches = false;
3407         if (result.size() == 0 && valueIsNull) {
3408           matches = true;
3409         } else if (result.size() > 0 && result.get(0).getValueLength() == 0 &&
3410             valueIsNull) {
3411           matches = true;
3412         } else if (result.size() == 1 && !valueIsNull) {
3413           Cell kv = result.get(0);
3414           int compareResult = comparator.compareTo(kv.getValueArray(),
3415               kv.getValueOffset(), kv.getValueLength());
3416           switch (compareOp) {
3417           case LESS:
3418             matches = compareResult < 0;
3419             break;
3420           case LESS_OR_EQUAL:
3421             matches = compareResult <= 0;
3422             break;
3423           case EQUAL:
3424             matches = compareResult == 0;
3425             break;
3426           case NOT_EQUAL:
3427             matches = compareResult != 0;
3428             break;
3429           case GREATER_OR_EQUAL:
3430             matches = compareResult >= 0;
3431             break;
3432           case GREATER:
3433             matches = compareResult > 0;
3434             break;
3435           default:
3436             throw new RuntimeException("Unknown Compare op " + compareOp.name());
3437           }
3438         }
3439         //If matches put the new put or delete the new delete
3440         if (matches) {
3441           // All edits for the given row (across all column families) must
3442           // happen atomically.
3443           doBatchMutate(w);
3444           this.checkAndMutateChecksPassed.increment();
3445           return true;
3446         }
3447         this.checkAndMutateChecksFailed.increment();
3448         return false;
3449       } finally {
3450         rowLock.release();
3451       }
3452     } finally {
3453       closeRegionOperation();
3454     }
3455   }
3456 
3457   //TODO, Think that gets/puts and deletes should be refactored a bit so that
3458   //the getting of the lock happens before, so that you would just pass it into
3459   //the methods. So in the case of checkAndMutate you could just do lockRow,
3460   //get, put, unlockRow or something
3461   /**
3462    * Atomically checks the value of a cell and, if the check passes, executes the passed row mutations.
3463    * @throws IOException
3464    * @return true if the row mutations were applied (the check passed), false otherwise
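        * <p>A minimal caller-side sketch (illustrative only): apply a put and a delete to the
        * same row atomically, guarded by a value check.
        * <pre>{@code
        * Put p = new Put(row);
        * p.addColumn(family, qualA, newValue);
        * Delete d = new Delete(row);
        * d.addColumn(family, qualB);
        * RowMutations rm = new RowMutations(row);
        * rm.add(p);
        * rm.add(d);
        * boolean applied = region.checkAndRowMutate(row, family, qualA,
        *     CompareOp.EQUAL, new BinaryComparator(expectedValue), rm, true);
        * }</pre>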
3465    */
3466   public boolean checkAndRowMutate(byte [] row, byte [] family, byte [] qualifier,
3467       CompareOp compareOp, ByteArrayComparable comparator, RowMutations rm,
3468       boolean writeToWAL)
3469       throws IOException{
3470     checkReadOnly();
3471     //TODO, add check for value length or maybe even better move this to the
3472     //client if this becomes a global setting
3473     checkResources();
3474 
3475     startRegionOperation();
3476     try {
3477       Get get = new Get(row);
3478       checkFamily(family);
3479       get.addColumn(family, qualifier);
3480 
3481       // Lock row - note that doBatchMutate will relock this row if called
3482       RowLock rowLock = getRowLock(get.getRow());
3483       // wait for all previous transactions to complete (with lock held)
3484       mvcc.waitForPreviousTransactionsComplete();
3485       try {
3486         List<Cell> result = get(get, false);
3487 
3488         boolean valueIsNull = comparator.getValue() == null ||
3489             comparator.getValue().length == 0;
3490         boolean matches = false;
3491         if (result.size() == 0 && valueIsNull) {
3492           matches = true;
3493         } else if (result.size() > 0 && result.get(0).getValueLength() == 0 &&
3494             valueIsNull) {
3495           matches = true;
3496         } else if (result.size() == 1 && !valueIsNull) {
3497           Cell kv = result.get(0);
3498           int compareResult = comparator.compareTo(kv.getValueArray(),
3499               kv.getValueOffset(), kv.getValueLength());
3500           switch (compareOp) {
3501           case LESS:
3502             matches = compareResult < 0;
3503             break;
3504           case LESS_OR_EQUAL:
3505             matches = compareResult <= 0;
3506             break;
3507           case EQUAL:
3508             matches = compareResult == 0;
3509             break;
3510           case NOT_EQUAL:
3511             matches = compareResult != 0;
3512             break;
3513           case GREATER_OR_EQUAL:
3514             matches = compareResult >= 0;
3515             break;
3516           case GREATER:
3517             matches = compareResult > 0;
3518             break;
3519           default:
3520             throw new RuntimeException("Unknown Compare op " + compareOp.name());
3521           }
3522         }
3523         //If matches put the new put or delete the new delete
3524         if (matches) {
3525           // All edits for the given row (across all column families) must
3526           // happen atomically.
3527           mutateRow(rm);
3528           this.checkAndMutateChecksPassed.increment();
3529           return true;
3530         }
3531         this.checkAndMutateChecksFailed.increment();
3532         return false;
3533       } finally {
3534         rowLock.release();
3535       }
3536     } finally {
3537       closeRegionOperation();
3538     }
3539   }
3540   private void doBatchMutate(Mutation mutation) throws IOException {
3541     // Currently this is only called for puts and deletes, so no nonces.
3542     OperationStatus[] batchMutate = this.batchMutate(new Mutation[] { mutation },
3543         HConstants.NO_NONCE, HConstants.NO_NONCE);
3544     if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.SANITY_CHECK_FAILURE)) {
3545       throw new FailedSanityCheckException(batchMutate[0].getExceptionMsg());
3546     } else if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.BAD_FAMILY)) {
3547       throw new NoSuchColumnFamilyException(batchMutate[0].getExceptionMsg());
3548     }
3549   }
3550 
3551   /**
3552    * Complete taking the snapshot on the region. Writes the region info and adds references to the
3553    * working snapshot directory.
3554    *
3555    * TODO for api consistency, consider adding another version with no {@link ForeignExceptionSnare}
3556    * arg.  (In the future other cancellable HRegion methods could eventually add a
3557    * {@link ForeignExceptionSnare}, or we could do something fancier).
3558    *
3559    * @param desc snapshot description object
3560    * @param exnSnare ForeignExceptionSnare that captures external exceptions in case we need to
3561    *   bail out.  This is allowed to be null and will just be ignored in that case.
3562    * @throws IOException if there is an external or internal error causing the snapshot to fail
3563    */
3564   public void addRegionToSnapshot(SnapshotDescription desc,
3565       ForeignExceptionSnare exnSnare) throws IOException {
3566     Path rootDir = FSUtils.getRootDir(conf);
3567     Path snapshotDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(desc, rootDir);
3568 
3569     SnapshotManifest manifest = SnapshotManifest.create(conf, getFilesystem(),
3570                                                         snapshotDir, desc, exnSnare);
3571     manifest.addRegion(this);
3572   }
3573 
3574   /**
3575    * Replaces any KV timestamps set to {@link HConstants#LATEST_TIMESTAMP} with the
3576    * provided current timestamp.
3577    * @throws IOException
3578    */
3579   void updateCellTimestamps(final Iterable<List<Cell>> cellItr, final byte[] now)
3580       throws IOException {
3581     for (List<Cell> cells: cellItr) {
3582       if (cells == null) continue;
3583       assert cells instanceof RandomAccess;
3584       int listSize = cells.size();
3585       for (int i = 0; i < listSize; i++) {
3586         CellUtil.updateLatestStamp(cells.get(i), now, 0);
3587       }
3588     }
3589   }
3590 
3591   /**
3592    * Possibly rewrite incoming cell tags.
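        * <p>Currently the only rewrite is adding a cell TTL tag when the mutation carries a
        * TTL. For example (illustrative only), a mutation built as
        * <pre>{@code
        * Put p = new Put(row);
        * p.addColumn(family, qualifier, value);
        * p.setTTL(60 * 1000L);   // per-mutation cell TTL, in milliseconds
        * }</pre>
        * will have a {@code TagType.TTL_TAG_TYPE} tag appended to each of its cells here.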
3593    */
3594   void rewriteCellTags(Map<byte[], List<Cell>> familyMap, final Mutation m) {
3595     // Check if we have any work to do and early out otherwise
3596     // Update these checks as more logic is added here
3597 
3598     if (m.getTTL() == Long.MAX_VALUE) {
3599       return;
3600     }
3601 
3602     // From this point we know we have some work to do
3603 
3604     for (Map.Entry<byte[], List<Cell>> e: familyMap.entrySet()) {
3605       List<Cell> cells = e.getValue();
3606       assert cells instanceof RandomAccess;
3607       int listSize = cells.size();
3608       for (int i = 0; i < listSize; i++) {
3609         Cell cell = cells.get(i);
3610         List<Tag> newTags = new ArrayList<Tag>();
3611         Iterator<Tag> tagIterator = CellUtil.tagsIterator(cell.getTagsArray(),
3612           cell.getTagsOffset(), cell.getTagsLength());
3613 
3614         // Carry forward existing tags
3615 
3616         while (tagIterator.hasNext()) {
3617 
3618           // Add any filters or tag specific rewrites here
3619 
3620           newTags.add(tagIterator.next());
3621         }
3622 
3623         // Cell TTL handling
3624 
3625         // Check again if we need to add a cell TTL because early out logic
3626         // above may change when there are more tag based features in core.
3627         if (m.getTTL() != Long.MAX_VALUE) {
3628           // Add a cell TTL tag
3629           newTags.add(new Tag(TagType.TTL_TAG_TYPE, Bytes.toBytes(m.getTTL())));
3630         }
3631 
3632         // Rewrite the cell with the updated set of tags
3633 
3634         cells.set(i, new KeyValue(cell.getRowArray(), cell.getRowOffset(), cell.getRowLength(),
3635           cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength(),
3636           cell.getQualifierArray(), cell.getQualifierOffset(), cell.getQualifierLength(),
3637           cell.getTimestamp(), KeyValue.Type.codeToType(cell.getTypeByte()),
3638           cell.getValueArray(), cell.getValueOffset(), cell.getValueLength(),
3639           newTags));
3640       }
3641     }
3642   }
3643 
3644   /*
3645    * Check whether we have the resources to support an update.
3646    *
3647    * We throw RegionTooBusyException if we are above the memstore limit
3648    * and expect the client to retry using some kind of backoff.
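        *
        * A hypothetical caller-side sketch of that backoff (names are illustrative;
        * InterruptedException handling omitted):
        *
        *   long backoffMs = 100;
        *   while (true) {
        *     try {
        *       region.put(put);
        *       break;
        *     } catch (RegionTooBusyException e) {
        *       Thread.sleep(backoffMs);                    // back off before retrying
        *       backoffMs = Math.min(backoffMs * 2, 10000);
        *     }
        *   }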
3649   */
3650   private void checkResources() throws RegionTooBusyException {
3651     // If catalog region, do not impose resource constraints or block updates.
3652     if (this.getRegionInfo().isMetaRegion()) return;
3653 
3654     if (this.memstoreSize.get() > this.blockingMemStoreSize) {
3655       blockedRequestsCount.increment();
3656       requestFlush();
3657       throw new RegionTooBusyException("Above memstore limit, " +
3658           "regionName=" + (this.getRegionInfo() == null ? "unknown" :
3659           this.getRegionInfo().getRegionNameAsString()) +
3660           ", server=" + (this.getRegionServerServices() == null ? "unknown" :
3661           this.getRegionServerServices().getServerName()) +
3662           ", memstoreSize=" + memstoreSize.get() +
3663           ", blockingMemStoreSize=" + blockingMemStoreSize);
3664     }
3665   }
3666 
3667   /**
3668    * @throws IOException Throws exception if region is in read-only mode.
3669    */
3670   protected void checkReadOnly() throws IOException {
3671     if (this.writestate.isReadOnly()) {
3672       throw new IOException("region is read only");
3673     }
3674   }
3675 
3676   protected void checkReadsEnabled() throws IOException {
3677     if (!this.writestate.readsEnabled) {
3678       throw new IOException(getRegionInfo().getEncodedName()
3679         + ": The region's reads are disabled. Cannot serve the request");
3680     }
3681   }
3682 
3683   public void setReadsEnabled(boolean readsEnabled) {
3684    if (readsEnabled && !this.writestate.readsEnabled) {
3685      LOG.info(getRegionInfo().getEncodedName() + " : Enabling reads for region.");
3686     }
3687     this.writestate.setReadsEnabled(readsEnabled);
3688   }
3689 
3690   /**
3691    * Add updates first to the wal and then add values to memstore.
3692    * Warning: Assumption is caller has lock on passed in row.
3693    * @param edits Cell updates by column
3694    * @throws IOException
3695    */
3696   private void put(final byte [] row, byte [] family, List<Cell> edits)
3697   throws IOException {
3698     NavigableMap<byte[], List<Cell>> familyMap;
3699     familyMap = new TreeMap<byte[], List<Cell>>(Bytes.BYTES_COMPARATOR);
3700 
3701     familyMap.put(family, edits);
3702     Put p = new Put(row);
3703     p.setFamilyCellMap(familyMap);
3704     doBatchMutate(p);
3705   }
3706 
3707   /**
3708    * Atomically apply the given map of family->edits to the memstore.
3709    * This handles the consistency control on its own, but the caller
3710    * should already have locked updatesLock.readLock(). This also does
3711    * <b>not</b> check the families for validity.
3712    *
3713    * @param familyMap Map of kvs per family
3714    * @param mvccNum The MVCC for this transaction.
3715    * @param isInReplay true when adding replayed KVs into memstore
3716    * @return the additional memory usage of the memstore caused by the
3717    * new entries.
3718    */
3719   private long applyFamilyMapToMemstore(Map<byte[], List<Cell>> familyMap,
3720     long mvccNum, List<Cell> memstoreCells, boolean isInReplay) throws IOException {
3721     long size = 0;
3722 
3723     for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
3724       byte[] family = e.getKey();
3725       List<Cell> cells = e.getValue();
3726       assert cells instanceof RandomAccess;
3727       Store store = getStore(family);
3728       int listSize = cells.size();
3729       for (int i=0; i < listSize; i++) {
3730         Cell cell = cells.get(i);
3731         CellUtil.setSequenceId(cell, mvccNum);
3732         Pair<Long, Cell> ret = store.add(cell);
3733         size += ret.getFirst();
3734         memstoreCells.add(ret.getSecond());
3735         if(isInReplay) {
3736           // set memstore newly added cells with replay mvcc number
3737           CellUtil.setSequenceId(ret.getSecond(), mvccNum);
3738         }
3739       }
3740     }
3741 
3742     return size;
3743   }
3744 
3745   /**
3746    * Remove all the keys listed in the map from the memstore. This method is
3747    * called when a Put/Delete has updated memstore but subsequently fails to update
3748    * the wal. This method is then invoked to rollback the memstore.
3749    */
3750   private void rollbackMemstore(List<Cell> memstoreCells) {
3751     int kvsRolledback = 0;
3752 
3753     for (Cell cell : memstoreCells) {
3754       byte[] family = CellUtil.cloneFamily(cell);
3755       Store store = getStore(family);
3756       store.rollback(cell);
3757       kvsRolledback++;
3758     }
3759     LOG.debug("rollbackMemstore rolled back " + kvsRolledback);
3760   }
3761 
3762   /**
3763    * Check the collection of families for validity.
3764    * @throws NoSuchColumnFamilyException if a family does not exist.
3765    */
3766   void checkFamilies(Collection<byte[]> families)
3767   throws NoSuchColumnFamilyException {
3768     for (byte[] family : families) {
3769       checkFamily(family);
3770     }
3771   }
3772 
3773   /**
3774    * During replay, column families may have been removed between the region server
3775    * failure and the replay; edits for such families are dropped here.
3776    */
3777   private void removeNonExistentColumnFamilyForReplay(
3778       final Map<byte[], List<Cell>> familyMap) {
3779     List<byte[]> nonExistentList = null;
3780     for (byte[] family : familyMap.keySet()) {
3781       if (!this.htableDescriptor.hasFamily(family)) {
3782         if (nonExistentList == null) {
3783           nonExistentList = new ArrayList<byte[]>();
3784         }
3785         nonExistentList.add(family);
3786       }
3787     }
3788     if (nonExistentList != null) {
3789       for (byte[] family : nonExistentList) {
3790         // Perhaps schema was changed between crash and replay
3791         LOG.info("No family for " + Bytes.toString(family) + "; omitting it from replay.");
3792         familyMap.remove(family);
3793       }
3794     }
3795   }
3796 
3797   void checkTimestamps(final Map<byte[], List<Cell>> familyMap,
3798       long now) throws FailedSanityCheckException {
3799     if (timestampSlop == HConstants.LATEST_TIMESTAMP) {
3800       return;
3801     }
3802     long maxTs = now + timestampSlop;
3803     for (List<Cell> kvs : familyMap.values()) {
3804       assert kvs instanceof RandomAccess;
3805       int listSize  = kvs.size();
3806       for (int i=0; i < listSize; i++) {
3807         Cell cell = kvs.get(i);
3808         // see if the user-side TS is out of range. latest = server-side
3809         long ts = cell.getTimestamp();
3810         if (ts != HConstants.LATEST_TIMESTAMP && ts > maxTs) {
3811           throw new FailedSanityCheckException("Timestamp for KV out of range "
3812               + cell + " (too.new=" + timestampSlop + ")");
3813         }
3814       }
3815     }
3816   }
3817 
3818   /**
3819    * Append the given map of family->edits to a WALEdit data structure.
3820    * This does not write to the WAL itself.
3821    * @param familyMap map of family->edits
3822    * @param walEdit the destination entry to append into
3823    */
3824   private void addFamilyMapToWALEdit(Map<byte[], List<Cell>> familyMap,
3825       WALEdit walEdit) {
3826     for (List<Cell> edits : familyMap.values()) {
3827       assert edits instanceof RandomAccess;
3828       int listSize = edits.size();
3829       for (int i=0; i < listSize; i++) {
3830         Cell cell = edits.get(i);
3831         walEdit.add(cell);
3832       }
3833     }
3834   }
3835 
3836   private void requestFlush() {
3837     if (this.rsServices == null) {
3838       return;
3839     }
3840     synchronized (writestate) {
3841       if (this.writestate.isFlushRequested()) {
3842         return;
3843       }
3844       writestate.flushRequested = true;
3845     }
3846     // Make request outside of synchronize block; HBASE-818.
3847     this.rsServices.getFlushRequester().requestFlush(this, false);
3848     if (LOG.isDebugEnabled()) {
3849       LOG.debug("Flush requested on " + this);
3850     }
3851   }
3852 
3853   /*
3854    * @param size
3855    * @return True if size is over the flush threshold
3856    */
3857   private boolean isFlushSize(final long size) {
3858     return size > this.memstoreFlushSize;
3859   }
3860 
3861   /**
3862    * Read the edits put under this region by wal splitting process.  Put
3863    * the recovered edits back up into this region.
3864    *
3865    * <p>We can ignore any wal message that has a sequence ID that's equal to or
3866    * lower than minSeqId.  (Because we know such messages are already
3867    * reflected in the HFiles.)
3868    *
3869    * <p>While this is running we are putting pressure on memory yet we are
3870    * outside of our usual accounting because we are not yet an onlined region
3871    * (this stuff is being run as part of Region initialization).  This means
3872    * that if we're up against global memory limits, we'll not be flagged to flush
3873    * because we are not online. We can't be flushed by the usual mechanisms anyway;
3874    * we're not yet online so our relative sequenceids are not yet aligned with
3875    * WAL sequenceids -- not till we come up online, post processing of split
3876    * edits.
3877    *
3878    * <p>But to help relieve memory pressure, we at least manage our own heap size by
3879    * flushing if we are in excess of the per-region limits.  When flushing, though, we
3880    * have to be careful to avoid using the regionserver/wal sequence id.  It runs on
3881    * a different track from what is going on here in this region context, so if we
3882    * crashed while replaying these edits but in the midst had a flush that used the
3883    * regionserver wal with a sequence id in excess of what is going on here
3884    * in this region with its split editlogs, then we could miss edits the
3885    * next time we go to recover.  So we have to flush inline, using sequence ids that
3886    * make sense in this single region context only -- until we come online.
3887    *
3888    * @param maxSeqIdInStores Any edit found in split editlogs needs to be in excess of
3889    * the maxSeqId for the store to be applied; otherwise it is skipped.
3890    * @return the sequence id of the last edit added to this region out of the
3891    * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
3892    * @throws UnsupportedEncodingException
3893    * @throws IOException
3894    */
3895   protected long replayRecoveredEditsIfAny(final Path regiondir,
3896       Map<byte[], Long> maxSeqIdInStores,
3897       final CancelableProgressable reporter, final MonitoredTask status)
3898       throws IOException {
3899     long minSeqIdForTheRegion = -1;
3900     for (Long maxSeqIdInStore : maxSeqIdInStores.values()) {
3901       if (maxSeqIdInStore < minSeqIdForTheRegion || minSeqIdForTheRegion == -1) {
3902         minSeqIdForTheRegion = maxSeqIdInStore;
3903       }
3904     }
3905     long seqid = minSeqIdForTheRegion;
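      // minSeqIdForTheRegion is the smallest of the per-store maximum sequence ids computed above.
      // Illustrative example: with stores {f1: maxSeqId=10, f2: maxSeqId=7} it becomes 7, so a
      // recovered-edits file whose own maximum sequence id is <= 7 is skipped wholesale below,
      // while newer files are replayed with per-store filtering against maxSeqIdInStores.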
3906 
3907     FileSystem fs = this.fs.getFileSystem();
3908     NavigableSet<Path> files = WALSplitter.getSplitEditFilesSorted(fs, regiondir);
3909     if (LOG.isDebugEnabled()) {
3910       LOG.debug("Found " + (files == null ? 0 : files.size())
3911         + " recovered edits file(s) under " + regiondir);
3912     }
3913 
3914     if (files == null || files.isEmpty()) return seqid;
3915 
3916     for (Path edits: files) {
3917       if (edits == null || !fs.exists(edits)) {
3918         LOG.warn("Null or non-existent edits file: " + edits);
3919         continue;
3920       }
3921       if (isZeroLengthThenDelete(fs, edits)) continue;
3922 
3923       long maxSeqId;
3924       String fileName = edits.getName();
3925       maxSeqId = Math.abs(Long.parseLong(fileName));
3926       if (maxSeqId <= minSeqIdForTheRegion) {
3927         if (LOG.isDebugEnabled()) {
3928           String msg = "Maximum sequenceid for this wal is " + maxSeqId
3929             + " and minimum sequenceid for the region is " + minSeqIdForTheRegion
3930             + ", skipped the whole file, path=" + edits;
3931           LOG.debug(msg);
3932         }
3933         continue;
3934       }
3935 
3936       try {
3937         // replay the edits. Replay can return -1 if everything is skipped, only update
3938         // if seqId is greater
3939         seqid = Math.max(seqid, replayRecoveredEdits(edits, maxSeqIdInStores, reporter));
3940       } catch (IOException e) {
3941         boolean skipErrors = conf.getBoolean(
3942             HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS,
3943             conf.getBoolean(
3944                 "hbase.skip.errors",
3945                 HConstants.DEFAULT_HREGION_EDITS_REPLAY_SKIP_ERRORS));
3946         if (conf.get("hbase.skip.errors") != null) {
3947           LOG.warn(
3948               "The property 'hbase.skip.errors' has been deprecated. Please use " +
3949               HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + " instead.");
3950         }
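        // HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS is "hbase.hregion.edits.replay.skip.errors"
        // (default false). When it is set to true, a bad edits file is moved aside below and
        // replay continues instead of the IOException being rethrown.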
3951         if (skipErrors) {
3952           Path p = WALSplitter.moveAsideBadEditsFile(fs, edits);
3953           LOG.error(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS
3954               + "=true so continuing. Renamed " + edits +
3955               " as " + p, e);
3956         } else {
3957           throw e;
3958         }
3959       }
3960     }
3961     // The edits size added into rsAccounting during this replaying will not
3962     // be required any more. So just clear it.
3963     if (this.rsAccounting != null) {
3964       this.rsAccounting.clearRegionReplayEditsSize(this.getRegionName());
3965     }
3966     if (seqid > minSeqIdForTheRegion) {
3967       // Then we added some edits to memory. Flush and cleanup split edit files.
3968       internalFlushcache(null, seqid, stores.values(), status, false);
3969     }
3970     // Now delete the content of recovered edits.  We're done w/ them.
3971     if (files.size() > 0 && this.conf.getBoolean("hbase.region.archive.recovered.edits", false)) {
3972       // For debugging data loss issues!
3973       // If this flag is set, make use of the hfile archiving by making recovered.edits a fake
3974       // column family. Have to fake out file type too by casting our recovered.edits as storefiles
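      // The flag checked above is "hbase.region.archive.recovered.edits" (read with a default of
      // false); when it is enabled, the replayed files are archived through the store file
      // archiver rather than deleted outright in the else branch below.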
3975       String fakeFamilyName = WALSplitter.getRegionDirRecoveredEditsDir(regiondir).getName();
3976       Set<StoreFile> fakeStoreFiles = new HashSet<StoreFile>(files.size());
3977       for (Path file: files) {
3978         fakeStoreFiles.add(new StoreFile(getRegionFileSystem().getFileSystem(), file, this.conf,
3979           null, null));
3980       }
3981       getRegionFileSystem().removeStoreFiles(fakeFamilyName, fakeStoreFiles);
3982     } else {
3983       for (Path file: files) {
3984         if (!fs.delete(file, false)) {
3985           LOG.error("Failed delete of " + file);
3986         } else {
3987           LOG.debug("Deleted recovered.edits file=" + file);
3988         }
3989       }
3990     }
3991     return seqid;
3992   }
3993 
3994   /*
3995    * @param edits File of recovered edits.
3996    * @param maxSeqIdInStores Maximum sequenceid found in each store.  Edits in wal
3997    * must be larger than this to be replayed for each store.
3998    * @param reporter
3999    * @return the highest sequence id seen in the recovered edits log, or -1 if the log
4000    * contained no edits.
4001    * @throws IOException
4002    */
4003   private long replayRecoveredEdits(final Path edits,
4004       Map<byte[], Long> maxSeqIdInStores, final CancelableProgressable reporter)
4005     throws IOException {
4006     String msg = "Replaying edits from " + edits;
4007     LOG.info(msg);
4008     MonitoredTask status = TaskMonitor.get().createStatus(msg);
4009     FileSystem fs = this.fs.getFileSystem();
4010 
4011     status.setStatus("Opening recovered edits");
4012     WAL.Reader reader = null;
4013     try {
4014       reader = WALFactory.createReader(fs, edits, conf);
4015       long currentEditSeqId = -1;
4016       long currentReplaySeqId = -1;
4017       long firstSeqIdInLog = -1;
4018       long skippedEdits = 0;
4019       long editsCount = 0;
4020       long intervalEdits = 0;
4021       WAL.Entry entry;
4022       Store store = null;
4023       boolean reported_once = false;
4024       ServerNonceManager ng = this.rsServices == null ? null : this.rsServices.getNonceManager();
4025 
4026       try {
4027         // How many edits seen before we check elapsed time
4028         int interval = this.conf.getInt("hbase.hstore.report.interval.edits", 2000);
4029         // How often to send a progress report (default 1/2 master timeout)
4030         int period = this.conf.getInt("hbase.hstore.report.period", 300000);
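        // Both values are plain Configuration reads, so they can be tuned in hbase-site.xml;
        // an illustrative override (the values shown equal the defaults above):
        //   <property><name>hbase.hstore.report.interval.edits</name><value>2000</value></property>
        //   <property><name>hbase.hstore.report.period</name><value>300000</value></property>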
4031         long lastReport = EnvironmentEdgeManager.currentTime();
4032 
4033         while ((entry = reader.next()) != null) {
4034           WALKey key = entry.getKey();
4035           WALEdit val = entry.getEdit();
4036 
4037           if (ng != null) { // ng is null in some tests, or when nonces are disabled
4038             ng.reportOperationFromWal(key.getNonceGroup(), key.getNonce(), key.getWriteTime());
4039           }
4040 
4041           if (reporter != null) {
4042             intervalEdits += val.size();
4043             if (intervalEdits >= interval) {
4044               // Number of edits interval reached
4045               intervalEdits = 0;
4046               long cur = EnvironmentEdgeManager.currentTime();
4047               if (lastReport + period <= cur) {
4048                 status.setStatus("Replaying edits..." +
4049                     " skipped=" + skippedEdits +
4050                     " edits=" + editsCount);
4051                 // Report period elapsed
4052                 if(!reporter.progress()) {
4053                   msg = "Progressable reporter failed, stopping replay";
4054                   LOG.warn(msg);
4055                   status.abort(msg);
4056                   throw new IOException(msg);
4057                 }
4058                 reported_once = true;
4059                 lastReport = cur;
4060               }
4061             }
4062           }
4063 
4064           if (firstSeqIdInLog == -1) {
4065             firstSeqIdInLog = key.getLogSeqNum();
4066           }
4067           if (currentEditSeqId > key.getLogSeqNum()) {
4068             // when this condition is true, it means we have a serious defect because we need to
4069             // maintain increasing SeqId for WAL edits per region
4070             LOG.error(getRegionInfo().getEncodedName() + " : "
4071                  + "Found decreasing SeqId. PreId=" + currentEditSeqId + " key=" + key
4072                 + "; edit=" + val);
4073           } else {
4074             currentEditSeqId = key.getLogSeqNum();
4075           }
4076           currentReplaySeqId = (key.getOrigLogSeqNum() > 0) ?
4077             key.getOrigLogSeqNum() : currentEditSeqId;
4078 
4079           // Start coprocessor replay here. The coprocessor is for each WALEdit
4080           // instead of a KeyValue.
4081           if (coprocessorHost != null) {
4082             status.setStatus("Running pre-WAL-restore hook in coprocessors");
4083             if (coprocessorHost.preWALRestore(this.getRegionInfo(), key, val)) {
4084               // if bypass this wal entry, ignore it ...
4085               continue;
4086             }
4087           }
4088           // Check this edit is for this region.
4089           if (!Bytes.equals(key.getEncodedRegionName(),
4090               this.getRegionInfo().getEncodedNameAsBytes())) {
4091             skippedEdits++;
4092             continue;
4093           }
4094 
4095           boolean flush = false;
4096           for (Cell cell: val.getCells()) {
4097             // Check this edit is for me. Also, guard against writing the special
4098             // METACOLUMN info such as HBASE::CACHEFLUSH entries
4099             if (CellUtil.matchingFamily(cell, WALEdit.METAFAMILY)) {
4100               //this is a special edit, we should handle it
4101               CompactionDescriptor compaction = WALEdit.getCompaction(cell);
4102               if (compaction != null) {
4103                 //replay the compaction
4104                 replayWALCompactionMarker(compaction, false, true, Long.MAX_VALUE);
4105               }
4106               skippedEdits++;
4107               continue;
4108             }
4109             // Figure which store the edit is meant for.
4110             if (store == null || !CellUtil.matchingFamily(cell, store.getFamily().getName())) {
4111               store = getStore(cell);
4112             }
4113             if (store == null) {
4114               // This should never happen.  Perhaps schema was changed between
4115               // crash and redeploy?
4116               LOG.warn("No family for " + cell);
4117               skippedEdits++;
4118               continue;
4119             }
4120             // Now, figure if we should skip this edit.
4121             if (key.getLogSeqNum() <= maxSeqIdInStores.get(store.getFamily()
4122                 .getName())) {
4123               skippedEdits++;
4124               continue;
4125             }
4126             CellUtil.setSequenceId(cell, currentReplaySeqId);
4127 
4128             // Once we are over the limit, restoreEdit will keep returning true to
4129             // flush -- but don't flush until we've played all the kvs that make up
4130             // the WALEdit.
4131             flush |= restoreEdit(store, cell);
4132             editsCount++;
4133           }
4134           if (flush) {
4135             internalFlushcache(null, currentEditSeqId, stores.values(), status, false);
4136           }
4137 
4138           if (coprocessorHost != null) {
4139             coprocessorHost.postWALRestore(this.getRegionInfo(), key, val);
4140           }
4141         }
4142       } catch (EOFException eof) {
4143         Path p = WALSplitter.moveAsideBadEditsFile(fs, edits);
4144         msg = "Encountered EOF. Most likely due to Master failure during " +
4145             "wal splitting, so we have this data in another edit.  " +
4146             "Continuing, but renaming " + edits + " as " + p;
4147         LOG.warn(msg, eof);
4148         status.abort(msg);
4149       } catch (IOException ioe) {
4150         // If the IOE resulted from bad file format,
4151         // then this problem is idempotent and retrying won't help
4152         if (ioe.getCause() instanceof ParseException) {
4153           Path p = WALSplitter.moveAsideBadEditsFile(fs, edits);
4154           msg = "File corruption encountered!  " +
4155               "Continuing, but renaming " + edits + " as " + p;
4156           LOG.warn(msg, ioe);
4157           status.setStatus(msg);
4158         } else {
4159           status.abort(StringUtils.stringifyException(ioe));
4160           // other IO errors may be transient (bad network connection,
4161           // checksum exception on one datanode, etc).  throw & retry
4162           throw ioe;
4163         }
4164       }
4165       if (reporter != null && !reported_once) {
4166         reporter.progress();
4167       }
4168       msg = "Applied " + editsCount + ", skipped " + skippedEdits +
4169         ", firstSequenceIdInLog=" + firstSeqIdInLog +
4170         ", maxSequenceIdInLog=" + currentEditSeqId + ", path=" + edits;
4171       status.markComplete(msg);
4172       LOG.debug(msg);
4173       return currentEditSeqId;
4174     } finally {
4175       status.cleanup();
4176       if (reader != null) {
4177          reader.close();
4178       }
4179     }
4180   }
4181 
4182   /**
4183    * Call to complete a compaction. It is for the case where we find in the WAL a compaction
4184    * that was not finished.  We could find one while recovering a WAL after a regionserver crash.
4185    * See HBASE-2331.
4186    */
4187   void replayWALCompactionMarker(CompactionDescriptor compaction, boolean pickCompactionFiles,
4188       boolean removeFiles, long replaySeqId)
4189       throws IOException {
4190     checkTargetRegion(compaction.getEncodedRegionName().toByteArray(),
4191       "Compaction marker from WAL ", compaction);
4192 
4193     synchronized (writestate) {
4194       if (replaySeqId < lastReplayedOpenRegionSeqId) {
4195         LOG.warn(getRegionInfo().getEncodedName() + " : "
4196             + "Skipping replaying compaction event :" + TextFormat.shortDebugString(compaction)
4197             + " because its sequence id " + replaySeqId + " is smaller than this region's "
4198             + "lastReplayedOpenRegionSeqId of " + lastReplayedOpenRegionSeqId);
4199         return;
4200       }
4201       if (replaySeqId < lastReplayedCompactionSeqId) {
4202         LOG.warn(getRegionInfo().getEncodedName() + " : "
4203             + "Skipping replaying compaction event :" + TextFormat.shortDebugString(compaction)
4204             + " because its sequence id " + replaySeqId + " is smaller than this region's "
4205             + "lastReplayedCompactionSeqId of " + lastReplayedCompactionSeqId);
4206         return;
4207       } else {
4208         lastReplayedCompactionSeqId = replaySeqId;
4209       }
4210 
4211       if (LOG.isDebugEnabled()) {
4212         LOG.debug(getRegionInfo().getEncodedName() + " : "
4213             + "Replaying compaction marker " + TextFormat.shortDebugString(compaction)
4214             + " with seqId=" + replaySeqId + " and lastReplayedOpenRegionSeqId="
4215             + lastReplayedOpenRegionSeqId);
4216       }
4217 
4218       startRegionOperation(Operation.REPLAY_EVENT);
4219       try {
4220         Store store = this.getStore(compaction.getFamilyName().toByteArray());
4221         if (store == null) {
4222           LOG.warn(getRegionInfo().getEncodedName() + " : "
4223               + "Found Compaction WAL edit for deleted family:"
4224               + Bytes.toString(compaction.getFamilyName().toByteArray()));
4225           return;
4226         }
4227         store.replayCompactionMarker(compaction, pickCompactionFiles, removeFiles);
4228         logRegionFiles();
4229       } finally {
4230         closeRegionOperation(Operation.REPLAY_EVENT);
4231       }
4232     }
4233   }
4234 
4235   void replayWALFlushMarker(FlushDescriptor flush, long replaySeqId) throws IOException {
4236     checkTargetRegion(flush.getEncodedRegionName().toByteArray(),
4237       "Flush marker from WAL ", flush);
4238 
4239     if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4240       return; // if primary nothing to do
4241     }
4242 
4243     if (LOG.isDebugEnabled()) {
4244       LOG.debug(getRegionInfo().getEncodedName() + " : "
4245           + "Replaying flush marker " + TextFormat.shortDebugString(flush));
4246     }
4247 
4248     startRegionOperation(Operation.REPLAY_EVENT); // use region close lock to guard against close
4249     try {
4250       FlushAction action = flush.getAction();
4251       switch (action) {
4252       case START_FLUSH:
4253         replayWALFlushStartMarker(flush);
4254         break;
4255       case COMMIT_FLUSH:
4256         replayWALFlushCommitMarker(flush);
4257         break;
4258       case ABORT_FLUSH:
4259         replayWALFlushAbortMarker(flush);
4260         break;
4261       case CANNOT_FLUSH:
4262         replayWALFlushCannotFlushMarker(flush, replaySeqId);
4263         break;
4264       default:
4265         LOG.warn(getRegionInfo().getEncodedName() + " : " +
4266           "Received a flush event with unknown action, ignoring. " +
4267           TextFormat.shortDebugString(flush));
4268         break;
4269       }
4270 
4271       logRegionFiles();
4272     } finally {
4273       closeRegionOperation(Operation.REPLAY_EVENT);
4274     }
4275   }
4276 
4277   /** Replay the flush marker from primary region by creating a corresponding snapshot of
4278    * the store memstores, only if the memstores do not have a higher seqId from an earlier wal
4279    * edit (because the events may be coming out of order).
4280    */
4281   @VisibleForTesting
4282   PrepareFlushResult replayWALFlushStartMarker(FlushDescriptor flush) throws IOException {
4283     long flushSeqId = flush.getFlushSequenceNumber();
4284 
4285     HashSet<Store> storesToFlush = new HashSet<Store>();
4286     for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) {
4287       byte[] family = storeFlush.getFamilyName().toByteArray();
4288       Store store = getStore(family);
4289       if (store == null) {
4290         LOG.warn(getRegionInfo().getEncodedName() + " : "
4291           + "Received a flush start marker from primary, but the family is not found. Ignoring"
4292           + " StoreFlushDescriptor:" + TextFormat.shortDebugString(storeFlush));
4293         continue;
4294       }
4295       storesToFlush.add(store);
4296     }
4297 
4298     MonitoredTask status = TaskMonitor.get().createStatus("Preparing flush " + this);
4299 
4300     // we will use writestate as a coarse-grain lock for all the replay events
4301     // (flush, compaction, region open etc)
4302     synchronized (writestate) {
4303       try {
4304         if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) {
4305           LOG.warn(getRegionInfo().getEncodedName() + " : "
4306               + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
4307               + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
4308               + " of " + lastReplayedOpenRegionSeqId);
4309           return null;
4310         }
4311         if (numMutationsWithoutWAL.get() > 0) {
4312           numMutationsWithoutWAL.set(0);
4313           dataInMemoryWithoutWAL.set(0);
4314         }
4315 
4316         if (!writestate.flushing) {
4317           // we do not have an active snapshot and corresponding this.prepareResult. This means
4318           // we can just snapshot our memstores and continue as normal.
4319 
4320           // invoke prepareFlushCache. Send null as wal since we do not want the flush events in wal
4321           PrepareFlushResult prepareResult = internalPrepareFlushCache(null,
4322             flushSeqId, storesToFlush, status, false);
4323           if (prepareResult.result == null) {
4324             // save the PrepareFlushResult so that we can use it later from commit flush
4325             this.writestate.flushing = true;
4326             this.prepareFlushResult = prepareResult;
4327             status.markComplete("Flush prepare successful");
4328             if (LOG.isDebugEnabled()) {
4329               LOG.debug(getRegionInfo().getEncodedName() + " : "
4330                   + " Prepared flush with seqId:" + flush.getFlushSequenceNumber());
4331             }
4332           } else {
4333             // special case empty memstore. We will still save the flush result in this case, since
4334             // our memstore is empty, but the primary is still flushing
4335             if (prepareResult.result.result == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) {
4336               this.writestate.flushing = true;
4337               this.prepareFlushResult = prepareResult;
4338               if (LOG.isDebugEnabled()) {
4339                 LOG.debug(getRegionInfo().getEncodedName() + " : "
4340                   + " Prepared empty flush with seqId:" + flush.getFlushSequenceNumber());
4341               }
4342             }
4343             status.abort("Flush prepare failed with " + prepareResult.result);
4344             // nothing much to do. The prepare flush failed for some reason.
4345           }
4346           return prepareResult;
4347         } else {
4348           // we already have an active snapshot.
4349           if (flush.getFlushSequenceNumber() == this.prepareFlushResult.flushOpSeqId) {
4350             // They define the same flush. Log and continue.
4351             LOG.warn(getRegionInfo().getEncodedName() + " : "
4352                 + "Received a flush prepare marker with the same seqId: "
4353                 + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4354                 + prepareFlushResult.flushOpSeqId + ". Ignoring");
4355             // ignore
4356           } else if (flush.getFlushSequenceNumber() < this.prepareFlushResult.flushOpSeqId) {
4357             // We received a flush with a smaller seqNum than what we have prepared. We can only
4358             // ignore this prepare flush request.
4359             LOG.warn(getRegionInfo().getEncodedName() + " : "
4360                 + "Received a flush prepare marker with a smaller seqId: "
4361                 + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4362                 + prepareFlushResult.flushOpSeqId + ". Ignoring");
4363             // ignore
4364           } else {
4365             // We received a flush with a larger seqNum than what we have prepared
4366             LOG.warn(getRegionInfo().getEncodedName() + " : "
4367                 + "Received a flush prepare marker with a larger seqId: "
4368                 + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4369                 + prepareFlushResult.flushOpSeqId + ". Ignoring");
4370             // We do not have multiple active snapshots in the memstore or a way to merge current
4371             // memstore snapshot with the contents and resnapshot for now. We cannot take
4372             // another snapshot and drop the previous one because that will cause temporary
4373             // data loss in the secondary. So we ignore this for now, deferring the resolution
4374             // to happen when we see the corresponding flush commit marker. If we have a memstore
4375             // snapshot with x, and later received another prepare snapshot with y (where x < y),
4376             // when we see flush commit for y, we will drop snapshot for x, and can also drop all
4377             // the memstore edits if everything in memstore is < y. This is the usual case for
4378             // RS crash + recovery where we might see consecutive prepare flush wal markers.
4379             // Otherwise, this will cause more memory to be used in the secondary replica until a
4380             // further prepare + commit flush is seen and replayed.
4381           }
4382         }
4383       } finally {
4384         status.cleanup();
4385         writestate.notifyAll();
4386       }
4387     }
4388     return null;
4389   }
4390 
4391   @VisibleForTesting
4392   void replayWALFlushCommitMarker(FlushDescriptor flush) throws IOException {
4393     MonitoredTask status = TaskMonitor.get().createStatus("Committing flush " + this);
4394 
4395     // check whether we have the memstore snapshot with the corresponding seqId. Replays to
4396     // secondary region replicas arrive in order, except when the region moves or the
4397     // region server crashes. In those cases, we may receive replay requests out of order from
4398     // the original seqIds.
4399     synchronized (writestate) {
4400       try {
4401         if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) {
4402           LOG.warn(getRegionInfo().getEncodedName() + " : "
4403             + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
4404             + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
4405             + " of " + lastReplayedOpenRegionSeqId);
4406           return;
4407         }
4408 
4409         if (writestate.flushing) {
4410           PrepareFlushResult prepareFlushResult = this.prepareFlushResult;
4411           if (flush.getFlushSequenceNumber() == prepareFlushResult.flushOpSeqId) {
4412             if (LOG.isDebugEnabled()) {
4413               LOG.debug(getRegionInfo().getEncodedName() + " : "
4414                   + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber()
4415                   + " and a previous prepared snapshot was found");
4416             }
4417             // This is the regular case where we received commit flush after prepare flush
4418             // corresponding to the same seqId.
4419             replayFlushInStores(flush, prepareFlushResult, true);
4420 
4421             // Set down the memstore size by amount of flush.
4422             this.addAndGetGlobalMemstoreSize(-prepareFlushResult.totalFlushableSize);
4423 
4424             this.prepareFlushResult = null;
4425             writestate.flushing = false;
4426           } else if (flush.getFlushSequenceNumber() < prepareFlushResult.flushOpSeqId) {
4427             // This should not happen normally. However, let's be safe and guard against these cases
4428             // we received a flush commit with a smaller seqId than what we have prepared
4429             // we will pick the flush file up from this commit (if we have not seen it), but we
4430             // will not drop the memstore
4431             LOG.warn(getRegionInfo().getEncodedName() + " : "
4432                 + "Received a flush commit marker with smaller seqId: "
4433                 + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: "
4434                 + prepareFlushResult.flushOpSeqId + ". Picking up new file, but not dropping"
4435                 + " prepared memstore snapshot");
4436             replayFlushInStores(flush, prepareFlushResult, false);
4437 
4438             // snapshot is not dropped, so memstore sizes should not be decremented
4439             // we still have the prepared snapshot, flushing should still be true
4440           } else {
4441             // This should not happen normally. However, let's be safe and guard against these cases
4442             // we received a flush commit with a larger seqId than what we have prepared
4443             // we will pick the flush file for this. We will also obtain the updates lock and
4444             // look for contents of the memstore to see whether we have edits after this seqId.
4445             // If not, we will drop all the memstore edits and the snapshot as well.
4446             LOG.warn(getRegionInfo().getEncodedName() + " : "
4447                 + "Received a flush commit marker with larger seqId: "
4448                 + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: " +
4449                 prepareFlushResult.flushOpSeqId + ". Picking up new file and dropping prepared"
4450                 +" memstore snapshot");
4451 
4452             replayFlushInStores(flush, prepareFlushResult, true);
4453 
4454             // Set down the memstore size by amount of flush.
4455             this.addAndGetGlobalMemstoreSize(-prepareFlushResult.totalFlushableSize);
4456 
4457             // Inspect the memstore contents to see whether the memstore contains only edits
4458             // with seqId smaller than the flush seqId. If so, we can discard those edits.
4459             dropMemstoreContentsForSeqId(flush.getFlushSequenceNumber(), null);
4460 
4461             this.prepareFlushResult = null;
4462             writestate.flushing = false;
4463           }
4464           // If we were waiting to observe a flush or region opening event before exposing
4465           // partial data after a secondary region crash, we can allow reads now. We can only be
4466           // sure that we are not showing partial data (for example skipping some previous edits)
4467           // once we observe a full flush start and flush commit. So if we were not able to find
4468           // a previous flush we will not enable reads now.
4469           this.setReadsEnabled(true);
4470         } else {
4471           LOG.warn(getRegionInfo().getEncodedName() + " : "
4472               + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber()
4473               + ", but no previous prepared snapshot was found");
4474           // There is no corresponding prepare snapshot from before.
4475           // We will pick up the new flushed file
4476           replayFlushInStores(flush, null, false);
4477 
4478           // Inspect the memstore contents to see whether the memstore contains only edits
4479           // with seqId smaller than the flush seqId. If so, we can discard those edits.
4480           dropMemstoreContentsForSeqId(flush.getFlushSequenceNumber(), null);
4481         }
4482 
4483         status.markComplete("Flush commit successful");
4484 
4485         // Update the last flushed sequence id for region.
4486         this.maxFlushedSeqId = flush.getFlushSequenceNumber();
4487 
4488         // advance the mvcc read point so that the new flushed file is visible.
4489         // there may be some in-flight transactions, but they won't be made visible since they are
4490         // either greater than flush seq number or they were already dropped via flush.
4491         // TODO: If we are using FlushAllStoresPolicy, then this can make edits visible from other
4492         // stores while they are still in flight because the flush commit marker will not contain
4493         // flushes from ALL stores.
4494         getMVCC().advanceMemstoreReadPointIfNeeded(flush.getFlushSequenceNumber());
4495 
4496         // C. Finally notify anyone waiting on memstore to clear:
4497         // e.g. checkResources().
4498         synchronized (this) {
4499           notifyAll(); // FindBugs NN_NAKED_NOTIFY
4500         }
4501       } finally {
4502         status.cleanup();
4503         writestate.notifyAll();
4504       }
4505     }
4506   }
4507 
4508   /**
4509    * Replays the given flush descriptor by opening the flush files in stores and dropping the
4510    * memstore snapshots if requested.
4511    * @param flush
4512    * @param prepareFlushResult
4513    * @param dropMemstoreSnapshot
4514    * @throws IOException
4515    */
4516   private void replayFlushInStores(FlushDescriptor flush, PrepareFlushResult prepareFlushResult,
4517       boolean dropMemstoreSnapshot)
4518       throws IOException {
4519     for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) {
4520       byte[] family = storeFlush.getFamilyName().toByteArray();
4521       Store store = getStore(family);
4522       if (store == null) {
4523         LOG.warn(getRegionInfo().getEncodedName() + " : "
4524             + "Received a flush commit marker from primary, but the family is not found. "
4525             + "Ignoring StoreFlushDescriptor:" + storeFlush);
4526         continue;
4527       }
4528       List<String> flushFiles = storeFlush.getFlushOutputList();
4529       StoreFlushContext ctx = null;
4530       long startTime = EnvironmentEdgeManager.currentTime();
4531       if (prepareFlushResult == null || prepareFlushResult.storeFlushCtxs == null) {
4532         ctx = store.createFlushContext(flush.getFlushSequenceNumber());
4533       } else {
4534         ctx = prepareFlushResult.storeFlushCtxs.get(family);
4535         startTime = prepareFlushResult.startTime;
4536       }
4537 
4538       if (ctx == null) {
4539         LOG.warn(getRegionInfo().getEncodedName() + " : "
4540             + "Unexpected: flush commit marker received from store "
4541             + Bytes.toString(family) + " but no associated flush context. Ignoring");
4542         continue;
4543       }
4544       ctx.replayFlush(flushFiles, dropMemstoreSnapshot); // replay the flush
4545 
4546       // Record latest flush time
4547       this.lastStoreFlushTimeMap.put(store, startTime);
4548     }
4549   }
4550 
4551   /**
4552    * Drops the memstore contents after replaying a flush descriptor or region open event replay
4553    * if the memstore edits have seqNums smaller than the given seq id
4554    * @param seqId memstore contents are dropped only if all current edits are below this sequence id
4555    * @param store the store whose contents should be dropped, or null to drop from all stores
4556    */
4557   private long dropMemstoreContentsForSeqId(long seqId, Store store) throws IOException {
4558     long totalFreedSize = 0;
4559     this.updatesLock.writeLock().lock();
4560     try {
4561       mvcc.waitForPreviousTransactionsComplete();
4562       long currentSeqId = getSequenceId().get();
4563       if (seqId >= currentSeqId) {
4564         // then we can drop the memstore contents since everything is below this seqId
4565         LOG.info(getRegionInfo().getEncodedName() + " : "
4566             + "Dropping memstore contents as well since replayed flush seqId: "
4567             + seqId + " is greater than or equal to current seqId: " + currentSeqId);
4568 
4569         // Prepare flush (take a snapshot) and then abort (drop the snapshot)
4570         if (store == null ) {
4571           for (Store s : stores.values()) {
4572             totalFreedSize += doDropStoreMemstoreContentsForSeqId(s, currentSeqId);
4573           }
4574         } else {
4575           totalFreedSize += doDropStoreMemstoreContentsForSeqId(store, currentSeqId);
4576         }
4577       } else {
4578         LOG.info(getRegionInfo().getEncodedName() + " : "
4579             + "Not dropping memstore contents since replayed flush seqId: "
4580             + seqId + " is smaller than current seqId:" + currentSeqId);
4581       }
4582     } finally {
4583       this.updatesLock.writeLock().unlock();
4584     }
4585     return totalFreedSize;
4586   }
4587 
4588   private long doDropStoreMemstoreContentsForSeqId(Store s, long currentSeqId) throws IOException {
4589     long snapshotSize = s.getFlushableSize();
4590     this.addAndGetGlobalMemstoreSize(-snapshotSize);
4591     StoreFlushContext ctx = s.createFlushContext(currentSeqId);
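    // prepare() moves the current memstore contents into a snapshot, and abort() then discards
    // that snapshot, so the store's current memstore contents are dropped without writing a file.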
4592     ctx.prepare();
4593     ctx.abort();
4594     return snapshotSize;
4595   }
4596 
4597   private void replayWALFlushAbortMarker(FlushDescriptor flush) {
4598     // nothing to do for now. A flush abort will cause a RS abort which means that the region
4599     // will be opened somewhere else later. We will see the region open event soon, and replaying
4600     // that will drop the snapshot
4601   }
4602 
4603   private void replayWALFlushCannotFlushMarker(FlushDescriptor flush, long replaySeqId) {
4604     synchronized (writestate) {
4605       if (this.lastReplayedOpenRegionSeqId > replaySeqId) {
4606         LOG.warn(getRegionInfo().getEncodedName() + " : "
4607           + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
4608           + " because its sequence id " + replaySeqId + " is smaller than this region's "
4609           + "lastReplayedOpenRegionSeqId of " + lastReplayedOpenRegionSeqId);
4610         return;
4611       }
4612 
4613       // If we were waiting to observe a flush or region opening event before exposing partial
4614       // data after a secondary region crash, we can allow reads now. This event means that the
4615       // primary was not able to flush because the memstore was empty when we requested the flush.
4616       // By the time we observe this, we are guaranteed to have an up-to-date seqId with our
4617       // previous assignment.
4618       this.setReadsEnabled(true);
4619     }
4620   }
4621 
4622   @VisibleForTesting
4623   PrepareFlushResult getPrepareFlushResult() {
4624     return prepareFlushResult;
4625   }
4626 
4627   void replayWALRegionEventMarker(RegionEventDescriptor regionEvent) throws IOException {
4628     checkTargetRegion(regionEvent.getEncodedRegionName().toByteArray(),
4629       "RegionEvent marker from WAL ", regionEvent);
4630 
4631     startRegionOperation(Operation.REPLAY_EVENT);
4632     try {
4633       if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4634         return; // if primary nothing to do
4635       }
4636 
4637       if (regionEvent.getEventType() == EventType.REGION_CLOSE) {
4638         // nothing to do on REGION_CLOSE for now.
4639         return;
4640       }
4641       if (regionEvent.getEventType() != EventType.REGION_OPEN) {
4642         LOG.warn(getRegionInfo().getEncodedName() + " : "
4643             + "Unknown region event received, ignoring :"
4644             + TextFormat.shortDebugString(regionEvent));
4645         return;
4646       }
4647 
4648       if (LOG.isDebugEnabled()) {
4649         LOG.debug(getRegionInfo().getEncodedName() + " : "
4650           + "Replaying region open event marker " + TextFormat.shortDebugString(regionEvent));
4651       }
4652 
4653       // we will use writestate as a coarse-grain lock for all the replay events
4654       synchronized (writestate) {
4655         // Replication can deliver events out of order when the primary region moves or the region
4656         // server crashes, since there is no coordination between replication of different wal files
4657         // belonging to different region servers. We have to safeguard against this case by using the
4658         // region open event's seqid. Since this is the first event that the region puts (after
4659         // possibly flushing recovered.edits), after seeing this event, we can ignore every edit
4660         // smaller than this seqId.
4661         if (this.lastReplayedOpenRegionSeqId <= regionEvent.getLogSequenceNumber()) {
4662           this.lastReplayedOpenRegionSeqId = regionEvent.getLogSequenceNumber();
4663         } else {
4664           LOG.warn(getRegionInfo().getEncodedName() + " : "
4665             + "Skipping replaying region event :" + TextFormat.shortDebugString(regionEvent)
4666             + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
4667             + " of " + lastReplayedOpenRegionSeqId);
4668           return;
4669         }
4670 
4671         // The region open event lists all the files that the region had at the time of the opening.
4672         // Just pick up all the files, drop any prepared flushes, and empty the memstores.
4673         for (StoreDescriptor storeDescriptor : regionEvent.getStoresList()) {
4674           // stores of primary may be different now
4675           byte[] family = storeDescriptor.getFamilyName().toByteArray();
4676           Store store = getStore(family);
4677           if (store == null) {
4678             LOG.warn(getRegionInfo().getEncodedName() + " : "
4679                 + "Received a region open marker from primary, but the family is not found. "
4680                 + "Ignoring. StoreDescriptor:" + storeDescriptor);
4681             continue;
4682           }
4683 
4684           long storeSeqId = store.getMaxSequenceId();
4685           List<String> storeFiles = storeDescriptor.getStoreFileList();
4686           store.refreshStoreFiles(storeFiles); // replace the files with the new ones
4687           if (store.getMaxSequenceId() != storeSeqId) {
4688             // Record latest flush time if we picked up new files
4689             lastStoreFlushTimeMap.put(store, EnvironmentEdgeManager.currentTime());
4690           }
4691 
4692           if (writestate.flushing) {
4693             // only drop memstore snapshots if they are smaller than last flush for the store
4694             if (this.prepareFlushResult.flushOpSeqId <= regionEvent.getLogSequenceNumber()) {
4695               StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null ?
4696                   null : this.prepareFlushResult.storeFlushCtxs.get(family);
4697               if (ctx != null) {
4698                 long snapshotSize = store.getFlushableSize();
4699                 ctx.abort();
4700                 this.addAndGetGlobalMemstoreSize(-snapshotSize);
4701                 this.prepareFlushResult.storeFlushCtxs.remove(family);
4702               }
4703             }
4704           }
4705 
4706           // Drop the memstore contents if they are now smaller than the latest seen flushed file
4707           dropMemstoreContentsForSeqId(regionEvent.getLogSequenceNumber(), store);
4708           if (storeSeqId > this.maxFlushedSeqId) {
4709             this.maxFlushedSeqId = storeSeqId;
4710           }
4711         }
4712 
4713         // if all stores ended up dropping their snapshots, we can safely drop the
4714         // prepareFlushResult
4715         dropPrepareFlushIfPossible();
4716 
4717         // advance the mvcc read point so that the new flushed file is visible.
4718         // there may be some in-flight transactions, but they won't be made visible since they are
4719         // either greater than flush seq number or they were already dropped via flush.
4720         getMVCC().advanceMemstoreReadPointIfNeeded(this.maxFlushedSeqId);
4721 
4722         // If we were waiting for observing a flush or region opening event for not showing partial
4723         // data after a secondary region crash, we can allow reads now.
4724         this.setReadsEnabled(true);
4725 
4726         // C. Finally notify anyone waiting on memstore to clear:
4727         // e.g. checkResources().
4728         synchronized (this) {
4729           notifyAll(); // FindBugs NN_NAKED_NOTIFY
4730         }
4731       }
4732       logRegionFiles();
4733     } finally {
4734       closeRegionOperation(Operation.REPLAY_EVENT);
4735     }
4736   }
4737 
4738   void replayWALBulkLoadEventMarker(WALProtos.BulkLoadDescriptor bulkLoadEvent) throws IOException {
4739     checkTargetRegion(bulkLoadEvent.getEncodedRegionName().toByteArray(),
4740       "BulkLoad marker from WAL ", bulkLoadEvent);
4741 
4742     if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4743       return; // if primary nothing to do
4744     }
4745 
4746     if (LOG.isDebugEnabled()) {
4747       LOG.debug(getRegionInfo().getEncodedName() + " : "
4748               +  "Replaying bulkload event marker " + TextFormat.shortDebugString(bulkLoadEvent));
4749     }
4750     // check if multiple families involved
4751     boolean multipleFamilies = false;
4752     byte[] family = null;
4753     for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) {
4754       byte[] fam = storeDescriptor.getFamilyName().toByteArray();
4755       if (family == null) {
4756         family = fam;
4757       } else if (!Bytes.equals(family, fam)) {
4758         multipleFamilies = true;
4759         break;
4760       }
4761     }
4762 
4763     startBulkRegionOperation(multipleFamilies);
4764     try {
4765       // we will use writestate as a coarse-grain lock for all the replay events
4766       synchronized (writestate) {
4767         // Replication can deliver events out of order when the primary region moves or the region
4768         // server crashes, since there is no coordination between replication of different wal files
4769         // belonging to different region servers. We have to safeguard against this case by using the
4770         // region open event's seqid. Since this is the first event that the region puts (after
4771         // possibly flushing recovered.edits), after seeing this event, we can ignore every edit
4772         // smaller than this seqId.
4773         if (bulkLoadEvent.getBulkloadSeqNum() >= 0
4774             && this.lastReplayedOpenRegionSeqId >= bulkLoadEvent.getBulkloadSeqNum()) {
4775           LOG.warn(getRegionInfo().getEncodedName() + " : "
4776               + "Skipping replaying bulkload event :"
4777               + TextFormat.shortDebugString(bulkLoadEvent)
4778               + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
4779               + " =" + lastReplayedOpenRegionSeqId);
4780 
4781           return;
4782         }
4783 
4784         for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) {
4785           // stores of primary may be different now
4786           family = storeDescriptor.getFamilyName().toByteArray();
4787           Store store = getStore(family);
4788           if (store == null) {
4789             LOG.warn(getRegionInfo().getEncodedName() + " : "
4790                     + "Received a bulk load marker from primary, but the family is not found. "
4791                     + "Ignoring. StoreDescriptor:" + storeDescriptor);
4792             continue;
4793           }
4794 
4795           List<String> storeFiles = storeDescriptor.getStoreFileList();
4796           for (String storeFile : storeFiles) {
4797             StoreFileInfo storeFileInfo = null;
4798             try {
4799               storeFileInfo = fs.getStoreFileInfo(Bytes.toString(family), storeFile);
4800               store.bulkLoadHFile(storeFileInfo);
4801             } catch(FileNotFoundException ex) {
4802               LOG.warn(getRegionInfo().getEncodedName() + " : "
4803                       + ((storeFileInfo != null) ? storeFileInfo.toString() :
4804                             (new Path(Bytes.toString(family), storeFile)).toString())
4805                       + " doesn't exist any more. Skip loading the file");
4806             }
4807           }
4808         }
4809       }
4810       if (bulkLoadEvent.getBulkloadSeqNum() > 0) {
4811         getMVCC().advanceMemstoreReadPointIfNeeded(bulkLoadEvent.getBulkloadSeqNum());
4812       }
4813     } finally {
4814       closeBulkRegionOperation();
4815     }
4816   }
4817 
4818   /**
4819    * If all stores ended up dropping their snapshots, we can safely drop the prepareFlushResult
4820    */
4821   private void dropPrepareFlushIfPossible() {
4822     if (writestate.flushing) {
4823       boolean canDrop = true;
4824       if (prepareFlushResult.storeFlushCtxs != null) {
4825         for (Entry<byte[], StoreFlushContext> entry
4826             : prepareFlushResult.storeFlushCtxs.entrySet()) {
4827           Store store = getStore(entry.getKey());
4828           if (store == null) {
4829             continue;
4830           }
4831           if (store.getSnapshotSize() > 0) {
4832             canDrop = false;
4833             break;
4834           }
4835         }
4836       }
4837 
4838       // this means that all the stores in the region have finished flushing, but the WAL marker
4839       // may not have been written or we have not received it yet.
4840       if (canDrop) {
4841         writestate.flushing = false;
4842         this.prepareFlushResult = null;
4843       }
4844     }
4845   }
4846 
4847   /**
4848    * Checks the underlying store files, opens the files that have not yet
4849    * been opened, and removes the store file readers for store files that are no longer
4850    * available. Mainly used by secondary region replicas to keep up to date with
4851    * the primary region files or open new flushed files and drop their memstore snapshots in case
4852    * of memory pressure.
4853    * @throws IOException
4854    */
4855   boolean refreshStoreFiles() throws IOException {
4856     if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4857       return false; // if primary nothing to do
4858     }
4859 
4860     if (LOG.isDebugEnabled()) {
4861       LOG.debug(getRegionInfo().getEncodedName() + " : "
4862           + "Refreshing store files to see whether we can free up memstore");
4863     }
4864 
4865     long totalFreedSize = 0;
4866 
4867     long smallestSeqIdInStores = Long.MAX_VALUE;
4868 
4869     startRegionOperation(); // obtain region close lock
4870     try {
4871       synchronized (writestate) {
4872         for (Store store : getStores().values()) {
4873           // TODO: some stores might see new data from flush, while others do not, which
4874           // MIGHT break atomic edits across column families.
4875           long maxSeqIdBefore = store.getMaxSequenceId();
4876 
4877           // refresh the store files. This is similar to observing a region open wal marker.
4878           store.refreshStoreFiles();
4879 
4880           long storeSeqId = store.getMaxSequenceId();
4881           if (storeSeqId < smallestSeqIdInStores) {
4882             smallestSeqIdInStores = storeSeqId;
4883           }
4884 
4885           // see whether we can drop the memstore or the snapshot
4886           if (storeSeqId > maxSeqIdBefore) {
4887 
4888             if (writestate.flushing) {
4889               // only drop memstore snapshots if they are smaller than last flush for the store
4890               if (this.prepareFlushResult.flushOpSeqId <= storeSeqId) {
4891                 StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null ?
4892                     null : this.prepareFlushResult.storeFlushCtxs.get(store.getFamily().getName());
4893                 if (ctx != null) {
4894                   long snapshotSize = store.getFlushableSize();
4895                   ctx.abort();
4896                   this.addAndGetGlobalMemstoreSize(-snapshotSize);
4897                   this.prepareFlushResult.storeFlushCtxs.remove(store.getFamily().getName());
4898                   totalFreedSize += snapshotSize;
4899                 }
4900               }
4901             }
4902 
4903             // Drop the memstore contents if they are now smaller than the latest seen flushed file
4904             totalFreedSize += dropMemstoreContentsForSeqId(storeSeqId, store);
4905           }
4906         }
4907 
4908         // if all stores ended up dropping their snapshots, we can safely drop the
4909         // prepareFlushResult
4910         dropPrepareFlushIfPossible();
4911 
4912         // advance the mvcc read point so that the new flushed files are visible.
4913         // there may be some in-flight transactions, but they won't be made visible since they are
4914         // either greater than flush seq number or they were already picked up via flush.
4915         for (Store s : getStores().values()) {
4916           getMVCC().advanceMemstoreReadPointIfNeeded(s.getMaxMemstoreTS());
4917         }
4918 
4919         // smallestSeqIdInStores is the seqId that we have a corresponding hfile for. We can safely
4920         // skip all edits that are to be replayed in the future that have a smaller seqId
4921         // than this. We are updating lastReplayedOpenRegionSeqId so that we can skip all edits
4922         // for which we have already picked up the flush files.
4923         if (this.lastReplayedOpenRegionSeqId < smallestSeqIdInStores) {
4924           this.lastReplayedOpenRegionSeqId = smallestSeqIdInStores;
4925         }
4926       }
4927       // C. Finally notify anyone waiting on memstore to clear:
4928       // e.g. checkResources().
4929       synchronized (this) {
4930         notifyAll(); // FindBugs NN_NAKED_NOTIFY
4931       }
4932       return totalFreedSize > 0;
4933     } finally {
4934       closeRegionOperation();
4935     }
4936   }
4937 
4938   private void logRegionFiles() {
4939     if (LOG.isTraceEnabled()) {
4940       LOG.trace(getRegionInfo().getEncodedName() + " : Store files for region: ");
4941       for (Store s : stores.values()) {
4942         for (StoreFile sf : s.getStorefiles()) {
4943           LOG.trace(getRegionInfo().getEncodedName() + " : " + sf);
4944         }
4945       }
4946     }
4947   }
4948 
4949   /** Checks whether the given regionName is either equal to our region's name, or refers to
4950    * the primary region of the range served by this secondary replica.
4951    */
4952   private void checkTargetRegion(byte[] encodedRegionName, String exceptionMsg, Object payload)
4953       throws WrongRegionException {
4954     if (Bytes.equals(this.getRegionInfo().getEncodedNameAsBytes(), encodedRegionName)) {
4955       return;
4956     }
4957 
4958     if (!RegionReplicaUtil.isDefaultReplica(this.getRegionInfo()) &&
4959         Bytes.equals(encodedRegionName,
4960           this.fs.getRegionInfoForFS().getEncodedNameAsBytes())) {
4961       return;
4962     }
4963 
4964     throw new WrongRegionException(exceptionMsg + payload
4965       + " targeted for region " + Bytes.toStringBinary(encodedRegionName)
4966       + " does not match this region: " + this.getRegionInfo());
4967   }
4968 
4969   /**
4970    * Restores a single edit into the given store's memstore during replay. Also used by tests.
4971    * @param s Store to add the edit to.
4972    * @param cell Cell to add.
4973    * @return True if we should flush.
4974    */
4975   protected boolean restoreEdit(final Store s, final Cell cell) {
4976     long kvSize = s.add(cell).getFirst();
4977     if (this.rsAccounting != null) {
4978       rsAccounting.addAndGetRegionReplayEditsSize(this.getRegionName(), kvSize);
4979     }
4980     return isFlushSize(this.addAndGetGlobalMemstoreSize(kvSize));
4981   }
4982 
4983   /*
4984    * @param fs
4985    * @param p File to check.
4986    * @return True if file was zero-length (and if so, we'll delete it in here).
4987    * @throws IOException
4988    */
4989   private static boolean isZeroLengthThenDelete(final FileSystem fs, final Path p)
4990       throws IOException {
4991     FileStatus stat = fs.getFileStatus(p);
4992     if (stat.getLen() > 0) return false;
4993     LOG.warn("File " + p + " is zero-length, deleting.");
4994     fs.delete(p, false);
4995     return true;
4996   }
4997 
4998   protected HStore instantiateHStore(final HColumnDescriptor family) throws IOException {
4999     return new HStore(this, family, this.conf);
5000   }
5001 
5002   /**
5003    * Return HStore instance.
5004    * Use with caution.  Exposed for use of fixup utilities.
5005    * @param column Name of column family hosted by this region.
5006    * @return Store that goes with the family on passed <code>column</code>.
5007    * TODO: Make this lookup faster.
5008    */
5009   public Store getStore(final byte[] column) {
5010     return this.stores.get(column);
5011   }
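  // Illustrative lookup (hypothetical caller code): Store s = region.getStore(Bytes.toBytes("info"));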
5012 
5013   /**
5014    * Return HStore instance. Does not do any copy: as the number of stores is limited, we
5015    *  iterate over them.
5016    */
5017   private Store getStore(Cell cell) {
5018     for (Map.Entry<byte[], Store> famStore : stores.entrySet()) {
5019       if (Bytes.equals(
5020           cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength(),
5021           famStore.getKey(), 0, famStore.getKey().length)) {
5022         return famStore.getValue();
5023       }
5024     }
5025 
5026     return null;
5027   }
5028 
5029   public Map<byte[], Store> getStores() {
5030     return this.stores;
5031   }
5032 
5033   /**
5034    * Return list of storeFiles for the set of CFs.
5035    * Uses closeLock to prevent the race condition where, if a region closes
5036    * in the middle of the for loop (closing the stores one by one), some stores
5037    * would return 0 files.
5038    * @return List of storeFiles.
5039    */
5040   public List<String> getStoreFileList(final byte [][] columns)
5041     throws IllegalArgumentException {
5042     List<String> storeFileNames = new ArrayList<String>();
5043     synchronized(closeLock) {
5044       for(byte[] column : columns) {
5045         Store store = this.stores.get(column);
5046         if (store == null) {
5047           throw new IllegalArgumentException("No column family: " +
5048               Bytes.toString(column) + " available");
5049         }
5050         for (StoreFile storeFile: store.getStorefiles()) {
5051           storeFileNames.add(storeFile.getPath().toString());
5052         }
5053 
5054         logRegionFiles();
5055       }
5056     }
5057     return storeFileNames;
5058   }
5059 
5060   //////////////////////////////////////////////////////////////////////////////
5061   // Support code
5062   //////////////////////////////////////////////////////////////////////////////
5063 
5064   /** Make sure this is a valid row for the HRegion */
5065   void checkRow(final byte [] row, String op) throws IOException {
5066     if (!rowIsInRange(getRegionInfo(), row)) {
5067       throw new WrongRegionException("Requested row out of range for " +
5068           op + " on HRegion " + this + ", startKey='" +
5069           Bytes.toStringBinary(getStartKey()) + "', endKey='" +
5070           Bytes.toStringBinary(getEndKey()) + "', row='" +
5071           Bytes.toStringBinary(row) + "'");
5072     }
5073   }
5074 
5075   /**
5076    * Tries to acquire a lock on the given row.
5077    * @param waitForLock if true, will block until the lock is available.
5078    *        Otherwise, just tries to obtain the lock and returns
5079    *        false if unavailable.
5080    * @return the row lock if acquired,
5081    *   null if waitForLock was false and the lock was not acquired
5082    * @throws IOException if waitForLock was true and the lock could not be acquired after waiting
5083    */
5084   public RowLock getRowLock(byte[] row, boolean waitForLock) throws IOException {
5085     startRegionOperation();
5086     try {
5087       return getRowLockInternal(row, waitForLock);
5088     } finally {
5089       closeRegionOperation();
5090     }
5091   }
5092 
5093   /**
5094    * A version of getRowLock(byte[], boolean) to use when a region operation has already been
5095    * started (the calling thread has already acquired the region-close-guard lock).
5096    */
5097   protected RowLock getRowLockInternal(byte[] row, boolean waitForLock) throws IOException {
5098     checkRow(row, "row lock");
5099     HashedBytes rowKey = new HashedBytes(row);
5100     RowLockContext rowLockContext = new RowLockContext(rowKey);
5101 
5102     // loop until we acquire the row lock (unless !waitForLock)
5103     while (true) {
5104       RowLockContext existingContext = lockedRows.putIfAbsent(rowKey, rowLockContext);
5105       if (existingContext == null) {
5106         // Row is not already locked by any thread, use newly created context.
5107         break;
5108       } else if (existingContext.ownedByCurrentThread()) {
5109         // Row is already locked by current thread, reuse existing context instead.
5110         rowLockContext = existingContext;
5111         break;
5112       } else {
5113         if (!waitForLock) {
5114           return null;
5115         }
5116         TraceScope traceScope = null;
5117         try {
5118           if (Trace.isTracing()) {
5119             traceScope = Trace.startSpan("HRegion.getRowLockInternal");
5120           }
5121           // Row is already locked by some other thread, give up or wait for it
5122           if (!existingContext.latch.await(this.rowLockWaitDuration, TimeUnit.MILLISECONDS)) {
5123             if(traceScope != null) {
5124               traceScope.getSpan().addTimelineAnnotation("Failed to get row lock");
5125             }
5126             throw new IOException("Timed out waiting for lock for row: " + rowKey);
5127           }
5128           if (traceScope != null) traceScope.close();
5129           traceScope = null;
5130         } catch (InterruptedException ie) {
5131           LOG.warn("Thread interrupted waiting for lock on row: " + rowKey);
5132           InterruptedIOException iie = new InterruptedIOException();
5133           iie.initCause(ie);
5134           throw iie;
5135         } finally {
5136           if (traceScope != null) traceScope.close();
5137         }
5138       }
5139     }
5140 
5141     // allocate new lock for this thread
5142     return rowLockContext.newLock();
5143   }
5144 
5145   /**
5146    * Acquires a lock on the given row.
5147    * The same thread may acquire multiple locks on the same row.
5148    * @return the acquired row lock
5149    * @throws IOException if the lock could not be acquired after waiting
5150    */
5151   public RowLock getRowLock(byte[] row) throws IOException {
5152     return getRowLock(row, true);
5153   }
5154 
5155   /**
5156    * If the given list of row locks is not null, releases all locks.
5157    */
5158   public void releaseRowLocks(List<RowLock> rowLocks) {
5159     if (rowLocks != null) {
5160       for (RowLock rowLock : rowLocks) {
5161         rowLock.release();
5162       }
5163       rowLocks.clear();
5164     }
5165   }
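
  // Example (illustrative sketch): the acquire/release pattern for row locks, assuming a fully
  // initialized HRegion named "region" and row keys "rowA" and "rowB" that fall inside the region.
  //
  //   List<RowLock> locks = new ArrayList<RowLock>();
  //   try {
  //     locks.add(region.getRowLock(rowA)); // blocks until the lock is acquired or times out
  //     locks.add(region.getRowLock(rowB)); // the same thread may hold locks on several rows
  //     // ... mutate the locked rows ...
  //   } finally {
  //     region.releaseRowLocks(locks);      // releases every lock and clears the list
  //   }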
5166 
5167   /**
5168    * Determines whether multiple column families are present
5169    * Precondition: familyPaths is not null
5170    *
5171    * @param familyPaths List of Pair<byte[] column family, String hfilePath>
5172    */
5173   private static boolean hasMultipleColumnFamilies(
5174       List<Pair<byte[], String>> familyPaths) {
5175     boolean multipleFamilies = false;
5176     byte[] family = null;
5177     for (Pair<byte[], String> pair : familyPaths) {
5178       byte[] fam = pair.getFirst();
5179       if (family == null) {
5180         family = fam;
5181       } else if (!Bytes.equals(family, fam)) {
5182         multipleFamilies = true;
5183         break;
5184       }
5185     }
5186     return multipleFamilies;
5187   }
5188 
5189   /**
5190    * Bulk load one or more HFiles into this region.
5191    *
5192    * @param familyPaths A list which maps column families to the location of the HFile to load
5193    *                    into that column family region.
5194    * @param assignSeqId Force a flush, get its sequenceId to preserve the guarantee that all the
5195    *                    edits lower than the highest sequential ID from all the HFiles are flushed
5196    *                    on disk.
5197    * @return true if successful, false if failed recoverably
5198    * @throws IOException if failed unrecoverably.
5199    */
5200   public boolean bulkLoadHFiles(List<Pair<byte[], String>> familyPaths,
5201                                 boolean assignSeqId) throws IOException {
5202     return bulkLoadHFiles(familyPaths, assignSeqId, null);
5203   }
5204 
5205   /**
5206    * Attempts to atomically load a group of hfiles.  This is critical for loading
5207    * rows with multiple column families atomically.
5208    *
5209    * @param familyPaths      List of Pair<byte[] column family, String hfilePath>
5210    * @param bulkLoadListener Internal hooks enabling massaging/preparation of a
5211    *                         file about to be bulk loaded
5212    * @param assignSeqId      Force a flush, get its sequenceId to preserve the guarantee that
5213    *                         all the edits lower than the highest sequential ID from all the
5214    *                         HFiles are flushed on disk.
5215    * @return true if successful, false if failed recoverably
5216    * @throws IOException if failed unrecoverably.
5217    */
5218   public boolean bulkLoadHFiles(List<Pair<byte[], String>> familyPaths, boolean assignSeqId,
5219       BulkLoadListener bulkLoadListener) throws IOException {
5220     long seqId = -1;
5221     Map<byte[], List<Path>> storeFiles = new TreeMap<byte[], List<Path>>(Bytes.BYTES_COMPARATOR);
5222     Preconditions.checkNotNull(familyPaths);
5223     // we need writeLock for multi-family bulk load
5224     startBulkRegionOperation(hasMultipleColumnFamilies(familyPaths));
5225     try {
5226       this.writeRequestsCount.increment();
5227 
5228       // A split may have happened between when the split keys were gathered and when the
5229       // HRegion's write lock was taken. We need to validate that each HFile still fits
5230       // within this region before attempting to bulk load any of them.
5231       List<IOException> ioes = new ArrayList<IOException>();
5232       List<Pair<byte[], String>> failures = new ArrayList<Pair<byte[], String>>();
5233       for (Pair<byte[], String> p : familyPaths) {
5234         byte[] familyName = p.getFirst();
5235         String path = p.getSecond();
5236 
5237         Store store = getStore(familyName);
5238         if (store == null) {
5239           IOException ioe = new org.apache.hadoop.hbase.DoNotRetryIOException(
5240               "No such column family " + Bytes.toStringBinary(familyName));
5241           ioes.add(ioe);
5242         } else {
5243           try {
5244             store.assertBulkLoadHFileOk(new Path(path));
5245           } catch (WrongRegionException wre) {
5246             // recoverable (file doesn't fit in region)
5247             failures.add(p);
5248           } catch (IOException ioe) {
5249             // unrecoverable (hdfs problem)
5250             ioes.add(ioe);
5251           }
5252         }
5253       }
5254 
5255       // validation failed because of some sort of IO problem.
5256       if (ioes.size() != 0) {
5257         IOException e = MultipleIOException.createIOException(ioes);
5258         LOG.error("There were one or more IO errors when checking if the bulk load is ok.", e);
5259         throw e;
5260       }
5261 
5262       // validation failed, bail out before doing anything permanent.
5263       if (failures.size() != 0) {
5264         StringBuilder list = new StringBuilder();
5265         for (Pair<byte[], String> p : failures) {
5266           list.append("\n").append(Bytes.toString(p.getFirst())).append(" : ")
5267               .append(p.getSecond());
5268         }
5269         // problem when validating
5270         LOG.warn("There was a recoverable bulk load failure likely due to a" +
5271             " split.  These (family, HFile) pairs were not loaded: " + list);
5272         return false;
5273       }
5274 
5275       // We need to assign a sequential ID that's in between two memstores in order to preserve
5276       // the guarantee that all the edits lower than the highest sequential ID from all the
5277       // HFiles are flushed on disk. See HBASE-10958.  The sequence id returned when we flush is
5278       // guaranteed to be one beyond the file made when we flushed (or if nothing to flush, it is
5279       // a sequence id that we can be sure is beyond the last hfile written).
5280       if (assignSeqId) {
5281         FlushResult fs = this.flushcache();
5282         if (fs.isFlushSucceeded()) {
5283           seqId = fs.flushSequenceId;
5284         } else if (fs.result == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) {
5285           seqId = fs.flushSequenceId;
5286         } else {
5287           throw new IOException("Could not bulk load with an assigned sequential ID because the " +
5288               "flush didn't run. Reason for not flushing: " + fs.failureReason);
5289         }
5290       }
5291 
5292       for (Pair<byte[], String> p : familyPaths) {
5293         byte[] familyName = p.getFirst();
5294         String path = p.getSecond();
5295         Store store = getStore(familyName);
5296         try {
5297           String finalPath = path;
5298           if (bulkLoadListener != null) {
5299             finalPath = bulkLoadListener.prepareBulkLoad(familyName, path);
5300           }
5301           Path committedStoreFile = store.bulkLoadHFile(finalPath, seqId);
5302 
5303           if (storeFiles.containsKey(familyName)) {
5304             storeFiles.get(familyName).add(committedStoreFile);
5305           } else {
5306             List<Path> storeFileNames = new ArrayList<Path>();
5307             storeFileNames.add(committedStoreFile);
5308             storeFiles.put(familyName, storeFileNames);
5309           }
5310           if (bulkLoadListener != null) {
5311             bulkLoadListener.doneBulkLoad(familyName, path);
5312           }
5313         } catch (IOException ioe) {
5314           // A failure here can cause an atomicity violation that we currently
5315           // cannot recover from since it is likely a failed HDFS operation.
5316 
5317           // TODO Need a better story for reverting partial failures due to HDFS.
5318           LOG.error("There was a partial failure due to IO when attempting to" +
5319               " load " + Bytes.toString(p.getFirst()) + " : " + p.getSecond(), ioe);
5320           if (bulkLoadListener != null) {
5321             try {
5322               bulkLoadListener.failedBulkLoad(familyName, path);
5323             } catch (Exception ex) {
5324               LOG.error("Error while calling failedBulkLoad for family " +
5325                   Bytes.toString(familyName) + " with path " + path, ex);
5326             }
5327           }
5328           throw ioe;
5329         }
5330       }
5331 
5332       return true;
5333     } finally {
5334       if (wal != null && !storeFiles.isEmpty()) {
5335         // Write a bulk load event into the WAL for any committed hfiles, even on partial failure
5336         try {
5337           WALProtos.BulkLoadDescriptor loadDescriptor = ProtobufUtil.toBulkLoadDescriptor(
5338               this.getRegionInfo().getTable(),
5339               ByteStringer.wrap(this.getRegionInfo().getEncodedNameAsBytes()), storeFiles, seqId);
5340           WALUtil.writeBulkLoadMarkerAndSync(wal, this.htableDescriptor, getRegionInfo(),
5341               loadDescriptor, sequenceId);
5342         } catch (IOException ioe) {
5343           if (this.rsServices != null) {
5344             // Have to abort the region server because some hfiles have been loaded but we
5345             // can't write the event into the WAL
5346             this.rsServices.abort("Failed to write bulk load event into WAL.", ioe);
5347           }
5348         }
5349       }
5350 
5351       closeBulkRegionOperation();
5352     }
5353   }
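
  // Example (illustrative sketch): bulk loading one staged HFile per column family, assuming an
  // initialized HRegion named "region" and HFiles already written to the hypothetical staging
  // paths shown below.
  //
  //   List<Pair<byte[], String>> paths = new ArrayList<Pair<byte[], String>>();
  //   paths.add(new Pair<byte[], String>(Bytes.toBytes("cf1"), "/staging/cf1/hfile1"));
  //   paths.add(new Pair<byte[], String>(Bytes.toBytes("cf2"), "/staging/cf2/hfile2"));
  //   boolean loaded = region.bulkLoadHFiles(paths, true); // true => flush first and assign a seqId
  //   if (!loaded) {
  //     // Recoverable failure (e.g. a split moved the key range): re-split the HFiles and retry.
  //   }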
5354 
5355   @Override
5356   public boolean equals(Object o) {
5357     return o instanceof HRegion && Bytes.equals(this.getRegionName(),
5358                                                 ((HRegion) o).getRegionName());
5359   }
5360 
5361   @Override
5362   public int hashCode() {
5363     return Bytes.hashCode(this.getRegionName());
5364   }
5365 
5366   @Override
5367   public String toString() {
5368     return this.getRegionNameAsString();
5369   }
5370 
5371   /**
5372    * RegionScannerImpl is used to combine scanners from multiple Stores (aka column families).
5373    */
5374   class RegionScannerImpl implements RegionScanner {
5375     // Package local for testability
5376     KeyValueHeap storeHeap = null;
5377     /** Heap of key-values that are not essential for the provided filters and are thus read
5378      * on demand, if on-demand column family loading is enabled.*/
5379     KeyValueHeap joinedHeap = null;
5380     /**
5381      * If the joined heap data gathering is interrupted due to scan limits, this will
5382      * contain the row for which we are populating the values.*/
5383     protected Cell joinedContinuationRow = null;
5384     protected final byte[] stopRow;
5385     private final FilterWrapper filter;
5386     private int batch;
5387     protected int isScan;
5388     private boolean filterClosed = false;
5389     private long readPt;
5390     private long maxResultSize;
5391     protected HRegion region;
5392 
5393     @Override
5394     public HRegionInfo getRegionInfo() {
5395       return region.getRegionInfo();
5396     }
5397 
5398     RegionScannerImpl(Scan scan, List<KeyValueScanner> additionalScanners, HRegion region)
5399         throws IOException {
5400 
5401       this.region = region;
5402       this.maxResultSize = scan.getMaxResultSize();
5403       if (scan.hasFilter()) {
5404         this.filter = new FilterWrapper(scan.getFilter());
5405       } else {
5406         this.filter = null;
5407       }
5408 
5409       this.batch = scan.getBatch();
5410       if (Bytes.equals(scan.getStopRow(), HConstants.EMPTY_END_ROW) && !scan.isGetScan()) {
5411         this.stopRow = null;
5412       } else {
5413         this.stopRow = scan.getStopRow();
5414       }
5415       // If we are doing a get, we want to be [startRow,endRow] normally
5416       // it is [startRow,endRow) and if startRow=endRow we get nothing.
5417       this.isScan = scan.isGetScan() ? -1 : 0;
5418 
5419       // synchronize on scannerReadPoints so that nobody calculates
5420       // getSmallestReadPoint, before scannerReadPoints is updated.
5421       IsolationLevel isolationLevel = scan.getIsolationLevel();
5422       synchronized(scannerReadPoints) {
5423         this.readPt = getReadpoint(isolationLevel);
5424         scannerReadPoints.put(this, this.readPt);
5425       }
5426 
5427       // Here we separate all scanners into two lists - scanner that provide data required
5428       // by the filter to operate (scanners list) and all others (joinedScanners list).
5429       List<KeyValueScanner> scanners = new ArrayList<KeyValueScanner>();
5430       List<KeyValueScanner> joinedScanners = new ArrayList<KeyValueScanner>();
5431       if (additionalScanners != null) {
5432         scanners.addAll(additionalScanners);
5433       }
5434 
5435       for (Map.Entry<byte[], NavigableSet<byte[]>> entry :
5436           scan.getFamilyMap().entrySet()) {
5437         Store store = stores.get(entry.getKey());
5438         KeyValueScanner scanner = store.getScanner(scan, entry.getValue(), this.readPt);
5439         if (this.filter == null || !scan.doLoadColumnFamiliesOnDemand()
5440           || this.filter.isFamilyEssential(entry.getKey())) {
5441           scanners.add(scanner);
5442         } else {
5443           joinedScanners.add(scanner);
5444         }
5445       }
5446       initializeKVHeap(scanners, joinedScanners, region);
5447     }
5448 
5449     protected void initializeKVHeap(List<KeyValueScanner> scanners,
5450         List<KeyValueScanner> joinedScanners, HRegion region)
5451         throws IOException {
5452       this.storeHeap = new KeyValueHeap(scanners, region.comparator);
5453       if (!joinedScanners.isEmpty()) {
5454         this.joinedHeap = new KeyValueHeap(joinedScanners, region.comparator);
5455       }
5456     }
5457 
5458     @Override
5459     public long getMaxResultSize() {
5460       return maxResultSize;
5461     }
5462 
5463     @Override
5464     public long getMvccReadPoint() {
5465       return this.readPt;
5466     }
5467 
5468     @Override
5469     public int getBatch() {
5470       return this.batch;
5471     }
5472 
5473     /**
5474      * Reset the filter, if one is set.
5475      *
5476      * @throws IOException in case a filter raises an I/O exception.
5477      */
5478     protected void resetFilters() throws IOException {
5479       if (filter != null) {
5480         filter.reset();
5481       }
5482     }
5483 
5484     @Override
5485     public NextState next(List<Cell> outResults)
5486         throws IOException {
5487       // apply the batching limit by default
5488       return next(outResults, batch);
5489     }
5490 
5491     @Override
5492     public NextState next(List<Cell> outResults, int limit) throws IOException {
5493       return next(outResults, limit, -1);
5494     }
5495 
5496     @Override
5497     public synchronized NextState next(List<Cell> outResults, int limit, long remainingResultSize)
5498         throws IOException {
5499       if (this.filterClosed) {
5500         throw new UnknownScannerException("Scanner was closed (timed out?) " +
5501             "after we renewed it. Could be caused by a very slow scanner " +
5502             "or a lengthy garbage collection");
5503       }
5504       startRegionOperation(Operation.SCAN);
5505       readRequestsCount.increment();
5506       try {
5507         return nextRaw(outResults, limit, remainingResultSize);
5508       } finally {
5509         closeRegionOperation(Operation.SCAN);
5510       }
5511     }
5512 
5513     @Override
5514     public NextState nextRaw(List<Cell> outResults) throws IOException {
5515       return nextRaw(outResults, batch);
5516     }
5517 
5518     @Override
5519     public NextState nextRaw(List<Cell> outResults, int limit)
5520         throws IOException {
5521       return nextRaw(outResults, limit, -1);
5522     }
5523 
5524     @Override
5525     public NextState nextRaw(List<Cell> outResults, int batchLimit, long remainingResultSize)
5526         throws IOException {
5527       if (storeHeap == null) {
5528         // scanner is closed
5529         throw new UnknownScannerException("Scanner was closed");
5530       }
5531       NextState state;
5532       if (outResults.isEmpty()) {
5533         // Usually outResults is empty. This is true when next is called
5534         // to handle a scan or get operation.
5535         state = nextInternal(outResults, batchLimit, remainingResultSize);
5536       } else {
5537         List<Cell> tmpList = new ArrayList<Cell>();
5538         state = nextInternal(tmpList, batchLimit, remainingResultSize);
5539         outResults.addAll(tmpList);
5540       }
5541       // State should never be null, this is a precautionary measure
5542       if (state == null) {
5543         if (LOG.isTraceEnabled()) LOG.trace("State was null. Defaulting to no more values state");
5544         state = NextState.makeState(NextState.State.NO_MORE_VALUES);
5545       }
5546 
5547       resetFilters();
5548       if (isFilterDoneInternal()) {
5549         state = NextState.makeState(NextState.State.NO_MORE_VALUES, state.getResultSize());
5550       }
5551       return state;
5552     }
5553 
5554     /**
5555      * @return the state the joinedHeap returned on the call to
5556      *         {@link KeyValueHeap#next(List, int, long)}
5557      */
5558     private NextState populateFromJoinedHeap(List<Cell> results, int limit, long resultSize)
5559             throws IOException {
5560       assert joinedContinuationRow != null;
5561       NextState state =
5562           populateResult(results, this.joinedHeap, limit, resultSize,
5563           joinedContinuationRow.getRowArray(), joinedContinuationRow.getRowOffset(),
5564           joinedContinuationRow.getRowLength());
5565       if (state != null && !state.batchLimitReached() && !state.sizeLimitReached()) {
5566         // We are done with this row, reset the continuation.
5567         joinedContinuationRow = null;
5568       }
5569       // As the data is obtained from two independent heaps, we need to
5570       // ensure that result list is sorted, because Result relies on that.
5571       Collections.sort(results, comparator);
5572       return state;
5573     }
5574 
5575     /**
5576      * Fetches records for currentRow into the results list, until the next row, batchLimit
5577      * (if not -1), or remainingResultSize (if not -1) is reached.
5578      * @param heap KeyValueHeap to fetch data from. It must be positioned on the correct row
5579      *          before the call.
5580      * @param remainingResultSize The remaining result-size budget; a negative value means no limit.
5581      * @param batchLimit Max amount of KVs to place in result list, -1 means no limit.
5582      * @param currentRow Byte array with key we are fetching.
5583      * @param offset offset for currentRow
5584      * @param length length for currentRow
5585      * @return state of last call to {@link KeyValueHeap#next()}
5586      */
5587     private NextState populateResult(List<Cell> results, KeyValueHeap heap, int batchLimit,
5588         long remainingResultSize, byte[] currentRow, int offset, short length) throws IOException {
5589       Cell nextKv;
5590       boolean moreCellsInRow = false;
5591       long accumulatedResultSize = 0;
5592       List<Cell> tmpResults = new ArrayList<Cell>();
5593       do {
5594         int remainingBatchLimit = batchLimit - results.size();
5595         NextState heapState =
5596             heap.next(tmpResults, remainingBatchLimit, remainingResultSize - accumulatedResultSize);
5597         results.addAll(tmpResults);
5598         accumulatedResultSize += calculateResultSize(tmpResults, heapState);
5599         tmpResults.clear();
5600 
5601         if (batchLimit > 0 && results.size() == batchLimit) {
5602           return NextState.makeState(NextState.State.BATCH_LIMIT_REACHED, accumulatedResultSize);
5603         }
5604 
5605         nextKv = heap.peek();
5606         moreCellsInRow = moreCellsInRow(nextKv, currentRow, offset, length);
5607         boolean sizeLimitReached =
5608             remainingResultSize > 0 && accumulatedResultSize >= remainingResultSize;
5609         if (moreCellsInRow && sizeLimitReached) {
5610           return NextState.makeState(NextState.State.SIZE_LIMIT_REACHED, accumulatedResultSize);
5611         }
5612       } while (moreCellsInRow);
5613 
5614       if (nextKv != null) {
5615         return NextState.makeState(NextState.State.MORE_VALUES, accumulatedResultSize);
5616       } else {
5617         return NextState.makeState(NextState.State.NO_MORE_VALUES, accumulatedResultSize);
5618       }
5619     }
5620 
5621     /**
5622      * Based on the nextKv in the heap, and the current row, decide whether or not there are more
5623      * cells to be read in the heap. If the row of the nextKv in the heap matches the current row
5624      * then there are more cells to be read in the row.
5625      * @param nextKv
5626      * @param currentRow
5627      * @param offset
5628      * @param length
5629      * @return true When there are more cells in the row to be read
5630      */
5631     private boolean moreCellsInRow(final Cell nextKv, byte[] currentRow, int offset,
5632         short length) {
5633       return nextKv != null && CellUtil.matchingRow(nextKv, currentRow, offset, length);
5634     }
5635 
5636     /**
5637      * Calculates the size of the results. If the state of the scanner that these results came from
5638      * indicates that an estimate of the result size has already been generated, we can skip the
5639      * calculation and use that instead.
5640      * @param results List of cells we want to calculate size of
5641      * @param state The state returned from the scanner that generated these results
5642      * @return aggregate size of results
5643      */
5644     private long calculateResultSize(List<Cell> results, NextState state) {
5645       if (results == null || results.isEmpty()) return 0;
5646 
5647       // In general, the state should contain the estimate because the result size is used to
5648       // determine when the scan has exceeded its size limit. If the estimate is contained in the
5649       // state then we can avoid an unnecessary calculation.
5650       if (state != null && state.hasResultSizeEstimate()) return state.getResultSize();
5651 
5652       long size = 0;
5653       for (Cell c : results) {
5654         size += CellUtil.estimatedHeapSizeOf(c);
5655       }
5656 
5657       return size;
5658     }
5659 
5660     /*
5661      * @return True if a filter has ruled that this scanner is done.
5662      */
5663     @Override
5664     public synchronized boolean isFilterDone() throws IOException {
5665       return isFilterDoneInternal();
5666     }
5667 
5668     private boolean isFilterDoneInternal() throws IOException {
5669       return this.filter != null && this.filter.filterAllRemaining();
5670     }
5671 
5672     private NextState nextInternal(List<Cell> results, int batchLimit, long remainingResultSize)
5673         throws IOException {
5674       if (!results.isEmpty()) {
5675         throw new IllegalArgumentException("First parameter should be an empty list");
5676       }
5677       // Estimate of the size (heap size) of the results returned from this method
5678       long resultSize = 0;
5679       RpcCallContext rpcCall = RpcServer.getCurrentCall();
5680       // The loop here is used only when at some point during the next we determine
5681       // that due to effects of filters or otherwise, we have an empty row in the result.
5682       // Then we loop and try again. Otherwise, we must get out on the first iteration via return:
5683       // MORE_VALUES if there's more data to read, NO_MORE_VALUES if there isn't (storeHeap is at a
5684       // stop row, and joinedHeap has no more data to read for the last row, joinedContinuationRow).
5685       while (true) {
5686         if (rpcCall != null) {
5687           // If a user specifies a too-restrictive or too-slow scanner, the
5688           // client might time out and disconnect while the server side
5689           // is still processing the request. We should abort aggressively
5690           // in that case.
5691           long afterTime = rpcCall.disconnectSince();
5692           if (afterTime >= 0) {
5693             throw new CallerDisconnectedException(
5694                 "Aborting on region " + getRegionNameAsString() + ", call " +
5695                     this + " after " + afterTime + " ms, since " +
5696                     "caller disconnected");
5697           }
5698         }
5699 
5700         // Let's see what we have in the storeHeap.
5701         Cell current = this.storeHeap.peek();
5702 
5703         byte[] currentRow = null;
5704         int offset = 0;
5705         short length = 0;
5706         if (current != null) {
5707           currentRow = current.getRowArray();
5708           offset = current.getRowOffset();
5709           length = current.getRowLength();
5710         }
5711 
5712         boolean stopRow = isStopRow(currentRow, offset, length);
5713         boolean hasFilterRow = this.filter != null && this.filter.hasFilterRow();
5714 
5715         // If filter#hasFilterRow is true, partial results are not allowed since allowing them
5716         // would prevent the filters from being evaluated. Thus, if it is true, change the
5717         // remainingResultSize to -1 so that the entire row's worth of cells are fetched.
5718         if (hasFilterRow && remainingResultSize > 0) {
5719           if (LOG.isTraceEnabled()) {
5720             LOG.trace("filter#hasFilterRow is true which prevents partial results from being" +
5721                 " formed. The remainingResultSize of: " + remainingResultSize + " will not" +
5722                 " be considered when fetching the cells for this row.");
5723           }
5724           remainingResultSize = -1;
5725         }
5726 
5727         NextState joinedHeapState;
5728         // Check if we were getting data from the joinedHeap and hit the limit.
5729         // If not, then it's main path - getting results from storeHeap.
5730         if (joinedContinuationRow == null) {
5731           // First, check if we are at a stop row. If so, there are no more results.
5732           if (stopRow) {
5733             if (hasFilterRow) {
5734               filter.filterRowCells(results);
5735             }
5736             return NextState.makeState(NextState.State.NO_MORE_VALUES, resultSize);
5737           }
5738 
5739           // Check if rowkey filter wants to exclude this row. If so, loop to next.
5740           // Technically, if we hit limits before on this row, we don't need this call.
5741           if (filterRowKey(currentRow, offset, length)) {
5742             boolean moreRows = nextRow(currentRow, offset, length);
5743             if (!moreRows) return NextState.makeState(NextState.State.NO_MORE_VALUES, resultSize);
5744             results.clear();
5745             continue;
5746           }
5747 
5748           NextState storeHeapState =
5749               populateResult(results, this.storeHeap, batchLimit, remainingResultSize, currentRow,
5750                 offset, length);
5751           resultSize += calculateResultSize(results, storeHeapState);
5752           // Invalid states should never be returned. If one is seen, throw exception
5753           // since we have no way of telling how we should proceed
5754           if (!NextState.isValidState(storeHeapState)) {
5755             throw new IOException("NextState returned from call storeHeap was invalid");
5756           }
5757 
5758           // Ok, we are good, let's try to get some results from the main heap.
5759           if (storeHeapState.batchLimitReached()) {
5760             if (hasFilterRow) {
5761               throw new IncompatibleFilterException(
5762                 "Filter whose hasFilterRow() returns true is incompatible with scan with limit!");
5763             }
5764             // We hit the batch limit.
5765             return NextState.makeState(NextState.State.BATCH_LIMIT_REACHED, resultSize);
5766           } else if (storeHeapState.sizeLimitReached()) {
5767             if (hasFilterRow) {
5768               // We try to guard against this case above when remainingResultSize is set to -1 if
5769              // hasFilterRow is true. In the event that the guard doesn't work, an exception must be
5770               // thrown
5771               throw new IncompatibleFilterException(
5772                  "Filter whose hasFilterRow() returns true is incompatible with scans that"
5773                       + " return partial results");
5774             }
5775             // We hit the size limit.
5776             return NextState.makeState(NextState.State.SIZE_LIMIT_REACHED, resultSize);
5777           }
5778           Cell nextKv = this.storeHeap.peek();
5779           stopRow = nextKv == null ||
5780               isStopRow(nextKv.getRowArray(), nextKv.getRowOffset(), nextKv.getRowLength());
5781           // save that the row was empty before filters applied to it.
5782           final boolean isEmptyRow = results.isEmpty();
5783 
5784           // We have the part of the row necessary for filtering (all of it, usually).
5785           // First filter with the filterRow(List).
5786           FilterWrapper.FilterRowRetCode ret = FilterWrapper.FilterRowRetCode.NOT_CALLED;
5787           if (hasFilterRow) {
5788             ret = filter.filterRowCellsWithRet(results);
5789           }
5790 
5791           if ((isEmptyRow || ret == FilterWrapper.FilterRowRetCode.EXCLUDE) || filterRow()) {
5792             results.clear();
5793             boolean moreRows = nextRow(currentRow, offset, length);
5794             if (!moreRows) return NextState.makeState(NextState.State.NO_MORE_VALUES, 0);
5795 
5796             // This row was totally filtered out, if this is NOT the last row,
5797             // we should continue on. Otherwise, nothing else to do.
5798             if (!stopRow) continue;
5799             return NextState.makeState(NextState.State.NO_MORE_VALUES, 0);
5800           }
5801 
5802           // Ok, we are done with storeHeap for this row.
5803           // Now we may need to fetch additional, non-essential data into row.
5804           // These values are not needed for filter to work, so we postpone their
5805           // fetch to (possibly) reduce amount of data loads from disk.
5806           if (this.joinedHeap != null) {
5807             Cell nextJoinedKv = joinedHeap.peek();
5808             // If joinedHeap is pointing to some other row, try to seek to a correct one.
5809             boolean mayHaveData = (nextJoinedKv != null && CellUtil.matchingRow(nextJoinedKv,
5810                 currentRow, offset, length))
5811                 || (this.joinedHeap.requestSeek(
5812                     KeyValueUtil.createFirstOnRow(currentRow, offset, length), true, true)
5813                     && joinedHeap.peek() != null && CellUtil.matchingRow(joinedHeap.peek(),
5814                     currentRow, offset, length));
5815             if (mayHaveData) {
5816               joinedContinuationRow = current;
5817               joinedHeapState =
5818                   populateFromJoinedHeap(results, batchLimit, remainingResultSize - resultSize);
5819               resultSize +=
5820                   joinedHeapState != null && joinedHeapState.hasResultSizeEstimate() ?
5821                       joinedHeapState.getResultSize() : 0;
5822               if (joinedHeapState != null && joinedHeapState.sizeLimitReached()) {
5823                 return NextState.makeState(NextState.State.SIZE_LIMIT_REACHED, resultSize);
5824               }
5825             }
5826           }
5827         } else {
5828           // Populating from the joined heap was stopped by limits, populate some more.
5829           joinedHeapState =
5830               populateFromJoinedHeap(results, batchLimit, remainingResultSize - resultSize);
5831           resultSize +=
5832               joinedHeapState != null && joinedHeapState.hasResultSizeEstimate() ?
5833                   joinedHeapState.getResultSize() : 0;
5834           if (joinedHeapState != null && joinedHeapState.sizeLimitReached()) {
5835             return NextState.makeState(NextState.State.SIZE_LIMIT_REACHED, resultSize);
5836           }
5837         }
5838         // We may have just called populateFromJoinedHeap and hit the limits. If that is
5839         // the case, we need to call it again on the next next() invocation.
5840         if (joinedContinuationRow != null) {
5841           return NextState.makeState(NextState.State.MORE_VALUES, resultSize);
5842         }
5843 
5844         // Finally, we are done with both joinedHeap and storeHeap.
5845         // Double check to prevent empty rows from appearing in result. It could be
5846         // the case when SingleColumnValueExcludeFilter is used.
5847         if (results.isEmpty()) {
5848           boolean moreRows = nextRow(currentRow, offset, length);
5849           if (!moreRows) return NextState.makeState(NextState.State.NO_MORE_VALUES, 0);
5850           if (!stopRow) continue;
5851         }
5852 
5853         // We are done. Return the result.
5854         if (stopRow) {
5855           return NextState.makeState(NextState.State.NO_MORE_VALUES, resultSize);
5856         } else {
5857           return NextState.makeState(NextState.State.MORE_VALUES, resultSize);
5858         }
5859       }
5860     }
5861 
5862     /**
5863      * This function is to maintain backward compatibility for 0.94 filters. HBASE-6429 combines
5864      * both filterRow & filterRow(List<KeyValue> kvs) functions. Code written for 0.94 or older may
5865      * not implement hasFilterRow as HBASE-6429 expects, because in 0.94 hasFilterRow() only returns
5866      * true when filterRow(List<KeyValue> kvs) is overridden, not filterRow(). In that case,
5867      * filterRow() will be skipped.
5868      */
5869     private boolean filterRow() throws IOException {
5870       // when hasFilterRow returns true, filter.filterRow() will be called automatically inside
5871       // filterRowCells(List<Cell> kvs) so we skip that scenario here.
5872       return filter != null && (!filter.hasFilterRow())
5873           && filter.filterRow();
5874     }
5875 
5876     private boolean filterRowKey(byte[] row, int offset, short length) throws IOException {
5877       return filter != null
5878           && filter.filterRowKey(row, offset, length);
5879     }
5880 
5881     protected boolean nextRow(byte [] currentRow, int offset, short length) throws IOException {
5882       assert this.joinedContinuationRow == null: "Trying to go to next row during joinedHeap read.";
5883       Cell next;
5884       while ((next = this.storeHeap.peek()) != null &&
5885              CellUtil.matchingRow(next, currentRow, offset, length)) {
5886         this.storeHeap.next(MOCKED_LIST);
5887       }
5888       resetFilters();
5889       // Calling the hook in CP which allows it to do a fast forward
5890       return this.region.getCoprocessorHost() == null
5891           || this.region.getCoprocessorHost()
5892               .postScannerFilterRow(this, currentRow, offset, length);
5893     }
5894 
5895     protected boolean isStopRow(byte[] currentRow, int offset, short length) {
5896       return currentRow == null ||
5897           (stopRow != null &&
5898           comparator.compareRows(stopRow, 0, stopRow.length,
5899             currentRow, offset, length) <= isScan);
5900     }
5901 
5902     @Override
5903     public synchronized void close() {
5904       if (storeHeap != null) {
5905         storeHeap.close();
5906         storeHeap = null;
5907       }
5908       if (joinedHeap != null) {
5909         joinedHeap.close();
5910         joinedHeap = null;
5911       }
5912       // no need to synchronize here.
5913       scannerReadPoints.remove(this);
5914       this.filterClosed = true;
5915     }
5916 
5917     KeyValueHeap getStoreHeapForTesting() {
5918       return storeHeap;
5919     }
5920 
5921     @Override
5922     public synchronized boolean reseek(byte[] row) throws IOException {
5923       if (row == null) {
5924         throw new IllegalArgumentException("Row cannot be null.");
5925       }
5926       boolean result = false;
5927       startRegionOperation();
5928       try {
5929         KeyValue kv = KeyValueUtil.createFirstOnRow(row);
5930         // use request seek to make use of the lazy seek option. See HBASE-5520
5931         result = this.storeHeap.requestSeek(kv, true, true);
5932         if (this.joinedHeap != null) {
5933           result = this.joinedHeap.requestSeek(kv, true, true) || result;
5934         }
5935       } finally {
5936         closeRegionOperation();
5937       }
5938       return result;
5939     }
5940   }
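
  // Example (illustrative sketch): scanning a region directly, assuming an initialized HRegion
  // named "region"; HRegion#getScanner(Scan), defined elsewhere in this class, returns a
  // RegionScannerImpl. The NextState.hasMoreValues(...) check is an assumption about how callers
  // interpret the returned state.
  //
  //   RegionScanner scanner = region.getScanner(new Scan());
  //   try {
  //     List<Cell> cells = new ArrayList<Cell>();
  //     boolean moreRows;
  //     do {
  //       moreRows = NextState.hasMoreValues(scanner.next(cells)); // one row (or batch) per call
  //       // ... process "cells" ...
  //       cells.clear();
  //     } while (moreRows);
  //   } finally {
  //     scanner.close();
  //   }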
5941 
5942   // Utility methods
5943   /**
5944    * A utility method to create new instances of HRegion based on the
5945    * {@link HConstants#REGION_IMPL} configuration property.
5946    * @param tableDir qualified path of directory where region should be located,
5947    * usually the table directory.
5948    * @param wal The WAL is the outbound log for any updates to the HRegion
5949    * The wal file is a logfile from the previous execution that's
5950    * custom-computed for this HRegion. The HRegionServer computes and sorts the
5951    * appropriate wal info for this HRegion. If there is a previous file
5952    * (implying that the HRegion has been written-to before), then read it from
5953    * the supplied path.
5954    * @param fs is the filesystem.
5955    * @param conf is global configuration settings.
5956    * @param regionInfo - HRegionInfo that describes the region
5958    * @param htd the table descriptor
5959    * @return the new instance
5960    */
5961   static HRegion newHRegion(Path tableDir, WAL wal, FileSystem fs,
5962       Configuration conf, HRegionInfo regionInfo, final HTableDescriptor htd,
5963       RegionServerServices rsServices) {
5964     try {
5965       @SuppressWarnings("unchecked")
5966       Class<? extends HRegion> regionClass =
5967           (Class<? extends HRegion>) conf.getClass(HConstants.REGION_IMPL, HRegion.class);
5968 
5969       Constructor<? extends HRegion> c =
5970           regionClass.getConstructor(Path.class, WAL.class, FileSystem.class,
5971               Configuration.class, HRegionInfo.class, HTableDescriptor.class,
5972               RegionServerServices.class);
5973 
5974       return c.newInstance(tableDir, wal, fs, conf, regionInfo, htd, rsServices);
5975     } catch (Throwable e) {
5976       // todo: what should I throw here?
5977       throw new IllegalStateException("Could not instantiate a region instance.", e);
5978     }
5979   }
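
  // Example (illustrative sketch): plugging in a custom region implementation through the
  // HConstants.REGION_IMPL property, assuming a hypothetical subclass MyCustomRegion that declares
  // the same (Path, WAL, FileSystem, Configuration, HRegionInfo, HTableDescriptor,
  // RegionServerServices) constructor used reflectively above.
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   conf.setClass(HConstants.REGION_IMPL, MyCustomRegion.class, HRegion.class);
  //   // Every subsequent newHRegion(...) call made with this conf instantiates MyCustomRegion.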
5980 
5981   /**
5982    * Convenience method creating new HRegions. Used by createTable.
5983    *
5984    * @param info Info for region to create.
5985    * @param rootDir Root directory for HBase instance
5986    * @param wal shared WAL
5987    * @param initialize - true to initialize the region
5988    * @return new HRegion
5989    * @throws IOException
5990    */
5991   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
5992                                       final Configuration conf,
5993                                       final HTableDescriptor hTableDescriptor,
5994                                       final WAL wal,
5995                                       final boolean initialize)
5996       throws IOException {
5997     LOG.info("creating HRegion " + info.getTable().getNameAsString()
5998         + " HTD == " + hTableDescriptor + " RootDir = " + rootDir);
6000     FileSystem fs = FileSystem.get(conf);
6001     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
6002     HRegionFileSystem.createRegionOnFileSystem(conf, fs, tableDir, info);
6003     HRegion region = HRegion.newHRegion(tableDir,
6004         wal, fs, conf, info, hTableDescriptor, null);
6005     if (initialize) {
6006       // If initializing, set the sequenceId. It is also required by WALPerformanceEvaluation when
6007       // verifying the WALEdits.
6008       region.setSequenceId(region.initialize(null));
6009     }
6010     return region;
6011   }
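
  // Example (illustrative sketch): creating and initializing a brand-new region, assuming an
  // HRegionInfo "hri", its table descriptor "htd", and a shared WAL "wal" already exist.
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   Path rootDir = FSUtils.getRootDir(conf);
  //   HRegion region = HRegion.createHRegion(hri, rootDir, conf, htd, wal, true);
  //   try {
  //     // ... use the freshly initialized region ...
  //   } finally {
  //     region.close();
  //   }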
6012 
6013   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
6014                                       final Configuration conf,
6015                                       final HTableDescriptor hTableDescriptor,
6016                                       final WAL wal)
6017     throws IOException {
6018     return createHRegion(info, rootDir, conf, hTableDescriptor, wal, true);
6019   }
6020 
6021 
6022   /**
6023    * Open a Region.
6024    * @param info Info for region to be opened.
6025    * @param wal WAL for region to use. This method will call
6026    * WAL#setSequenceNumber(long) passing the result of the call to
6027    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6028    * up.  HRegionStore does this every time it opens a new region.
6029    * @return new HRegion
6030    *
6031    * @throws IOException
6032    */
6033   public static HRegion openHRegion(final HRegionInfo info,
6034       final HTableDescriptor htd, final WAL wal,
6035       final Configuration conf)
6036   throws IOException {
6037     return openHRegion(info, htd, wal, conf, null, null);
6038   }
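
  // Example (illustrative sketch): reopening a region that already exists on disk (e.g. from a
  // repair utility or a test), assuming "hri" and "htd" describe it and "wal" is the WAL to use.
  // The code in this class guards every WAL access with a null check, so tests sometimes pass a
  // null WAL, but a real deployment supplies the shared one.
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   HRegion region = HRegion.openHRegion(hri, htd, wal, conf);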
6039 
6040   /**
6041    * Open a Region.
6042    * @param info Info for region to be opened
6043    * @param htd the table descriptor
6044    * @param wal WAL for region to use. This method will call
6045    * WAL#setSequenceNumber(long) passing the result of the call to
6046    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6047    * up.  HRegionStore does this every time it opens a new region.
6048    * @param conf The Configuration object to use.
6049    * @param rsServices An interface we can request flushes against.
6050    * @param reporter An interface we can report progress against.
6051    * @return new HRegion
6052    *
6053    * @throws IOException
6054    */
6055   public static HRegion openHRegion(final HRegionInfo info,
6056     final HTableDescriptor htd, final WAL wal, final Configuration conf,
6057     final RegionServerServices rsServices,
6058     final CancelableProgressable reporter)
6059   throws IOException {
6060     return openHRegion(FSUtils.getRootDir(conf), info, htd, wal, conf, rsServices, reporter);
6061   }
6062 
6063   /**
6064    * Open a Region.
6065    * @param rootDir Root directory for HBase instance
6066    * @param info Info for region to be opened.
6067    * @param htd the table descriptor
6068    * @param wal WAL for region to use. This method will call
6069    * WAL#setSequenceNumber(long) passing the result of the call to
6070    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6071    * up.  HRegionStore does this every time it opens a new region.
6072    * @param conf The Configuration object to use.
6073    * @return new HRegion
6074    * @throws IOException
6075    */
6076   public static HRegion openHRegion(Path rootDir, final HRegionInfo info,
6077       final HTableDescriptor htd, final WAL wal, final Configuration conf)
6078   throws IOException {
6079     return openHRegion(rootDir, info, htd, wal, conf, null, null);
6080   }
6081 
6082   /**
6083    * Open a Region.
6084    * @param rootDir Root directory for HBase instance
6085    * @param info Info for region to be opened.
6086    * @param htd the table descriptor
6087    * @param wal WAL for region to use. This method will call
6088    * WAL#setSequenceNumber(long) passing the result of the call to
6089    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6090    * up.  HRegionStore does this every time it opens a new region.
6091    * @param conf The Configuration object to use.
6092    * @param rsServices An interface we can request flushes against.
6093    * @param reporter An interface we can report progress against.
6094    * @return new HRegion
6095    * @throws IOException
6096    */
6097   public static HRegion openHRegion(final Path rootDir, final HRegionInfo info,
6098       final HTableDescriptor htd, final WAL wal, final Configuration conf,
6099       final RegionServerServices rsServices,
6100       final CancelableProgressable reporter)
6101   throws IOException {
6102     FileSystem fs = null;
6103     if (rsServices != null) {
6104       fs = rsServices.getFileSystem();
6105     }
6106     if (fs == null) {
6107       fs = FileSystem.get(conf);
6108     }
6109     return openHRegion(conf, fs, rootDir, info, htd, wal, rsServices, reporter);
6110   }
6111 
6112   /**
6113    * Open a Region.
6114    * @param conf The Configuration object to use.
6115    * @param fs Filesystem to use
6116    * @param rootDir Root directory for HBase instance
6117    * @param info Info for region to be opened.
6118    * @param htd the table descriptor
6119    * @param wal WAL for region to use. This method will call
6120    * WAL#setSequenceNumber(long) passing the result of the call to
6121    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6122    * up.  HRegionStore does this every time it opens a new region.
6123    * @return new HRegion
6124    * @throws IOException
6125    */
6126   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
6127       final Path rootDir, final HRegionInfo info, final HTableDescriptor htd, final WAL wal)
6128       throws IOException {
6129     return openHRegion(conf, fs, rootDir, info, htd, wal, null, null);
6130   }
6131 
6132   /**
6133    * Open a Region.
6134    * @param conf The Configuration object to use.
6135    * @param fs Filesystem to use
6136    * @param rootDir Root directory for HBase instance
6137    * @param info Info for region to be opened.
6138    * @param htd the table descriptor
6139    * @param wal WAL for region to use. This method will call
6140    * WAL#setSequenceNumber(long) passing the result of the call to
6141    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6142    * up.  HRegionStore does this every time it opens a new region.
6143    * @param rsServices An interface we can request flushes against.
6144    * @param reporter An interface we can report progress against.
6145    * @return new HRegion
6146    * @throws IOException
6147    */
6148   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
6149       final Path rootDir, final HRegionInfo info, final HTableDescriptor htd, final WAL wal,
6150       final RegionServerServices rsServices, final CancelableProgressable reporter)
6151       throws IOException {
6152     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
6153     return openHRegion(conf, fs, rootDir, tableDir, info, htd, wal, rsServices, reporter);
6154   }
6155 
6156   /**
6157    * Open a Region.
6158    * @param conf The Configuration object to use.
6159    * @param fs Filesystem to use
6160    * @param rootDir Root directory for HBase instance
6161    * @param info Info for region to be opened.
6162    * @param htd the table descriptor
6163    * @param wal WAL for region to use. This method will call
6164    * WAL#setSequenceNumber(long) passing the result of the call to
6165    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6166    * up.  HRegionStore does this every time it opens a new region.
6167    * @param rsServices An interface we can request flushes against.
6168    * @param reporter An interface we can report progress against.
6169    * @return new HRegion
6170    * @throws IOException
6171    */
6172   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
6173       final Path rootDir, final Path tableDir, final HRegionInfo info, final HTableDescriptor htd,
6174       final WAL wal, final RegionServerServices rsServices,
6175       final CancelableProgressable reporter)
6176       throws IOException {
6177     if (info == null) throw new NullPointerException("Passed region info is null");
6178     if (LOG.isDebugEnabled()) {
6179       LOG.debug("Opening region: " + info);
6180     }
6181     HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, rsServices);
6182     return r.openHRegion(reporter);
6183   }
6184 
6185 
6186   /**
6187    * Useful when reopening a closed region (normally for unit tests)
6188    * @param other original object
6189    * @param reporter An interface we can report progress against.
6190    * @return new HRegion
6191    * @throws IOException
6192    */
6193   public static HRegion openHRegion(final HRegion other, final CancelableProgressable reporter)
6194       throws IOException {
6195     HRegionFileSystem regionFs = other.getRegionFileSystem();
6196     HRegion r = newHRegion(regionFs.getTableDir(), other.getWAL(), regionFs.getFileSystem(),
6197         other.baseConf, other.getRegionInfo(), other.getTableDesc(), null);
6198     return r.openHRegion(reporter);
6199   }
6200 
6201   /**
6202    * Open HRegion.
6203    * Calls initialize and sets sequenceId.
6204    * @return Returns <code>this</code>
6205    * @throws IOException
6206    */
6207   protected HRegion openHRegion(final CancelableProgressable reporter)
6208   throws IOException {
6209     // Refuse to open the region if we are missing local compression support
6210     checkCompressionCodecs();
6211     // Refuse to open the region if encryption configuration is incorrect or
6212     // codec support is missing
6213     checkEncryption();
6214     // Refuse to open the region if a required class cannot be loaded
6215     checkClassLoading();
6216     this.openSeqNum = initialize(reporter);
6217     this.setSequenceId(openSeqNum);
6218     if (wal != null && getRegionServerServices() != null && !writestate.readOnly
6219         && !isRecovering) {
6220       // Only write the region open event marker to WAL if (1) we are not read-only
6221       // (2) dist log replay is off or we are not recovering. In case region is
6222       // recovering, the open event will be written at setRecovering(false)
6223       writeRegionOpenMarker(wal, openSeqNum);
6224     }
6225     return this;
6226   }
6227 
6228   private void checkCompressionCodecs() throws IOException {
6229     for (HColumnDescriptor fam: this.htableDescriptor.getColumnFamilies()) {
6230       CompressionTest.testCompression(fam.getCompression());
6231       CompressionTest.testCompression(fam.getCompactionCompression());
6232     }
6233   }
6234 
6235   private void checkEncryption() throws IOException {
6236     for (HColumnDescriptor fam: this.htableDescriptor.getColumnFamilies()) {
6237       EncryptionTest.testEncryption(conf, fam.getEncryptionType(), fam.getEncryptionKey());
6238     }
6239   }
6240 
6241   private void checkClassLoading() throws IOException {
6242     RegionSplitPolicy.getSplitPolicyClass(this.htableDescriptor, conf);
6243     RegionCoprocessorHost.testTableCoprocessorAttrs(conf, this.htableDescriptor);
6244   }
6245 
6246   /**
6247    * Create a daughter region given a temp directory with the region data.
6248    * @param hri Spec. for daughter region to open.
6249    * @throws IOException
6250    */
6251   HRegion createDaughterRegionFromSplits(final HRegionInfo hri) throws IOException {
6252     // Move the files from the temporary .splits to the final /table/region directory
6253     fs.commitDaughterRegion(hri);
6254 
6255     // Create the daughter HRegion instance
6256     HRegion r = HRegion.newHRegion(this.fs.getTableDir(), this.getWAL(), fs.getFileSystem(),
6257         this.getBaseConf(), hri, this.getTableDesc(), rsServices);
6258     r.readRequestsCount.set(this.getReadRequestsCount() / 2);
6259     r.writeRequestsCount.set(this.getWriteRequestsCount() / 2);
6260     return r;
6261   }
6262 
6263   /**
6264    * Create a merged region given a temp directory with the region data.
6265    * @param region_b another merging region
6266    * @return merged HRegion
6267    * @throws IOException
6268    */
6269   HRegion createMergedRegionFromMerges(final HRegionInfo mergedRegionInfo,
6270       final HRegion region_b) throws IOException {
6271     HRegion r = HRegion.newHRegion(this.fs.getTableDir(), this.getWAL(),
6272         fs.getFileSystem(), this.getBaseConf(), mergedRegionInfo,
6273         this.getTableDesc(), this.rsServices);
6274     r.readRequestsCount.set(this.getReadRequestsCount()
6275         + region_b.getReadRequestsCount());
6276     r.writeRequestsCount.set(this.getWriteRequestsCount()
6277         + region_b.getWriteRequestsCount());
6279     this.fs.commitMergedRegion(mergedRegionInfo);
6280     return r;
6281   }
6282 
6283   /**
6284    * Inserts a new region's meta information into the passed
6285    * <code>meta</code> region. Used by the HMaster bootstrap code adding
6286    * new table to hbase:meta table.
6287    *
6288    * @param meta hbase:meta HRegion to be updated
6289    * @param r HRegion to add to <code>meta</code>
6290    *
6291    * @throws IOException
6292    */
6293   // TODO remove since only test and merge use this
6294   public static void addRegionToMETA(final HRegion meta, final HRegion r) throws IOException {
6295     meta.checkResources();
6296     // The row key is the region name
6297     byte[] row = r.getRegionName();
6298     final long now = EnvironmentEdgeManager.currentTime();
6299     final List<Cell> cells = new ArrayList<Cell>(2);
6300     cells.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
6301       HConstants.REGIONINFO_QUALIFIER, now,
6302       r.getRegionInfo().toByteArray()));
6303     // Record the version of the meta table in the passed meta region.
6304     cells.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
6305       HConstants.META_VERSION_QUALIFIER, now,
6306       Bytes.toBytes(HConstants.META_VERSION)));
6307     meta.put(row, HConstants.CATALOG_FAMILY, cells);
6308   }
6309 
6310   /**
6311    * Computes the Path of the HRegion
6312    *
6313    * @param tabledir qualified path for table
6314    * @param name ENCODED region name
6315    * @return Path of HRegion directory
6316    */
6317   @Deprecated
6318   public static Path getRegionDir(final Path tabledir, final String name) {
6319     return new Path(tabledir, name);
6320   }
6321 
6322   /**
6323    * Computes the Path of the HRegion
6324    *
6325    * @param rootdir qualified path of HBase root directory
6326    * @param info HRegionInfo for the region
6327    * @return qualified path of region directory
6328    */
6329   @Deprecated
6330   @VisibleForTesting
6331   public static Path getRegionDir(final Path rootdir, final HRegionInfo info) {
6332     return new Path(
6333       FSUtils.getTableDir(rootdir, info.getTable()), info.getEncodedName());
6334   }
6335 
6336   /**
6337    * Determines if the specified row is within the row range of the
6338    * given HRegionInfo
6339    *
6340    * @param info HRegionInfo that specifies the row range
6341    * @param row row to be checked
6342    * @return true if the row is within the range specified by the HRegionInfo
6343    */
6344   public static boolean rowIsInRange(HRegionInfo info, final byte [] row) {
6345     return ((info.getStartKey().length == 0) ||
6346         (Bytes.compareTo(info.getStartKey(), row) <= 0)) &&
6347         ((info.getEndKey().length == 0) ||
6348             (Bytes.compareTo(info.getEndKey(), row) > 0));
6349   }
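  // Worked example (editor's note, not part of the original source): for a region with
  // startKey="bbb" and endKey="ccc", rowIsInRange returns true for rows "bbb" and "cca"
  // but false for "ccc", since the start key is inclusive and the end key exclusive. An
  // empty start or end key means the range is unbounded on that side.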
6350 
6351   /**
6352    * Merge two HRegions.  The regions must be adjacent and must not overlap.
6353    *
6354    * @return new merged HRegion
6355    * @throws IOException
6356    */
6357   public static HRegion mergeAdjacent(final HRegion srcA, final HRegion srcB)
6358   throws IOException {
6359     HRegion a = srcA;
6360     HRegion b = srcB;
6361 
6362     // Make sure that srcA comes first; important for key-ordering during
6363     // write of the merged file.
6364     if (srcA.getStartKey() == null) {
6365       if (srcB.getStartKey() == null) {
6366         throw new IOException("Cannot merge two regions with null start key");
6367       }
6368       // A's start key is null but B's isn't. Assume A comes before B
6369     } else if ((srcB.getStartKey() == null) ||
6370       (Bytes.compareTo(srcA.getStartKey(), srcB.getStartKey()) > 0)) {
6371       a = srcB;
6372       b = srcA;
6373     }
6374 
6375     if (Bytes.compareTo(a.getEndKey(), b.getStartKey()) != 0) {
6376       throw new IOException("Cannot merge non-adjacent regions");
6377     }
6378     return merge(a, b);
6379   }
6380 
6381   /**
6382    * Merge two regions whether they are adjacent or not.
6383    *
6384    * @param a region a
6385    * @param b region b
6386    * @return new merged region
6387    * @throws IOException
6388    */
6389   public static HRegion merge(final HRegion a, final HRegion b) throws IOException {
6390     if (!a.getRegionInfo().getTable().equals(b.getRegionInfo().getTable())) {
6391       throw new IOException("Regions do not belong to the same table");
6392     }
6393 
6394     FileSystem fs = a.getRegionFileSystem().getFileSystem();
6395     // Make sure each region's cache is empty
6396     a.flushcache();
6397     b.flushcache();
6398 
6399     // Compact each region so we only have one store file per family
6400     a.compactStores(true);
6401     if (LOG.isDebugEnabled()) {
6402       LOG.debug("Files for region: " + a);
6403       a.getRegionFileSystem().logFileSystemState(LOG);
6404     }
6405     b.compactStores(true);
6406     if (LOG.isDebugEnabled()) {
6407       LOG.debug("Files for region: " + b);
6408       b.getRegionFileSystem().logFileSystemState(LOG);
6409     }
6410 
6411     RegionMergeTransaction rmt = new RegionMergeTransaction(a, b, true);
6412     if (!rmt.prepare(null)) {
6413       throw new IOException("Unable to merge regions " + a + " and " + b);
6414     }
6415     HRegionInfo mergedRegionInfo = rmt.getMergedRegionInfo();
6416     LOG.info("starting merge of regions: " + a + " and " + b
6417         + " into new region " + mergedRegionInfo.getRegionNameAsString()
6418         + " with start key <"
6419         + Bytes.toStringBinary(mergedRegionInfo.getStartKey())
6420         + "> and end key <"
6421         + Bytes.toStringBinary(mergedRegionInfo.getEndKey()) + ">");
6422     HRegion dstRegion;
6423     try {
6424       dstRegion = rmt.execute(null, null);
6425     } catch (IOException ioe) {
6426       rmt.rollback(null, null);
6427       throw new IOException("Failed merging regions " + a + " and " + b
6428           + "; successfully rolled back", ioe);
6429     }
6430     dstRegion.compactStores(true);
6431 
6432     if (LOG.isDebugEnabled()) {
6433       LOG.debug("Files for new region");
6434       dstRegion.getRegionFileSystem().logFileSystemState(LOG);
6435     }
6436 
6437     if (dstRegion.getRegionFileSystem().hasReferences(dstRegion.getTableDesc())) {
6438       throw new IOException("Merged region " + dstRegion
6439           + " still has references after the compaction, is compaction canceled?");
6440     }
6441 
6442     // Archiving the 'A' region
6443     HFileArchiver.archiveRegion(a.getBaseConf(), fs, a.getRegionInfo());
6444     // Archiving the 'B' region
6445     HFileArchiver.archiveRegion(b.getBaseConf(), fs, b.getRegionInfo());
6446 
6447     LOG.info("merge completed. New region is " + dstRegion);
6448     return dstRegion;
6449   }
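  // Illustrative sketch (editor's addition): these static helpers merge offline HRegion
  // instances, e.g. from an offline merge tool; "regionA" and "regionB" are assumed to be
  // already-opened regions of the same table.
  //
  //   HRegion merged = HRegion.mergeAdjacent(regionA, regionB); // adjacency is verified
  //   HRegion mergedAnyway = HRegion.merge(regionA, regionB);   // adjacency not required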
6450 
6451   //
6452   // HBASE-880
6453   //
6454   /**
6455    * @param get get object
6456    * @return result
6457    * @throws IOException read exceptions
6458    */
6459   public Result get(final Get get) throws IOException {
6460     checkRow(get.getRow(), "Get");
6461     // Verify families are all valid
6462     if (get.hasFamilies()) {
6463       for (byte [] family: get.familySet()) {
6464         checkFamily(family);
6465       }
6466     } else { // Adding all families to scanner
6467       for (byte[] family: this.htableDescriptor.getFamiliesKeys()) {
6468         get.addFamily(family);
6469       }
6470     }
6471     List<Cell> results = get(get, true);
6472     boolean stale = this.getRegionInfo().getReplicaId() != 0;
6473     return Result.create(results, get.isCheckExistenceOnly() ? !results.isEmpty() : null, stale);
6474   }
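  // Illustrative sketch (editor's addition, not part of the original source): typical
  // caller-side use of get(Get); the region instance "region" and the family/qualifier
  // names are assumptions for the example.
  //
  //   Get g = new Get(Bytes.toBytes("row1"));
  //   g.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q1"));
  //   Result result = region.get(g);
  //   byte[] value = result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("q1"));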
6475 
6476   /*
6477    * Do a get based on the get parameter.
6478    * @param withCoprocessor invoke coprocessor or not. We don't want to
6479    * always invoke cp for this method.
6480    */
6481   public List<Cell> get(Get get, boolean withCoprocessor)
6482   throws IOException {
6483 
6484     List<Cell> results = new ArrayList<Cell>();
6485 
6486     // pre-get CP hook
6487     if (withCoprocessor && (coprocessorHost != null)) {
6488        if (coprocessorHost.preGet(get, results)) {
6489          return results;
6490        }
6491     }
6492 
6493     Scan scan = new Scan(get);
6494 
6495     RegionScanner scanner = null;
6496     try {
6497       scanner = getScanner(scan);
6498       scanner.next(results);
6499     } finally {
6500       if (scanner != null)
6501         scanner.close();
6502     }
6503 
6504     // post-get CP hook
6505     if (withCoprocessor && (coprocessorHost != null)) {
6506       coprocessorHost.postGet(get, results);
6507     }
6508 
6509     // do after lock
6510     if (this.metricsRegion != null) {
6511       long totalSize = 0L;
6512       for (Cell cell : results) {
6513         totalSize += CellUtil.estimatedSerializedSizeOf(cell);
6514       }
6515       this.metricsRegion.updateGet(totalSize);
6516     }
6517 
6518     return results;
6519   }
6520 
6521   public void mutateRow(RowMutations rm) throws IOException {
6522     // Don't need nonces here - RowMutations only supports puts and deletes
6523     mutateRowsWithLocks(rm.getMutations(), Collections.singleton(rm.getRow()));
6524   }
6525 
6526   /**
6527    * Perform atomic mutations within the region w/o nonces.
6528    * See {@link #mutateRowsWithLocks(Collection, Collection, long, long)}
6529    */
6530   public void mutateRowsWithLocks(Collection<Mutation> mutations,
6531       Collection<byte[]> rowsToLock) throws IOException {
6532     mutateRowsWithLocks(mutations, rowsToLock, HConstants.NO_NONCE, HConstants.NO_NONCE);
6533   }
6534 
6535   /**
6536    * Perform atomic mutations within the region.
6537    * @param mutations The list of mutations to perform.
6538    * <code>mutations</code> can contain operations for multiple rows.
6539    * Caller has to ensure that all rows are contained in this region.
6540    * @param rowsToLock Rows to lock. If multiple rows are locked, care should be
6541    * taken that <code>rowsToLock</code> is sorted in order to avoid deadlocks.
6542    * @param nonceGroup Optional nonce group of the operation (client Id)
6543    * @param nonce Optional nonce of the operation (unique random id to ensure
6544    * "more idempotence")
6545    * @throws IOException
6546    */
6547   public void mutateRowsWithLocks(Collection<Mutation> mutations,
6548       Collection<byte[]> rowsToLock, long nonceGroup, long nonce) throws IOException {
6549     MultiRowMutationProcessor proc = new MultiRowMutationProcessor(mutations, rowsToLock);
6550     processRowsWithLocks(proc, -1, nonceGroup, nonce);
6551   }
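  // Illustrative sketch (editor's addition): an atomic multi-row update. All rows must lie
  // in this region, and rowsToLock should be sorted to avoid deadlocks; "region" and the
  // family/qualifier names are assumptions for the example.
  //
  //   byte[] rowA = Bytes.toBytes("rowA");
  //   byte[] rowB = Bytes.toBytes("rowB");
  //   Put put = new Put(rowA);
  //   put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("v"));
  //   Delete delete = new Delete(rowB);
  //   region.mutateRowsWithLocks(Arrays.<Mutation>asList(put, delete),
  //       Arrays.asList(rowA, rowB));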
6552 
6553   /**
6554    * @return the current load statistics for the region
6555    */
6556   public ClientProtos.RegionLoadStats getRegionStats() {
6557     if (!regionStatsEnabled) {
6558       return null;
6559     }
6560     ClientProtos.RegionLoadStats.Builder stats = ClientProtos.RegionLoadStats.newBuilder();
6561     stats.setMemstoreLoad((int) (Math.min(100, (this.memstoreSize.get() * 100) / this
6562         .memstoreFlushSize)));
6563     stats.setHeapOccupancy((int) (rsServices.getHeapMemoryManager().getHeapOccupancyPercent() * 100));
6564     return stats.build();
6565   }
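  // Worked example (editor's note): with memstoreFlushSize = 128 MB and 32 MB currently in
  // the memstore, memstoreLoad = min(100, 32 * 100 / 128) = 25, i.e. the memstore is at 25%
  // of its flush threshold.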
6566 
6567   /**
6568    * Performs atomic multiple reads and writes on a given row.
6569    *
6570    * @param processor The object that defines the reads and writes to a row.
6571    * @param nonceGroup Optional nonce group of the operation (client Id)
6572    * @param nonce Optional nonce of the operation (unique random id to ensure "more idempotence")
6573    */
6574   public void processRowsWithLocks(RowProcessor<?,?> processor, long nonceGroup, long nonce)
6575       throws IOException {
6576     processRowsWithLocks(processor, rowProcessorTimeout, nonceGroup, nonce);
6577   }
6578 
6579   /**
6580    * Performs atomic multiple reads and writes on a given row.
6581    *
6582    * @param processor The object that defines the reads and writes to a row.
6583    * @param timeout The timeout of the processor.process() execution
6584    *                Use a negative number to switch off the time bound
6585    * @param nonceGroup Optional nonce group of the operation (client Id)
6586    * @param nonce Optional nonce of the operation (unique random id to ensure "more idempotence")
6587    */
6588   public void processRowsWithLocks(RowProcessor<?,?> processor, long timeout,
6589       long nonceGroup, long nonce) throws IOException {
6590 
6591     for (byte[] row : processor.getRowsToLock()) {
6592       checkRow(row, "processRowsWithLocks");
6593     }
6594     if (!processor.readOnly()) {
6595       checkReadOnly();
6596     }
6597     checkResources();
6598 
6599     startRegionOperation();
6600     WALEdit walEdit = new WALEdit();
6601 
6602     // 1. Run pre-process hook
6603     try {
6604       processor.preProcess(this, walEdit);
6605     } catch (IOException e) {
6606       closeRegionOperation();
6607       throw e;
6608     }
6609     // Short circuit the read only case
6610     if (processor.readOnly()) {
6611       try {
6612         long now = EnvironmentEdgeManager.currentTime();
6613         doProcessRowWithTimeout(
6614             processor, now, this, null, null, timeout);
6615         processor.postProcess(this, walEdit, true);
6616       } finally {
6617         closeRegionOperation();
6618       }
6619       return;
6620     }
6621 
6622     MultiVersionConsistencyControl.WriteEntry writeEntry = null;
6623     boolean locked;
6624     boolean walSyncSuccessful = false;
6625     List<RowLock> acquiredRowLocks;
6626     long addedSize = 0;
6627     List<Mutation> mutations = new ArrayList<Mutation>();
6628     List<Cell> memstoreCells = new ArrayList<Cell>();
6629     Collection<byte[]> rowsToLock = processor.getRowsToLock();
6630     long mvccNum = 0;
6631     WALKey walKey = null;
6632     try {
6633       // 2. Acquire the row lock(s)
6634       acquiredRowLocks = new ArrayList<RowLock>(rowsToLock.size());
6635       for (byte[] row : rowsToLock) {
6636         // Attempt to lock all involved rows, throw if any lock times out
6637         acquiredRowLocks.add(getRowLock(row));
6638       }
6639       // 3. Region lock
6640       lock(this.updatesLock.readLock(), acquiredRowLocks.size() == 0 ? 1 : acquiredRowLocks.size());
6641       locked = true;
6642       // Get a mvcc write number
6643       mvccNum = MultiVersionConsistencyControl.getPreAssignedWriteNumber(this.sequenceId);
6644 
6645       long now = EnvironmentEdgeManager.currentTime();
6646       try {
6647         // 4. Let the processor scan the rows, generate mutations and add
6648         //    waledits
6649         doProcessRowWithTimeout(
6650             processor, now, this, mutations, walEdit, timeout);
6651 
6652         if (!mutations.isEmpty()) {
6653           // 5. Start mvcc transaction
6654           writeEntry = mvcc.beginMemstoreInsertWithSeqNum(mvccNum);
6655           // 6. Call the preBatchMutate hook
6656           processor.preBatchMutate(this, walEdit);
6657           // 7. Apply to memstore
6658           for (Mutation m : mutations) {
6659             // Handle any tag based cell features
6660             rewriteCellTags(m.getFamilyCellMap(), m);
6661 
6662             for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) {
6663               Cell cell = cellScanner.current();
6664               CellUtil.setSequenceId(cell, mvccNum);
6665               Store store = getStore(cell);
6666               if (store == null) {
6667                 checkFamily(CellUtil.cloneFamily(cell));
6668                 // unreachable
6669               }
6670               Pair<Long, Cell> ret = store.add(cell);
6671               addedSize += ret.getFirst();
6672               memstoreCells.add(ret.getSecond());
6673             }
6674           }
6675 
6676           long txid = 0;
6677           // 8. Append no sync
6678           if (!walEdit.isEmpty()) {
6679             // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
6680             walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
6681               this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now,
6682               processor.getClusterIds(), nonceGroup, nonce);
6683             txid = this.wal.append(this.htableDescriptor, this.getRegionInfo(),
6684               walKey, walEdit, getSequenceId(), true, memstoreCells);
6685           }
6686           if(walKey == null){
6687             // since we use wal sequence Id as mvcc, for SKIP_WAL changes we need a "faked" WALEdit
6688             // to get a sequence id assigned which is done by FSWALEntry#stampRegionSequenceId
6689             walKey = this.appendEmptyEdit(this.wal, memstoreCells);
6690           }
6691           // 9. Release region lock
6692           if (locked) {
6693             this.updatesLock.readLock().unlock();
6694             locked = false;
6695           }
6696 
6697           // 10. Release row lock(s)
6698           releaseRowLocks(acquiredRowLocks);
6699 
6700           // 11. Sync edit log
6701           if (txid != 0) {
6702             syncOrDefer(txid, getEffectiveDurability(processor.useDurability()));
6703           }
6704           walSyncSuccessful = true;
6705           // 12. call postBatchMutate hook
6706           processor.postBatchMutate(this);
6707         }
6708       } finally {
6709         if (!mutations.isEmpty() && !walSyncSuccessful) {
6710           LOG.warn("Wal sync failed. Roll back " + mutations.size() +
6711               " memstore keyvalues for row(s):" + StringUtils.byteToHexString(
6712               processor.getRowsToLock().iterator().next()) + "...");
6713           for (Mutation m : mutations) {
6714             for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) {
6715               Cell cell = cellScanner.current();
6716               getStore(cell).rollback(cell);
6717             }
6718           }
6719         }
6720         // 13. Roll mvcc forward
6721         if (writeEntry != null) {
6722           mvcc.completeMemstoreInsertWithSeqNum(writeEntry, walKey);
6723         }
6724         if (locked) {
6725           this.updatesLock.readLock().unlock();
6726         }
6727         // release locks if some were acquired but another timed out
6728         releaseRowLocks(acquiredRowLocks);
6729       }
6730 
6731       // 14. Run post-process hook
6732       processor.postProcess(this, walEdit, walSyncSuccessful);
6733 
6734     } finally {
6735       closeRegionOperation();
6736       if (!mutations.isEmpty() &&
6737           isFlushSize(this.addAndGetGlobalMemstoreSize(addedSize))) {
6738         requestFlush();
6739       }
6740     }
6741   }
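  // Illustrative sketch (editor's addition): processRowsWithLocks is the generic engine
  // behind mutateRowsWithLocks. A minimal caller-side sketch, reusing the
  // MultiRowMutationProcessor used above and assuming "region", "mutations" and "rowsToLock":
  //
  //   RowProcessor<?, ?> proc = new MultiRowMutationProcessor(mutations, rowsToLock);
  //   // bound processor.process() to 60 seconds; a negative timeout disables the bound
  //   region.processRowsWithLocks(proc, 60000, HConstants.NO_NONCE, HConstants.NO_NONCE);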
6742 
6743   private void doProcessRowWithTimeout(final RowProcessor<?,?> processor,
6744                                        final long now,
6745                                        final HRegion region,
6746                                        final List<Mutation> mutations,
6747                                        final WALEdit walEdit,
6748                                        final long timeout) throws IOException {
6749     // Short circuit the no time bound case.
6750     if (timeout < 0) {
6751       try {
6752         processor.process(now, region, mutations, walEdit);
6753       } catch (IOException e) {
6754         LOG.warn("RowProcessor:" + processor.getClass().getName() +
6755             " throws Exception on row(s):" +
6756             Bytes.toStringBinary(
6757               processor.getRowsToLock().iterator().next()) + "...", e);
6758         throw e;
6759       }
6760       return;
6761     }
6762 
6763     // Case with time bound
6764     FutureTask<Void> task =
6765       new FutureTask<Void>(new Callable<Void>() {
6766         @Override
6767         public Void call() throws IOException {
6768           try {
6769             processor.process(now, region, mutations, walEdit);
6770             return null;
6771           } catch (IOException e) {
6772             LOG.warn("RowProcessor:" + processor.getClass().getName() +
6773                 " throws Exception on row(s):" +
6774                 Bytes.toStringBinary(
6775                     processor.getRowsToLock().iterator().next()) + "...", e);
6776             throw e;
6777           }
6778         }
6779       });
6780     rowProcessorExecutor.execute(task);
6781     try {
6782       task.get(timeout, TimeUnit.MILLISECONDS);
6783     } catch (TimeoutException te) {
6784       LOG.error("RowProcessor timeout:" + timeout + " ms on row(s):" +
6785           Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) +
6786           "...");
6787       throw new IOException(te);
6788     } catch (Exception e) {
6789       throw new IOException(e);
6790     }
6791   }
6792 
6793   public Result append(Append append) throws IOException {
6794     return append(append, HConstants.NO_NONCE, HConstants.NO_NONCE);
6795   }
6796 
6797   // TODO: There's a lot of boiler plate code identical to increment.
6798   // We should refactor append and increment as local get-mutate-put
6799   // transactions, so all stores only go through one code path for puts.
6800   /**
6801    * Perform one or more append operations on a row.
6802    *
6803    * @return new keyvalues after the append
6804    * @throws IOException
6805    */
6806   public Result append(Append append, long nonceGroup, long nonce)
6807       throws IOException {
6808     byte[] row = append.getRow();
6809     checkRow(row, "append");
6810     boolean flush = false;
6811     Durability durability = getEffectiveDurability(append.getDurability());
6812     boolean writeToWAL = durability != Durability.SKIP_WAL;
6813     WALEdit walEdits = null;
6814     List<Cell> allKVs = new ArrayList<Cell>(append.size());
6815     Map<Store, List<Cell>> tempMemstore = new HashMap<Store, List<Cell>>();
6816     long size = 0;
6817     long txid = 0;
6818 
6819     checkReadOnly();
6820     checkResources();
6821     // Lock row
6822     startRegionOperation(Operation.APPEND);
6823     this.writeRequestsCount.increment();
6824     long mvccNum = 0;
6825     WriteEntry w = null;
6826     WALKey walKey = null;
6827     RowLock rowLock = null;
6828     List<Cell> memstoreCells = new ArrayList<Cell>();
6829     boolean doRollBackMemstore = false;
6830     try {
6831       rowLock = getRowLock(row);
6832       try {
6833         lock(this.updatesLock.readLock());
6834         try {
6835           // wait for all prior MVCC transactions to finish - while we hold the row lock
6836           // (so that we are guaranteed to see the latest state)
6837           mvcc.waitForPreviousTransactionsComplete();
6838           if (this.coprocessorHost != null) {
6839             Result r = this.coprocessorHost.preAppendAfterRowLock(append);
6840             if(r!= null) {
6841               return r;
6842             }
6843           }
6844           // now start my own transaction
6845           mvccNum = MultiVersionConsistencyControl.getPreAssignedWriteNumber(this.sequenceId);
6846           w = mvcc.beginMemstoreInsertWithSeqNum(mvccNum);
6847           long now = EnvironmentEdgeManager.currentTime();
6848           // Process each family
6849           for (Map.Entry<byte[], List<Cell>> family : append.getFamilyCellMap().entrySet()) {
6850 
6851             Store store = stores.get(family.getKey());
6852             List<Cell> kvs = new ArrayList<Cell>(family.getValue().size());
6853 
6854             // Sort the cells so that they match the order that they
6855             // appear in the Get results. Otherwise, we won't be able to
6856             // find the existing values if the cells are not specified
6857             // in order by the client since cells are in an array list.
6858             Collections.sort(family.getValue(), store.getComparator());
6859             // Get previous values for all columns in this family
6860             Get get = new Get(row);
6861             for (Cell cell : family.getValue()) {
6862               get.addColumn(family.getKey(), CellUtil.cloneQualifier(cell));
6863             }
6864             List<Cell> results = get(get, false);
6865             // Iterate the input columns and update existing values if they were
6866             // found, otherwise add new column initialized to the append value
6867 
6868             // Avoid as much copying as possible. We may need to rewrite and
6869             // consolidate tags. Bytes are only copied once.
6870             // Would be nice if KeyValue had scatter/gather logic
6871             int idx = 0;
6872             for (Cell cell : family.getValue()) {
6873               Cell newCell;
6874               Cell oldCell = null;
6875               if (idx < results.size()
6876                   && CellUtil.matchingQualifier(results.get(idx), cell)) {
6877                 oldCell = results.get(idx);
6878                 long ts = Math.max(now, oldCell.getTimestamp());
6879 
6880                 // Process cell tags
6881                 List<Tag> newTags = new ArrayList<Tag>();
6882 
6883                 // Make a union of the set of tags in the old and new KVs
6884 
6885                 if (oldCell.getTagsLength() > 0) {
6886                   Iterator<Tag> i = CellUtil.tagsIterator(oldCell.getTagsArray(),
6887                     oldCell.getTagsOffset(), oldCell.getTagsLength());
6888                   while (i.hasNext()) {
6889                     newTags.add(i.next());
6890                   }
6891                 }
6892                 if (cell.getTagsLength() > 0) {
6893                   Iterator<Tag> i  = CellUtil.tagsIterator(cell.getTagsArray(),
6894                     cell.getTagsOffset(), cell.getTagsLength());
6895                   while (i.hasNext()) {
6896                     newTags.add(i.next());
6897                   }
6898                 }
6899 
6900                 // Cell TTL handling
6901 
6902                 if (append.getTTL() != Long.MAX_VALUE) {
6903                   // Add the new TTL tag
6904                   newTags.add(new Tag(TagType.TTL_TAG_TYPE, Bytes.toBytes(append.getTTL())));
6905                 }
6906 
6907                 // Rebuild tags
6908                 byte[] tagBytes = Tag.fromList(newTags);
6909 
6910                 // allocate a new cell sized to hold the combined value and tags
6911                 newCell = new KeyValue(row.length, cell.getFamilyLength(),
6912                     cell.getQualifierLength(), ts, KeyValue.Type.Put,
6913                     oldCell.getValueLength() + cell.getValueLength(),
6914                     tagBytes.length);
6915                 // copy in row, family, and qualifier
6916                 System.arraycopy(cell.getRowArray(), cell.getRowOffset(),
6917                   newCell.getRowArray(), newCell.getRowOffset(), cell.getRowLength());
6918                 System.arraycopy(cell.getFamilyArray(), cell.getFamilyOffset(),
6919                   newCell.getFamilyArray(), newCell.getFamilyOffset(),
6920                   cell.getFamilyLength());
6921                 System.arraycopy(cell.getQualifierArray(), cell.getQualifierOffset(),
6922                   newCell.getQualifierArray(), newCell.getQualifierOffset(),
6923                   cell.getQualifierLength());
6924                 // copy in the value
6925                 System.arraycopy(oldCell.getValueArray(), oldCell.getValueOffset(),
6926                   newCell.getValueArray(), newCell.getValueOffset(),
6927                   oldCell.getValueLength());
6928                 System.arraycopy(cell.getValueArray(), cell.getValueOffset(),
6929                   newCell.getValueArray(),
6930                   newCell.getValueOffset() + oldCell.getValueLength(),
6931                   cell.getValueLength());
6932                 // Copy in tag data
6933                 System.arraycopy(tagBytes, 0, newCell.getTagsArray(), newCell.getTagsOffset(),
6934                   tagBytes.length);
6935                 idx++;
6936               } else {
6937                 // Append's KeyValue.Type==Put and ts==HConstants.LATEST_TIMESTAMP
6938                 CellUtil.updateLatestStamp(cell, now);
6939 
6940                 // Cell TTL handling
6941 
6942                 if (append.getTTL() != Long.MAX_VALUE) {
6943                   List<Tag> newTags = new ArrayList<Tag>(1);
6944                   newTags.add(new Tag(TagType.TTL_TAG_TYPE, Bytes.toBytes(append.getTTL())));
6945                   // Add the new TTL tag
6946                   newCell = new KeyValue(cell.getRowArray(), cell.getRowOffset(),
6947                       cell.getRowLength(),
6948                     cell.getFamilyArray(), cell.getFamilyOffset(),
6949                       cell.getFamilyLength(),
6950                     cell.getQualifierArray(), cell.getQualifierOffset(),
6951                       cell.getQualifierLength(),
6952                     cell.getTimestamp(), KeyValue.Type.codeToType(cell.getTypeByte()),
6953                     cell.getValueArray(), cell.getValueOffset(), cell.getValueLength(),
6954                     newTags);
6955                 } else {
6956                   newCell = cell;
6957                 }
6958               }
6959 
6960               CellUtil.setSequenceId(newCell, mvccNum);
6961               // Give coprocessors a chance to update the new cell
6962               if (coprocessorHost != null) {
6963                 newCell = coprocessorHost.postMutationBeforeWAL(RegionObserver.MutationType.APPEND,
6964                     append, oldCell, newCell);
6965               }
6966               kvs.add(newCell);
6967 
6968               // Append update to WAL
6969               if (writeToWAL) {
6970                 if (walEdits == null) {
6971                   walEdits = new WALEdit();
6972                 }
6973                 walEdits.add(newCell);
6974               }
6975             }
6976 
6977             //store the kvs to the temporary memstore before writing WAL
6978             tempMemstore.put(store, kvs);
6979           }
6980 
6981           //Actually write to Memstore now
6982           for (Map.Entry<Store, List<Cell>> entry : tempMemstore.entrySet()) {
6983             Store store = entry.getKey();
6984             if (store.getFamily().getMaxVersions() == 1) {
6985               // upsert if VERSIONS for this CF == 1
6986               size += store.upsert(entry.getValue(), getSmallestReadPoint());
6987               memstoreCells.addAll(entry.getValue());
6988             } else {
6989               // otherwise keep older versions around
6990               for (Cell cell: entry.getValue()) {
6991                 Pair<Long, Cell> ret = store.add(cell);
6992                 size += ret.getFirst();
6993                 memstoreCells.add(ret.getSecond());
6994                 doRollBackMemstore = true;
6995               }
6996             }
6997             allKVs.addAll(entry.getValue());
6998           }
6999 
7000           // Actually write to WAL now
7001           if (writeToWAL) {
7002             // Using default cluster id, as this can only happen in the originating
7003             // cluster. A slave cluster receives the final value (not the delta)
7004             // as a Put.
7005             // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
7006             walKey = new HLogKey(getRegionInfo().getEncodedNameAsBytes(),
7007               this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, nonceGroup, nonce);
7008             txid = this.wal.append(this.htableDescriptor, getRegionInfo(), walKey, walEdits,
7009               this.sequenceId, true, memstoreCells);
7010           } else {
7011             recordMutationWithoutWal(append.getFamilyCellMap());
7012           }
7013           if (walKey == null) {
7014             // Append a faked WALEdit in order for SKIP_WAL updates to get mvcc assigned
7015             walKey = this.appendEmptyEdit(this.wal, memstoreCells);
7016           }
7017           size = this.addAndGetGlobalMemstoreSize(size);
7018           flush = isFlushSize(size);
7019         } finally {
7020           this.updatesLock.readLock().unlock();
7021         }
7022       } finally {
7023         rowLock.release();
7024         rowLock = null;
7025       }
7026       // sync the transaction log outside the rowlock
7027       if(txid != 0){
7028         syncOrDefer(txid, durability);
7029       }
7030       doRollBackMemstore = false;
7031     } finally {
7032       if (rowLock != null) {
7033         rowLock.release();
7034       }
7035       // if the wal sync was unsuccessful, remove keys from memstore
7036       if (doRollBackMemstore) {
7037         rollbackMemstore(memstoreCells);
7038       }
7039       if (w != null) {
7040         mvcc.completeMemstoreInsertWithSeqNum(w, walKey);
7041       }
7042       closeRegionOperation(Operation.APPEND);
7043     }
7044 
7045     if (this.metricsRegion != null) {
7046       this.metricsRegion.updateAppend();
7047     }
7048 
7049     if (flush) {
7050       // Request a cache flush. Do it outside update lock.
7051       requestFlush();
7052     }
7053 
7054 
7055     return append.isReturnResults() ? Result.create(allKVs) : null;
7056   }
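  // Illustrative sketch (editor's addition): caller-side use of append; "region" and the
  // family/qualifier names are assumptions for the example.
  //
  //   Append append = new Append(Bytes.toBytes("row1"));
  //   append.add(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("-suffix"));
  //   Result r = region.append(append, HConstants.NO_NONCE, HConstants.NO_NONCE);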
7057 
7058   public Result increment(Increment increment) throws IOException {
7059     return increment(increment, HConstants.NO_NONCE, HConstants.NO_NONCE);
7060   }
7061 
7062   // TODO: There's a lot of boiler plate code identical to append.
7063   // We should refactor append and increment as local get-mutate-put
7064   // transactions, so all stores only go through one code path for puts.
7065   /**
7066    * Perform one or more increment operations on a row.
7067    * @return new keyvalues after increment
7068    * @throws IOException
7069    */
7070   public Result increment(Increment increment, long nonceGroup, long nonce)
7071   throws IOException {
7072     byte [] row = increment.getRow();
7073     checkRow(row, "increment");
7074     TimeRange tr = increment.getTimeRange();
7075     boolean flush = false;
7076     Durability durability = getEffectiveDurability(increment.getDurability());
7077     boolean writeToWAL = durability != Durability.SKIP_WAL;
7078     WALEdit walEdits = null;
7079     List<Cell> allKVs = new ArrayList<Cell>(increment.size());
7080     Map<Store, List<Cell>> tempMemstore = new HashMap<Store, List<Cell>>();
7081 
7082     long size = 0;
7083     long txid = 0;
7084 
7085     checkReadOnly();
7086     checkResources();
7087     // Lock row
7088     startRegionOperation(Operation.INCREMENT);
7089     this.writeRequestsCount.increment();
7090     RowLock rowLock = null;
7091     WriteEntry w = null;
7092     WALKey walKey = null;
7093     long mvccNum = 0;
7094     List<Cell> memstoreCells = new ArrayList<Cell>();
7095     boolean doRollBackMemstore = false;
7096     try {
7097       rowLock = getRowLock(row);
7098       try {
7099         lock(this.updatesLock.readLock());
7100         try {
7101           // wait for all prior MVCC transactions to finish - while we hold the row lock
7102           // (so that we are guaranteed to see the latest state)
7103           mvcc.waitForPreviousTransactionsComplete();
7104           if (this.coprocessorHost != null) {
7105             Result r = this.coprocessorHost.preIncrementAfterRowLock(increment);
7106             if (r != null) {
7107               return r;
7108             }
7109           }
7110           // now start my own transaction
7111           mvccNum = MultiVersionConsistencyControl.getPreAssignedWriteNumber(this.sequenceId);
7112           w = mvcc.beginMemstoreInsertWithSeqNum(mvccNum);
7113           long now = EnvironmentEdgeManager.currentTime();
7114           // Process each family
7115           for (Map.Entry<byte [], List<Cell>> family:
7116               increment.getFamilyCellMap().entrySet()) {
7117 
7118             Store store = stores.get(family.getKey());
7119             List<Cell> kvs = new ArrayList<Cell>(family.getValue().size());
7120 
7121             // Sort the cells so that they match the order that they
7122             // appear in the Get results. Otherwise, we won't be able to
7123             // find the existing values if the cells are not specified
7124             // in order by the client since cells are in an array list.
7125             Collections.sort(family.getValue(), store.getComparator());
7126             // Get previous values for all columns in this family
7127             Get get = new Get(row);
7128             for (Cell cell: family.getValue()) {
7129               get.addColumn(family.getKey(),  CellUtil.cloneQualifier(cell));
7130             }
7131             get.setTimeRange(tr.getMin(), tr.getMax());
7132             List<Cell> results = get(get, false);
7133 
7134             // Iterate the input columns and update existing values if they were
7135             // found, otherwise add new column initialized to the increment amount
7136             int idx = 0;
7137             List<Cell> edits = family.getValue();
7138             for (int i = 0; i < edits.size(); i++) {
7139               Cell cell = edits.get(i);
7140               long amount = Bytes.toLong(CellUtil.cloneValue(cell));
7141               boolean noWriteBack = (amount == 0);
7142               List<Tag> newTags = new ArrayList<Tag>();
7143 
7144               // Carry forward any tags that might have been added by a coprocessor
7145               if (cell.getTagsLength() > 0) {
7146                 Iterator<Tag> itr = CellUtil.tagsIterator(cell.getTagsArray(),
7147                   cell.getTagsOffset(), cell.getTagsLength());
7148                 while (itr.hasNext()) {
7149                   newTags.add(itr.next());
7150                 }
7151               }
7152 
7153               Cell c = null;
7154               long ts = now;
7155               if (idx < results.size() && CellUtil.matchingQualifier(results.get(idx), cell)) {
7156                 c = results.get(idx);
7157                 ts = Math.max(now, c.getTimestamp());
7158                 if(c.getValueLength() == Bytes.SIZEOF_LONG) {
7159                   amount += Bytes.toLong(c.getValueArray(), c.getValueOffset(), Bytes.SIZEOF_LONG);
7160                 } else {
7161                   // throw DoNotRetryIOException instead of IllegalArgumentException
7162                   throw new org.apache.hadoop.hbase.DoNotRetryIOException(
7163                       "Attempted to increment field that isn't 64 bits wide");
7164                 }
7165                 // Carry tags forward from previous version
7166                 if (c.getTagsLength() > 0) {
7167                   Iterator<Tag> itr = CellUtil.tagsIterator(c.getTagsArray(),
7168                     c.getTagsOffset(), c.getTagsLength());
7169                   while (itr.hasNext()) {
7170                     newTags.add(itr.next());
7171                   }
7172                 }
7173                 if (i < ( edits.size() - 1) && !CellUtil.matchingQualifier(cell, edits.get(i + 1)))
7174                   idx++;
7175               }
7176 
7177               // Append new incremented KeyValue to list
7178               byte[] q = CellUtil.cloneQualifier(cell);
7179               byte[] val = Bytes.toBytes(amount);
7180 
7181               // Add the TTL tag if the mutation carried one
7182               if (increment.getTTL() != Long.MAX_VALUE) {
7183                 newTags.add(new Tag(TagType.TTL_TAG_TYPE, Bytes.toBytes(increment.getTTL())));
7184               }
7185 
7186               Cell newKV = new KeyValue(row, 0, row.length,
7187                 family.getKey(), 0, family.getKey().length,
7188                 q, 0, q.length,
7189                 ts,
7190                 KeyValue.Type.Put,
7191                 val, 0, val.length,
7192                 newTags);
7193 
7194               CellUtil.setSequenceId(newKV, mvccNum);
7195 
7196               // Give coprocessors a chance to update the new cell
7197               if (coprocessorHost != null) {
7198                 newKV = coprocessorHost.postMutationBeforeWAL(
7199                     RegionObserver.MutationType.INCREMENT, increment, c, newKV);
7200               }
7201               allKVs.add(newKV);
7202 
7203               if (!noWriteBack) {
7204                 kvs.add(newKV);
7205 
7206                 // Prepare WAL updates
7207                 if (writeToWAL) {
7208                   if (walEdits == null) {
7209                     walEdits = new WALEdit();
7210                   }
7211                   walEdits.add(newKV);
7212                 }
7213               }
7214             }
7215 
7216             //store the kvs to the temporary memstore before writing WAL
7217             if (!kvs.isEmpty()) {
7218               tempMemstore.put(store, kvs);
7219             }
7220           }
7221 
7222           //Actually write to Memstore now
7223           if (!tempMemstore.isEmpty()) {
7224             for (Map.Entry<Store, List<Cell>> entry : tempMemstore.entrySet()) {
7225               Store store = entry.getKey();
7226               if (store.getFamily().getMaxVersions() == 1) {
7227                 // upsert if VERSIONS for this CF == 1
7228                 size += store.upsert(entry.getValue(), getSmallestReadPoint());
7229                 memstoreCells.addAll(entry.getValue());
7230               } else {
7231                 // otherwise keep older versions around
7232                 for (Cell cell : entry.getValue()) {
7233                   Pair<Long, Cell> ret = store.add(cell);
7234                   size += ret.getFirst();
7235                   memstoreCells.add(ret.getSecond());
7236                   doRollBackMemstore = true;
7237                 }
7238               }
7239             }
7240             size = this.addAndGetGlobalMemstoreSize(size);
7241             flush = isFlushSize(size);
7242           }
7243 
7244           // Actually write to WAL now
7245           if (walEdits != null && !walEdits.isEmpty()) {
7246             if (writeToWAL) {
7247               // Using default cluster id, as this can only happen in the originating
7248               // cluster. A slave cluster receives the final value (not the delta)
7249               // as a Put.
7250               // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
7251               walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
7252                 this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, nonceGroup, nonce);
7253               txid = this.wal.append(this.htableDescriptor, this.getRegionInfo(),
7254                 walKey, walEdits, getSequenceId(), true, memstoreCells);
7255             } else {
7256               recordMutationWithoutWal(increment.getFamilyCellMap());
7257             }
7258           }
7259           if(walKey == null){
7260             // Append a faked WALEdit in order for SKIP_WAL updates to get mvccNum assigned
7261             walKey = this.appendEmptyEdit(this.wal, memstoreCells);
7262           }
7263         } finally {
7264           this.updatesLock.readLock().unlock();
7265         }
7266       } finally {
7267         rowLock.release();
7268         rowLock = null;
7269       }
7270       // sync the transaction log outside the rowlock
7271       if(txid != 0){
7272         syncOrDefer(txid, durability);
7273       }
7274       doRollBackMemstore = false;
7275     } finally {
7276       if (rowLock != null) {
7277         rowLock.release();
7278       }
7279       // if the wal sync was unsuccessful, remove keys from memstore
7280       if (doRollBackMemstore) {
7281         rollbackMemstore(memstoreCells);
7282       }
7283       if (w != null) {
7284         mvcc.completeMemstoreInsertWithSeqNum(w, walKey);
7285       }
7286       closeRegionOperation(Operation.INCREMENT);
7287       if (this.metricsRegion != null) {
7288         this.metricsRegion.updateIncrement();
7289       }
7290     }
7291 
7292     if (flush) {
7293       // Request a cache flush.  Do it outside update lock.
7294       requestFlush();
7295     }
7296 
7297     return Result.create(allKVs);
7298   }
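  // Illustrative sketch (editor's addition): caller-side use of increment. The stored value
  // must be an 8-byte long, otherwise a DoNotRetryIOException is thrown above; "region" and
  // the family/qualifier names are assumptions for the example.
  //
  //   Increment inc = new Increment(Bytes.toBytes("row1"));
  //   inc.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("counter"), 5L);
  //   Result r = region.increment(inc, HConstants.NO_NONCE, HConstants.NO_NONCE);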
7299 
7300   //
7301   // New HBASE-880 Helpers
7302   //
7303 
7304   private void checkFamily(final byte [] family)
7305   throws NoSuchColumnFamilyException {
7306     if (!this.htableDescriptor.hasFamily(family)) {
7307       throw new NoSuchColumnFamilyException("Column family " +
7308           Bytes.toString(family) + " does not exist in region " + this
7309           + " in table " + this.htableDescriptor);
7310     }
7311   }
7312 
7313   public static final long FIXED_OVERHEAD = ClassSize.align(
7314       ClassSize.OBJECT +
7315       ClassSize.ARRAY +
7316       45 * ClassSize.REFERENCE + 2 * Bytes.SIZEOF_INT +
7317       (14 * Bytes.SIZEOF_LONG) +
7318       5 * Bytes.SIZEOF_BOOLEAN);
7319 
7320   // woefully out of date - currently missing:
7321   // 1 x HashMap - coprocessorServiceHandlers
7322   // 6 x Counter - numMutationsWithoutWAL, dataInMemoryWithoutWAL,
7323   //   checkAndMutateChecksPassed, checkAndMutateChecksFailed, readRequestsCount,
7324   //   writeRequestsCount
7325   // 1 x HRegion$WriteState - writestate
7326   // 1 x RegionCoprocessorHost - coprocessorHost
7327   // 1 x RegionSplitPolicy - splitPolicy
7328   // 1 x MetricsRegion - metricsRegion
7329   // 1 x MetricsRegionWrapperImpl - metricsRegionWrapper
7330   public static final long DEEP_OVERHEAD = FIXED_OVERHEAD +
7331       ClassSize.OBJECT + // closeLock
7332       (2 * ClassSize.ATOMIC_BOOLEAN) + // closed, closing
7333       (3 * ClassSize.ATOMIC_LONG) + // memStoreSize, numPutsWithoutWAL, dataInMemoryWithoutWAL
7334       (2 * ClassSize.CONCURRENT_HASHMAP) +  // lockedRows, scannerReadPoints
7335       WriteState.HEAP_SIZE + // writestate
7336       ClassSize.CONCURRENT_SKIPLISTMAP + ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + // stores
7337       (2 * ClassSize.REENTRANT_LOCK) + // lock, updatesLock
7338       MultiVersionConsistencyControl.FIXED_SIZE // mvcc
7339       + ClassSize.TREEMAP // maxSeqIdInStores
7340       + 2 * ClassSize.ATOMIC_INTEGER // majorInProgress, minorInProgress
7341       ;
7342 
7343   @Override
7344   public long heapSize() {
7345     long heapSize = DEEP_OVERHEAD;
7346     for (Store store : this.stores.values()) {
7347       heapSize += store.heapSize();
7348     }
7349     // this does not take into account row locks, recent flushes, mvcc entries, and more
7350     return heapSize;
7351   }
7352 
7353   /*
7354    * This method calls System.exit.
7355    * @param message Message to print out.  May be null.
7356    */
7357   private static void printUsageAndExit(final String message) {
7358     if (message != null && message.length() > 0) System.out.println(message);
7359     System.out.println("Usage: HRegion CATALOG_TABLE_DIR [major_compact]");
7360     System.out.println("Options:");
7361     System.out.println(" major_compact  Pass this option to major compact " +
7362       "passed region.");
7363     System.out.println("Default outputs scan of passed region.");
7364     System.exit(1);
7365   }
7366 
7367   /**
7368    * Registers a new protocol buffer {@link Service} subclass as a coprocessor endpoint to
7369    * be available for handling
7370    * {@link HRegion#execService(com.google.protobuf.RpcController,
7371    *    org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall)} calls.
7372    *
7373    * <p>
7374    * Only a single instance may be registered per region for a given {@link Service} subclass (the
7375    * instances are keyed on {@link com.google.protobuf.Descriptors.ServiceDescriptor#getFullName()}).
7376    * After the first registration, subsequent calls with the same service name will fail with
7377    * a return value of {@code false}.
7378    * </p>
7379    * @param instance the {@code Service} subclass instance to expose as a coprocessor endpoint
7380    * @return {@code true} if the registration was successful, {@code false}
7381    * otherwise
7382    */
7383   public boolean registerService(Service instance) {
7384     /*
7385      * No stacking of instances is allowed for a single service name
7386      */
7387     Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType();
7388     if (coprocessorServiceHandlers.containsKey(serviceDesc.getFullName())) {
7389       LOG.error("Coprocessor service "+serviceDesc.getFullName()+
7390           " already registered, rejecting request from "+instance
7391       );
7392       return false;
7393     }
7394 
7395     coprocessorServiceHandlers.put(serviceDesc.getFullName(), instance);
7396     if (LOG.isDebugEnabled()) {
7397       LOG.debug("Registered coprocessor service: region="+
7398           Bytes.toStringBinary(getRegionName())+" service="+serviceDesc.getFullName());
7399     }
7400     return true;
7401   }
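  // Illustrative note (editor's addition): registerService is normally driven by the region
  // coprocessor machinery for coprocessors exposing an endpoint; a hedged sketch, where
  // "MyServiceImpl" is a hypothetical protobuf-generated Service subclass:
  //
  //   Service endpoint = new MyServiceImpl();
  //   if (!region.registerService(endpoint)) {
  //     LOG.warn("Already registered: " + endpoint.getDescriptorForType().getFullName());
  //   }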
7402 
7403   /**
7404    * Executes a single protocol buffer coprocessor endpoint {@link Service} method using
7405    * the registered protocol handlers.  {@link Service} implementations must be registered via the
7406    * {@link HRegion#registerService(com.google.protobuf.Service)}
7407    * method before they are available.
7408    *
7409    * @param controller an {@code RpcController} implementation to pass to the invoked service
7410    * @param call a {@code CoprocessorServiceCall} instance identifying the service, method,
7411    *     and parameters for the method invocation
7412    * @return a protocol buffer {@code Message} instance containing the method's result
7413    * @throws IOException if no registered service handler is found or an error
7414    *     occurs during the invocation
7415    * @see org.apache.hadoop.hbase.regionserver.HRegion#registerService(com.google.protobuf.Service)
7416    */
7417   public Message execService(RpcController controller, CoprocessorServiceCall call)
7418       throws IOException {
7419     String serviceName = call.getServiceName();
7420     String methodName = call.getMethodName();
7421     if (!coprocessorServiceHandlers.containsKey(serviceName)) {
7422       throw new UnknownProtocolException(null,
7423           "No registered coprocessor service found for name "+serviceName+
7424           " in region "+Bytes.toStringBinary(getRegionName()));
7425     }
7426 
7427     Service service = coprocessorServiceHandlers.get(serviceName);
7428     Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType();
7429     Descriptors.MethodDescriptor methodDesc = serviceDesc.findMethodByName(methodName);
7430     if (methodDesc == null) {
7431       throw new UnknownProtocolException(service.getClass(),
7432           "Unknown method "+methodName+" called on service "+serviceName+
7433               " in region "+Bytes.toStringBinary(getRegionName()));
7434     }
7435 
7436     Message request = service.getRequestPrototype(methodDesc).newBuilderForType()
7437         .mergeFrom(call.getRequest()).build();
7438 
7439     if (coprocessorHost != null) {
7440       request = coprocessorHost.preEndpointInvocation(service, methodName, request);
7441     }
7442 
7443     final Message.Builder responseBuilder =
7444         service.getResponsePrototype(methodDesc).newBuilderForType();
7445     service.callMethod(methodDesc, controller, request, new RpcCallback<Message>() {
7446       @Override
7447       public void run(Message message) {
7448         if (message != null) {
7449           responseBuilder.mergeFrom(message);
7450         }
7451       }
7452     });
7453 
7454     if (coprocessorHost != null) {
7455       coprocessorHost.postEndpointInvocation(service, methodName, request, responseBuilder);
7456     }
7457 
7458     return responseBuilder.build();
7459   }
7460 
7461   /*
7462    * Process table.
7463    * Do major compaction or list content.
7464    * @throws IOException
7465    */
7466   private static void processTable(final FileSystem fs, final Path p,
7467       final WALFactory walFactory, final Configuration c,
7468       final boolean majorCompact)
7469   throws IOException {
7470     HRegion region;
7471     FSTableDescriptors fst = new FSTableDescriptors(c);
7472     // Currently expects the table to have one region only.
7473     if (FSUtils.getTableName(p).equals(TableName.META_TABLE_NAME)) {
7474       final WAL wal = walFactory.getMetaWAL(
7475           HRegionInfo.FIRST_META_REGIONINFO.getEncodedNameAsBytes());
7476       region = HRegion.newHRegion(p, wal, fs, c,
7477         HRegionInfo.FIRST_META_REGIONINFO,
7478           fst.get(TableName.META_TABLE_NAME), null);
7479     } else {
7480       throw new IOException("Not a known catalog table: " + p.toString());
7481     }
7482     try {
7483       region.initialize(null);
7484       if (majorCompact) {
7485         region.compactStores(true);
7486       } else {
7487         // Default behavior
7488         Scan scan = new Scan();
7489         // scan.addFamily(HConstants.CATALOG_FAMILY);
7490         RegionScanner scanner = region.getScanner(scan);
7491         try {
7492           List<Cell> kvs = new ArrayList<Cell>();
7493           boolean done;
7494           do {
7495             kvs.clear();
7496             done = NextState.hasMoreValues(scanner.next(kvs));
7497             if (kvs.size() > 0) LOG.info(kvs);
7498           } while (done);
7499         } finally {
7500           scanner.close();
7501         }
7502       }
7503     } finally {
7504       region.close();
7505     }
7506   }
7507 
7508   boolean shouldForceSplit() {
7509     return this.splitRequest;
7510   }
7511 
7512   byte[] getExplicitSplitPoint() {
7513     return this.explicitSplitPoint;
7514   }
7515 
7516   void forceSplit(byte[] sp) {
7517     // This HRegion will go away after the forced split is successful
7518     // But if a forced split fails, we need to clear forced split.
7519     this.splitRequest = true;
7520     if (sp != null) {
7521       this.explicitSplitPoint = sp;
7522     }
7523   }
7524 
7525   void clearSplit() {
7526     this.splitRequest = false;
7527     this.explicitSplitPoint = null;
7528   }
7529 
7530   /**
7531    * Give the region a chance to prepare before it is split.
7532    */
7533   protected void prepareToSplit() {
7534     // nothing
7535   }
7536 
7537   /**
7538    * Return the split point. A null return indicates the region isn't splittable.
7539    * If the split point isn't explicitly specified, this method goes over the stores
7540    * to find the best split point. Currently the criterion for the best split point
7541    * is the size of the store.
7542    */
7543   public byte[] checkSplit() {
7544     // Can't split META
7545     if (this.getRegionInfo().isMetaTable() ||
7546         TableName.NAMESPACE_TABLE_NAME.equals(this.getRegionInfo().getTable())) {
7547       if (shouldForceSplit()) {
7548         LOG.warn("Cannot split meta region in HBase 0.20 and above");
7549       }
7550       return null;
7551     }
7552 
7553     // Can't split region which is in recovering state
7554     if (this.isRecovering()) {
7555       LOG.info("Cannot split region " + this.getRegionInfo().getEncodedName() + " in recovery.");
7556       return null;
7557     }
7558 
7559     if (!splitPolicy.shouldSplit()) {
7560       return null;
7561     }
7562 
7563     byte[] ret = splitPolicy.getSplitPoint();
7564 
7565     if (ret != null) {
7566       try {
7567         checkRow(ret, "calculated split");
7568       } catch (IOException e) {
7569         LOG.error("Ignoring invalid split", e);
7570         return null;
7571       }
7572     }
7573     return ret;
7574   }
7575 
7576   /**
7577    * @return The priority that this region should have in the compaction queue
7578    */
7579   public int getCompactPriority() {
7580     int count = Integer.MAX_VALUE;
7581     for (Store store : stores.values()) {
7582       count = Math.min(count, store.getCompactPriority());
7583     }
7584     return count;
7585   }
7586 
7587 
7588   /** @return the coprocessor host */
7589   public RegionCoprocessorHost getCoprocessorHost() {
7590     return coprocessorHost;
7591   }
7592 
7593   /** @param coprocessorHost the new coprocessor host */
7594   public void setCoprocessorHost(final RegionCoprocessorHost coprocessorHost) {
7595     this.coprocessorHost = coprocessorHost;
7596   }
7597 
7598   /**
7599    * This method needs to be called before any public call that reads or
7600    * modifies data. It has to be called just before a try.
7601    * #closeRegionOperation needs to be called in the try's finally block.
7602    * Acquires a read lock and checks if the region is closing or closed.
7603    * @throws IOException
7604    */
7605   public void startRegionOperation() throws IOException {
7606     startRegionOperation(Operation.ANY);
7607   }
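  // Illustrative sketch (editor's addition) of the calling pattern described above:
  // startRegionOperation immediately before a try, closeRegionOperation in its finally block.
  //
  //   startRegionOperation(Operation.GET);
  //   try {
  //     // ... read or modify data ...
  //   } finally {
  //     closeRegionOperation(Operation.GET);
  //   }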
7608 
7609   /**
7610    * @param op The operation about to be taken on the region
7611    * @throws IOException
7612    */
7613   protected void startRegionOperation(Operation op) throws IOException {
7614     switch (op) {
7615     case GET:  // read operations
7616     case SCAN:
7617       checkReadsEnabled();
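      // fall through: the recovering-state check below applies to reads as well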
7618     case INCREMENT: // write operations
7619     case APPEND:
7620     case SPLIT_REGION:
7621     case MERGE_REGION:
7622     case PUT:
7623     case DELETE:
7624     case BATCH_MUTATE:
7625     case COMPACT_REGION:
7626       // when a region is in recovering state, no read, split or merge is allowed
7627       if (isRecovering() && (this.disallowWritesInRecovering ||
7628               (op != Operation.PUT && op != Operation.DELETE && op != Operation.BATCH_MUTATE))) {
7629         throw new RegionInRecoveryException(this.getRegionNameAsString() +
7630           " is recovering; cannot take reads");
7631       }
7632       break;
7633     default:
7634       break;
7635     }
7636     if (op == Operation.MERGE_REGION || op == Operation.SPLIT_REGION
7637         || op == Operation.COMPACT_REGION) {
7638       // split, merge or compact region doesn't need to check the closing/closed state or lock the
7639       // region
7640       return;
7641     }
7642     if (this.closing.get()) {
7643       throw new NotServingRegionException(getRegionNameAsString() + " is closing");
7644     }
7645     lock(lock.readLock());
7646     if (this.closed.get()) {
7647       lock.readLock().unlock();
7648       throw new NotServingRegionException(getRegionNameAsString() + " is closed");
7649     }
7650     try {
7651       if (coprocessorHost != null) {
7652         coprocessorHost.postStartRegionOperation(op);
7653       }
7654     } catch (Exception e) {
7655       lock.readLock().unlock();
7656       throw new IOException(e);
7657     }
7658   }
7659 
7660   /**
7661    * Releases the lock. This needs to be called in the finally block corresponding
7662    * to the try block of {@link #startRegionOperation()}.
7663    * @throws IOException
7664    */
7665   public void closeRegionOperation() throws IOException {
7666     closeRegionOperation(Operation.ANY);
7667   }
7668 
7669   /**
7670    * Releases the lock. This needs to be called in the finally block corresponding
7671    * to the try block of {@link #startRegionOperation(Operation)}.
7672    * @throws IOException
7673    */
7674   public void closeRegionOperation(Operation operation) throws IOException {
7675     lock.readLock().unlock();
7676     if (coprocessorHost != null) {
7677       coprocessorHost.postCloseRegionOperation(operation);
7678     }
7679   }
7680 
7681   /**
7682    * This method needs to be called before any public call that reads or
7683    * modifies stores in bulk. It has to be called just before a try block;
7684    * #closeBulkRegionOperation needs to be called in the try's finally block.
7685    * Acquires a write or read lock (per {@code writeLockNeeded}) and checks if the region is closing or closed.
7686    * @throws NotServingRegionException when the region is closing or closed
7687    * @throws RegionTooBusyException if failed to get the lock in time
7688    * @throws InterruptedIOException if interrupted while waiting for a lock
7689    */
7690   private void startBulkRegionOperation(boolean writeLockNeeded)
7691       throws NotServingRegionException, RegionTooBusyException, InterruptedIOException {
7692     if (this.closing.get()) {
7693       throw new NotServingRegionException(getRegionNameAsString() + " is closing");
7694     }
7695     if (writeLockNeeded) lock(lock.writeLock());
7696     else lock(lock.readLock());
7697     if (this.closed.get()) {
7698       if (writeLockNeeded) lock.writeLock().unlock();
7699       else lock.readLock().unlock();
7700       throw new NotServingRegionException(getRegionNameAsString() + " is closed");
7701     }
7702   }
7703 
7704   /**
7705    * Releases the lock. This needs to be called in the finally block corresponding
7706    * to the try block of #startBulkRegionOperation.
7707    */
7708   private void closeBulkRegionOperation(){
7709     if (lock.writeLock().isHeldByCurrentThread()) lock.writeLock().unlock();
7710     else lock.readLock().unlock();
7711   }
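  // A minimal internal usage sketch (illustrative; the body of the try block is an
  // assumption): the bulk variants follow the same try/finally pairing, with the flag
  // selecting the write lock or the read lock.
  //
  //   startBulkRegionOperation(true); // true = take the write lock
  //   try {
  //     // bulk-read or bulk-modify the stores
  //   } finally {
  //     closeBulkRegionOperation();
  //   }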
7712 
7713   /**
7714    * Update counters for the number of puts without WAL and the size of possible data loss.
7715    * This information is exposed by the region server metrics.
7716    */
7717   private void recordMutationWithoutWal(final Map<byte [], List<Cell>> familyMap) {
7718     numMutationsWithoutWAL.increment();
7719     if (numMutationsWithoutWAL.get() <= 1) {
7720       LOG.info("writing data to region " + this +
7721                " with WAL disabled. Data may be lost in the event of a crash.");
7722     }
7723 
7724     long mutationSize = 0;
7725     for (List<Cell> cells: familyMap.values()) {
7726       assert cells instanceof RandomAccess;
7727       int listSize = cells.size();
7728       for (int i=0; i < listSize; i++) {
7729         Cell cell = cells.get(i);
7730         // TODO we need to include the tags length here as well.
7731         mutationSize += KeyValueUtil.keyLength(cell) + cell.getValueLength();
7732       }
7733     }
7734 
7735     dataInMemoryWithoutWAL.add(mutationSize);
7736   }
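  // Client-side context (an assumption about typical usage, not shown in this file):
  // mutations usually reach this accounting path because the client opted out of the
  // WAL for a mutation, e.g.:
  //
  //   Put p = new Put(rowKey);                // rowKey is illustrative
  //   p.setDurability(Durability.SKIP_WAL);   // skip the write-ahead log
  //   table.put(p);                           // data is lost if the server crashes before a flush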
7737 
7738   private void lock(final Lock lock)
7739       throws RegionTooBusyException, InterruptedIOException {
7740     lock(lock, 1);
7741   }
7742 
7743   /**
7744    * Try to acquire a lock. Throws RegionTooBusyException
7745    * if the lock cannot be acquired within the wait time, and InterruptedIOException
7746    * if interrupted while waiting for the lock.
7747    */
7748   private void lock(final Lock lock, final int multiplier)
7749       throws RegionTooBusyException, InterruptedIOException {
7750     try {
7751       final long waitTime = Math.min(maxBusyWaitDuration,
7752           busyWaitDuration * Math.min(multiplier, maxBusyWaitMultiplier));
7753       if (!lock.tryLock(waitTime, TimeUnit.MILLISECONDS)) {
7754         throw new RegionTooBusyException(
7755             "failed to get a lock in " + waitTime + " ms. " +
7756                 "regionName=" + (this.getRegionInfo() == null ? "unknown" :
7757                 this.getRegionInfo().getRegionNameAsString()) +
7758                 ", server=" + (this.getRegionServerServices() == null ? "unknown" :
7759                 this.getRegionServerServices().getServerName()));