1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.regionserver;
19  
20  import static org.apache.hadoop.hbase.HConstants.REPLICATION_SCOPE_LOCAL;
21  
22  import com.google.common.annotations.VisibleForTesting;
23  import com.google.common.base.Optional;
24  import com.google.common.base.Preconditions;
25  import com.google.common.collect.Lists;
26  import com.google.common.collect.Maps;
27  import com.google.common.io.Closeables;
28  import com.google.protobuf.ByteString;
29  import com.google.protobuf.Descriptors;
30  import com.google.protobuf.Message;
31  import com.google.protobuf.RpcCallback;
32  import com.google.protobuf.RpcController;
33  import com.google.protobuf.Service;
34  import com.google.protobuf.TextFormat;
35  import java.io.EOFException;
36  import java.io.FileNotFoundException;
37  import java.io.IOException;
38  import java.io.InterruptedIOException;
39  import java.lang.reflect.Constructor;
40  import java.text.ParseException;
41  import java.util.AbstractList;
42  import java.util.ArrayList;
43  import java.util.Arrays;
44  import java.util.Collection;
45  import java.util.Collections;
46  import java.util.Comparator;
47  import java.util.HashMap;
48  import java.util.HashSet;
49  import java.util.Iterator;
50  import java.util.List;
51  import java.util.Map;
52  import java.util.Map.Entry;
53  import java.util.NavigableMap;
54  import java.util.NavigableSet;
55  import java.util.RandomAccess;
56  import java.util.Set;
57  import java.util.TreeMap;
58  import java.util.UUID;
59  import java.util.concurrent.Callable;
60  import java.util.concurrent.CompletionService;
61  import java.util.concurrent.ConcurrentHashMap;
62  import java.util.concurrent.ConcurrentMap;
63  import java.util.concurrent.ConcurrentSkipListMap;
64  import java.util.concurrent.ExecutionException;
65  import java.util.concurrent.ExecutorCompletionService;
66  import java.util.concurrent.ExecutorService;
67  import java.util.concurrent.Executors;
68  import java.util.concurrent.Future;
69  import java.util.concurrent.FutureTask;
70  import java.util.concurrent.ThreadFactory;
71  import java.util.concurrent.ThreadPoolExecutor;
72  import java.util.concurrent.TimeUnit;
73  import java.util.concurrent.TimeoutException;
74  import java.util.concurrent.atomic.AtomicBoolean;
75  import java.util.concurrent.atomic.AtomicInteger;
76  import java.util.concurrent.atomic.AtomicLong;
77  import java.util.concurrent.locks.Lock;
78  import java.util.concurrent.locks.ReadWriteLock;
79  import java.util.concurrent.locks.ReentrantReadWriteLock;
80
81  import org.apache.commons.logging.Log;
82  import org.apache.commons.logging.LogFactory;
83  import org.apache.hadoop.conf.Configuration;
84  import org.apache.hadoop.fs.FileStatus;
85  import org.apache.hadoop.fs.FileSystem;
86  import org.apache.hadoop.fs.Path;
87  import org.apache.hadoop.hbase.Cell;
88  import org.apache.hadoop.hbase.CellComparator;
89  import org.apache.hadoop.hbase.CellScanner;
90  import org.apache.hadoop.hbase.CellUtil;
91  import org.apache.hadoop.hbase.CompoundConfiguration;
92  import org.apache.hadoop.hbase.DoNotRetryIOException;
93  import org.apache.hadoop.hbase.DroppedSnapshotException;
94  import org.apache.hadoop.hbase.HColumnDescriptor;
95  import org.apache.hadoop.hbase.HConstants;
96  import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
97  import org.apache.hadoop.hbase.HDFSBlocksDistribution;
98  import org.apache.hadoop.hbase.HRegionInfo;
99  import org.apache.hadoop.hbase.HTableDescriptor;
100 import org.apache.hadoop.hbase.KeyValue;
101 import org.apache.hadoop.hbase.KeyValueUtil;
102 import org.apache.hadoop.hbase.NamespaceDescriptor;
103 import org.apache.hadoop.hbase.NotServingRegionException;
104 import org.apache.hadoop.hbase.RegionTooBusyException;
105 import org.apache.hadoop.hbase.TableName;
106 import org.apache.hadoop.hbase.Tag;
107 import org.apache.hadoop.hbase.TagRewriteCell;
108 import org.apache.hadoop.hbase.TagUtil;
109 import org.apache.hadoop.hbase.UnknownScannerException;
110 import org.apache.hadoop.hbase.backup.HFileArchiver;
111 import org.apache.hadoop.hbase.classification.InterfaceAudience;
112 import org.apache.hadoop.hbase.client.Append;
113 import org.apache.hadoop.hbase.client.Delete;
114 import org.apache.hadoop.hbase.client.Durability;
115 import org.apache.hadoop.hbase.client.Get;
116 import org.apache.hadoop.hbase.client.Increment;
117 import org.apache.hadoop.hbase.client.IsolationLevel;
118 import org.apache.hadoop.hbase.client.Mutation;
119 import org.apache.hadoop.hbase.client.Put;
120 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
121 import org.apache.hadoop.hbase.client.Result;
122 import org.apache.hadoop.hbase.client.RowMutations;
123 import org.apache.hadoop.hbase.client.Scan;
124 import org.apache.hadoop.hbase.conf.ConfigurationManager;
125 import org.apache.hadoop.hbase.conf.PropagatingConfigurationObserver;
126 import org.apache.hadoop.hbase.coprocessor.RegionObserver.MutationType;
127 import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
128 import org.apache.hadoop.hbase.exceptions.FailedSanityCheckException;
129 import org.apache.hadoop.hbase.exceptions.RegionInRecoveryException;
130 import org.apache.hadoop.hbase.exceptions.UnknownProtocolException;
131 import org.apache.hadoop.hbase.filter.ByteArrayComparable;
132 import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
133 import org.apache.hadoop.hbase.filter.FilterWrapper;
134 import org.apache.hadoop.hbase.filter.IncompatibleFilterException;
135 import org.apache.hadoop.hbase.io.HeapSize;
136 import org.apache.hadoop.hbase.io.TimeRange;
137 import org.apache.hadoop.hbase.io.hfile.HFile;
138 import org.apache.hadoop.hbase.ipc.CallerDisconnectedException;
139 import org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils;
140 import org.apache.hadoop.hbase.ipc.RpcCallContext;
141 import org.apache.hadoop.hbase.ipc.RpcServer;
142 import org.apache.hadoop.hbase.mob.MobUtils;
143 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
144 import org.apache.hadoop.hbase.monitoring.TaskMonitor;
145 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
146 import org.apache.hadoop.hbase.protobuf.ResponseConverter;
147 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.GetRegionInfoResponse.CompactionState;
148 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
149 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall;
150 import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.RegionLoad;
151 import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.StoreSequenceId;
152 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
153 import org.apache.hadoop.hbase.protobuf.generated.WALProtos;
154 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.CompactionDescriptor;
155 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FlushDescriptor;
156 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FlushDescriptor.FlushAction;
157 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FlushDescriptor.StoreFlushDescriptor;
158 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.RegionEventDescriptor;
159 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.RegionEventDescriptor.EventType;
160 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.StoreDescriptor;
161 import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl.WriteEntry;
162 import org.apache.hadoop.hbase.regionserver.ScannerContext.LimitScope;
163 import org.apache.hadoop.hbase.regionserver.ScannerContext.NextState;
164 import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext;
165 import org.apache.hadoop.hbase.regionserver.throttle.CompactionThroughputControllerFactory;
166 import org.apache.hadoop.hbase.regionserver.throttle.NoLimitThroughputController;
167 import org.apache.hadoop.hbase.regionserver.throttle.ThroughputController;
168 import org.apache.hadoop.hbase.regionserver.wal.HLogKey;
169 import org.apache.hadoop.hbase.regionserver.wal.ReplayHLogKey;
170 import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
171 import org.apache.hadoop.hbase.regionserver.wal.WALUtil;
172 import org.apache.hadoop.hbase.security.User;
173 import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
174 import org.apache.hadoop.hbase.snapshot.SnapshotManifest;
175 import org.apache.hadoop.hbase.util.ByteStringer;
176 import org.apache.hadoop.hbase.util.Bytes;
177 import org.apache.hadoop.hbase.util.CancelableProgressable;
178 import org.apache.hadoop.hbase.util.ClassSize;
179 import org.apache.hadoop.hbase.util.CompressionTest;
180 import org.apache.hadoop.hbase.util.Counter;
181 import org.apache.hadoop.hbase.util.EncryptionTest;
182 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
183 import org.apache.hadoop.hbase.util.FSUtils;
184 import org.apache.hadoop.hbase.util.HashedBytes;
185 import org.apache.hadoop.hbase.util.Pair;
186 import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil;
187 import org.apache.hadoop.hbase.util.Threads;
188 import org.apache.hadoop.hbase.wal.WAL;
189 import org.apache.hadoop.hbase.wal.WALFactory;
190 import org.apache.hadoop.hbase.wal.WALKey;
191 import org.apache.hadoop.hbase.wal.WALSplitter;
192 import org.apache.hadoop.hbase.wal.WALSplitter.MutationReplay;
193 import org.apache.hadoop.io.MultipleIOException;
194 import org.apache.hadoop.util.StringUtils;
195 import org.apache.htrace.Trace;
196 import org.apache.htrace.TraceScope;
197
198
199 @SuppressWarnings("deprecation")
200 @InterfaceAudience.Private
201 public class HRegion implements HeapSize, PropagatingConfigurationObserver, Region {
202   private static final Log LOG = LogFactory.getLog(HRegion.class);
203
204   public static final String LOAD_CFS_ON_DEMAND_CONFIG_KEY =
205     "hbase.hregion.scan.loadColumnFamiliesOnDemand";
206
207   /**
208    * This is the global default value for durability. All tables/mutations not
209    * defining a durability or using USE_DEFAULT will default to this value.
210    */
211   private static final Durability DEFAULT_DURABILITY = Durability.SYNC_WAL;
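  // Illustrative sketch, not part of the original source: a table or a single mutation
  // can opt out of the SYNC_WAL default (assuming the standard descriptor/mutation APIs), e.g.
  //   htd.setDurability(Durability.ASYNC_WAL); // per-table override
  //   put.setDurability(Durability.SKIP_WAL);  // per-mutation override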
212
213   final AtomicBoolean closed = new AtomicBoolean(false);
214
215   /* Closing can take some time; use the closing flag if there is stuff we don't
216    * want to do while in the closing state, e.g. offering this region up to the
217    * master as a region to close if the carrying regionserver is overloaded.
218    * Once set, it is never cleared.
219    */
220   final AtomicBoolean closing = new AtomicBoolean(false);
221
222   /**
223    * The max sequence id of flushed data on this region. There is no edit in memory that is
224    * less than this sequence id.
225    */
226   private volatile long maxFlushedSeqId = HConstants.NO_SEQNUM;
227
228   /**
229    * Record the sequence id of last flush operation. Can be in advance of
230    * {@link #maxFlushedSeqId} when flushing a single column family. In this case,
231    * {@link #maxFlushedSeqId} will be older than the oldest edit in memory.
232    */
233   private volatile long lastFlushOpSeqId = HConstants.NO_SEQNUM;
234
235   /**
236    * The sequence id of the last replayed open region event from the primary region. This is used
237    * to skip earlier entries, since replayed edits may arrive out of order from
238    * replication.
239    */
240   protected volatile long lastReplayedOpenRegionSeqId = -1L;
241   protected volatile long lastReplayedCompactionSeqId = -1L;
242
243   //////////////////////////////////////////////////////////////////////////////
244   // Members
245   //////////////////////////////////////////////////////////////////////////////
246
247   // map from a locked row to the context for that lock including:
248   // - CountDownLatch for threads waiting on that row
249   // - the thread that owns the lock (allow reentrancy)
250   // - reference count of (reentrant) locks held by the thread
251   // - the row itself
252   private final ConcurrentHashMap<HashedBytes, RowLockContext> lockedRows =
253       new ConcurrentHashMap<HashedBytes, RowLockContext>();
254
255   protected final Map<byte[], Store> stores = new ConcurrentSkipListMap<byte[], Store>(
256       Bytes.BYTES_RAWCOMPARATOR);
257
258   // TODO: account for each registered handler in HeapSize computation
259   private Map<String, Service> coprocessorServiceHandlers = Maps.newHashMap();
260
261   private final AtomicLong memstoreSize = new AtomicLong(0);
262   private final RegionServicesForStores regionServicesForStores = new RegionServicesForStores(this);
263
264   // Debug possible data loss due to WAL off
265   final Counter numMutationsWithoutWAL = new Counter();
266   final Counter dataInMemoryWithoutWAL = new Counter();
267
268   // Debug why CAS operations are taking a while.
269   final Counter checkAndMutateChecksPassed = new Counter();
270   final Counter checkAndMutateChecksFailed = new Counter();
271
272   // Number of requests
273   final Counter readRequestsCount = new Counter();
274   final Counter filteredReadRequestsCount = new Counter();
275   final Counter writeRequestsCount = new Counter();
276
277   // Number of requests blocked by memstore size.
278   private final Counter blockedRequestsCount = new Counter();
279
280   // Compaction counters
281   final AtomicLong compactionsFinished = new AtomicLong(0L);
282   final AtomicLong compactionNumFilesCompacted = new AtomicLong(0L);
283   final AtomicLong compactionNumBytesCompacted = new AtomicLong(0L);
284
285   private final WAL wal;
286   private final HRegionFileSystem fs;
287   protected final Configuration conf;
288   private final Configuration baseConf;
289   private final int rowLockWaitDuration;
290   private CompactedHFilesDischarger compactedFileDischarger;
291   static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000;
292
293   // The internal wait duration to acquire a lock before read/update
294   // from the region. It is not per row. The purpose of this wait time
295   // is to avoid waiting a long time while the region is busy, so that
296   // we can release the IPC handler soon enough to improve the
297   // availability of the region server. It can be adjusted by
298   // tuning configuration "hbase.busy.wait.duration".
299   final long busyWaitDuration;
300   static final long DEFAULT_BUSY_WAIT_DURATION = HConstants.DEFAULT_HBASE_RPC_TIMEOUT;
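  // Illustrative tuning sketch; the values below are assumptions for demonstration only:
  //   hbase.busy.wait.duration = 10000       (wait up to 10s on a busy region)
  //   hbase.busy.wait.multiplier.max = 2     (cap the multi-row wait multiplier at 2)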
301
302   // If updating multiple rows in one call, wait longer,
303   // i.e. waiting for busyWaitDuration * # of rows. However,
304   // we can limit the max multiplier.
305   final int maxBusyWaitMultiplier;
306
307   // Max busy wait duration. There is no point in waiting longer than the RPC
308   // purge timeout, after which an RPC call will be terminated by the RPC engine.
309   final long maxBusyWaitDuration;
310
311   // negative number indicates infinite timeout
312   static final long DEFAULT_ROW_PROCESSOR_TIMEOUT = 60 * 1000L;
313   final ExecutorService rowProcessorExecutor = Executors.newCachedThreadPool();
314
315   private final ConcurrentHashMap<RegionScanner, Long> scannerReadPoints;
316
317   /**
318    * The sequence ID that was encountered when this region was opened.
319    */
320   private long openSeqNum = HConstants.NO_SEQNUM;
321
322   /**
323    * The default setting for whether to enable on-demand CF loading for
324    * scan requests to this region. Requests can override it.
325    */
326   private boolean isLoadingCfsOnDemandDefault = false;
327
328   private final AtomicInteger majorInProgress = new AtomicInteger(0);
329   private final AtomicInteger minorInProgress = new AtomicInteger(0);
330
331   //
332   // Context: During replay we want to ensure that we do not lose any data. So, we
333   // have to be conservative in how we replay wals. For each store, we calculate
334   // the maxSeqId up to which the store was flushed, and skip the edits which
335   // are equal to or lower than maxSeqId for each store.
336   // The following map is populated when opening the region
337   Map<byte[], Long> maxSeqIdInStores = new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
338
339   /** Saved state from replaying prepare flush cache */
340   private PrepareFlushResult prepareFlushResult = null;
341
342   /**
343    * Config setting for whether to allow writes while a region is in the recovering state.
344    */
345   private boolean disallowWritesInRecovering = false;
346
347   // When a region is in recovering state, it can only accept writes, not reads
348   private volatile boolean recovering = false;
349
350   private volatile Optional<ConfigurationManager> configurationManager;
351
352   /**
353    * @return The smallest mvcc readPoint across all the scanners in this
354    * region. Writes older than this readPoint are included in every
355    * read operation.
356    */
357   public long getSmallestReadPoint() {
358     long minimumReadPoint;
359     // We need to ensure that while we are calculating the smallestReadPoint
360     // no new RegionScanners can grab a readPoint that we are unaware of.
361     // We achieve this by synchronizing on the scannerReadPoints object.
362     synchronized(scannerReadPoints) {
363       minimumReadPoint = mvcc.getReadPoint();
364       for (Long readPoint: this.scannerReadPoints.values()) {
365         if (readPoint < minimumReadPoint) {
366           minimumReadPoint = readPoint;
367         }
368       }
369     }
370     return minimumReadPoint;
371   }
372
373   /*
374    * Data structure of write state flags used for coordinating flushes,
375    * compactions and closes.
376    */
377   static class WriteState {
378     // Set while a memstore flush is happening.
379     volatile boolean flushing = false;
380     // Set when a flush has been requested.
381     volatile boolean flushRequested = false;
382     // Number of compactions running.
383     AtomicInteger compacting = new AtomicInteger(0);
384     // Gets set in close. If set, cannot compact or flush again.
385     volatile boolean writesEnabled = true;
386     // Set if region is read-only
387     volatile boolean readOnly = false;
388     // whether reads are enabled. This is different from readOnly, because readOnly is
389     // static in the lifetime of the region, while readsEnabled is dynamic
390     volatile boolean readsEnabled = true;
391
392     /**
393      * Set flags that make this region read-only.
394      *
395      * @param onOff flip value for region r/o setting
396      */
397     synchronized void setReadOnly(final boolean onOff) {
398       this.writesEnabled = !onOff;
399       this.readOnly = onOff;
400     }
401
402     boolean isReadOnly() {
403       return this.readOnly;
404     }
405
406     boolean isFlushRequested() {
407       return this.flushRequested;
408     }
409
410     void setReadsEnabled(boolean readsEnabled) {
411       this.readsEnabled = readsEnabled;
412     }
413
414     static final long HEAP_SIZE = ClassSize.align(
415         ClassSize.OBJECT + 5 * Bytes.SIZEOF_BOOLEAN);
416   }
417
418   /**
419    * Objects from this class are created when flushing to describe all the different states
420    * the flush can end up in. The Result enum describes those states. The sequence id should only
421    * be specified if the flush was successful, and the failure message should only be specified
422    * if it didn't flush.
423    */
424   public static class FlushResultImpl implements FlushResult {
425     final Result result;
426     final String failureReason;
427     final long flushSequenceId;
428     final boolean wroteFlushWalMarker;
429
430     /**
431      * Convenience constructor to use when the flush is successful; the failure message is set to
432      * null.
433      * @param result Expecting FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
434      * @param flushSequenceId Generated sequence id that comes right after the edits in the
435      *                        memstores.
436      */
437     FlushResultImpl(Result result, long flushSequenceId) {
438       this(result, flushSequenceId, null, false);
439       assert result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
440           .FLUSHED_COMPACTION_NEEDED;
441     }
442
443     /**
444      * Convenience constructor to use when we cannot flush.
445      * @param result Expecting CANNOT_FLUSH_MEMSTORE_EMPTY or CANNOT_FLUSH.
446      * @param failureReason Reason why we couldn't flush.
447      */
448     FlushResultImpl(Result result, String failureReason, boolean wroteFlushMarker) {
449       this(result, -1, failureReason, wroteFlushMarker);
450       assert result == Result.CANNOT_FLUSH_MEMSTORE_EMPTY || result == Result.CANNOT_FLUSH;
451     }
452
453     /**
454      * Constructor with all the parameters.
455      * @param result Any of the Result values.
456      * @param flushSequenceId Generated sequence id if the memstores were flushed else -1.
457      * @param failureReason Reason why we couldn't flush, or null.
458      */
459     FlushResultImpl(Result result, long flushSequenceId, String failureReason,
460       boolean wroteFlushMarker) {
461       this.result = result;
462       this.flushSequenceId = flushSequenceId;
463       this.failureReason = failureReason;
464       this.wroteFlushWalMarker = wroteFlushMarker;
465     }
466
467     /**
468      * Convenience method, the equivalent of checking if result is
469      * FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
470      * @return true if the memstores were flushed, else false.
471      */
472     @Override
473     public boolean isFlushSucceeded() {
474       return result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
475           .FLUSHED_COMPACTION_NEEDED;
476     }
477
478     /**
479      * Convenience method, the equivalent of checking if result is FLUSHED_COMPACTION_NEEDED.
480      * @return True if the flush requested a compaction, else false (doesn't even mean it flushed).
481      */
482     @Override
483     public boolean isCompactionNeeded() {
484       return result == Result.FLUSHED_COMPACTION_NEEDED;
485     }
486
487     @Override
488     public String toString() {
489       return new StringBuilder()
490         .append("flush result:").append(result).append(", ")
491         .append("failureReason:").append(failureReason).append(",")
492         .append("flush seq id").append(flushSequenceId).toString();
493     }
494
495     @Override
496     public Result getResult() {
497       return result;
498     }
499   }
500
501   /** A result object from prepare flush cache stage */
502   @VisibleForTesting
503   static class PrepareFlushResult {
504     final FlushResult result; // indicating a failure result from prepare
505     final TreeMap<byte[], StoreFlushContext> storeFlushCtxs;
506     final TreeMap<byte[], List<Path>> committedFiles;
507     final TreeMap<byte[], Long> storeFlushableSize;
508     final long startTime;
509     final long flushOpSeqId;
510     final long flushedSeqId;
511     final long totalFlushableSize;
512
513     /** Constructs an early exit case */
514     PrepareFlushResult(FlushResult result, long flushSeqId) {
515       this(result, null, null, null, Math.max(0, flushSeqId), 0, 0, 0);
516     }
517
518     /** Constructs a successful prepare flush result */
519     PrepareFlushResult(
520       TreeMap<byte[], StoreFlushContext> storeFlushCtxs,
521       TreeMap<byte[], List<Path>> committedFiles,
522       TreeMap<byte[], Long> storeFlushableSize, long startTime, long flushSeqId,
523       long flushedSeqId, long totalFlushableSize) {
524       this(null, storeFlushCtxs, committedFiles, storeFlushableSize, startTime,
525         flushSeqId, flushedSeqId, totalFlushableSize);
526     }
527
528     private PrepareFlushResult(
529       FlushResult result,
530       TreeMap<byte[], StoreFlushContext> storeFlushCtxs,
531       TreeMap<byte[], List<Path>> committedFiles,
532       TreeMap<byte[], Long> storeFlushableSize, long startTime, long flushSeqId,
533       long flushedSeqId, long totalFlushableSize) {
534       this.result = result;
535       this.storeFlushCtxs = storeFlushCtxs;
536       this.committedFiles = committedFiles;
537       this.storeFlushableSize = storeFlushableSize;
538       this.startTime = startTime;
539       this.flushOpSeqId = flushSeqId;
540       this.flushedSeqId = flushedSeqId;
541       this.totalFlushableSize = totalFlushableSize;
542     }
543
544     public FlushResult getResult() {
545       return this.result;
546     }
547   }
548
549   final WriteState writestate = new WriteState();
550
551   long memstoreFlushSize;
552   final long timestampSlop;
553   final long rowProcessorTimeout;
554
555   // Last flush time for each Store. Useful when we are flushing per column family.
556   private final ConcurrentMap<Store, Long> lastStoreFlushTimeMap =
557       new ConcurrentHashMap<Store, Long>();
558
559   final RegionServerServices rsServices;
560   private RegionServerAccounting rsAccounting;
561   private long flushCheckInterval;
562   // flushPerChanges prevents too many changes from accumulating in the memstore
563   private long flushPerChanges;
564   private long blockingMemStoreSize;
565   final long threadWakeFrequency;
566   // Used to guard closes
567   final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
568
569   // Stop updates lock
570   private final ReentrantReadWriteLock updatesLock = new ReentrantReadWriteLock();
571   private boolean splitRequest;
572   private byte[] explicitSplitPoint = null;
573
574   private final MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl();
575
576   // Coprocessor host
577   private RegionCoprocessorHost coprocessorHost;
578
579   private HTableDescriptor htableDescriptor = null;
580   private RegionSplitPolicy splitPolicy;
581   private FlushPolicy flushPolicy;
582
583   private final MetricsRegion metricsRegion;
584   private final MetricsRegionWrapperImpl metricsRegionWrapper;
585   private final Durability durability;
586   private final boolean regionStatsEnabled;
587   // Stores the replication scope of the various column families of the table
588   // that have a non-default scope
589   private final NavigableMap<byte[], Integer> replicationScope = new TreeMap<byte[], Integer>(
590       Bytes.BYTES_COMPARATOR);
591
592   /**
593    * HRegion constructor. This constructor should only be used for testing and
594    * extensions.  Instances of HRegion should be instantiated with the
595    * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
596    *
597    * @param tableDir qualified path of directory where region should be located,
598    * usually the table directory.
599    * @param wal The WAL is the outbound log for any updates to the HRegion.
600    * The wal file is a logfile from the previous execution that's
601    * custom-computed for this HRegion. The HRegionServer computes and sorts the
602    * appropriate wal info for this HRegion. If there is a previous wal file
603    * (implying that the HRegion has been written-to before), then read it from
604    * the supplied path.
605    * @param fs is the filesystem.
606    * @param confParam is global configuration settings.
607    * @param regionInfo - HRegionInfo that describes the region
609    * @param htd the table descriptor
610    * @param rsServices reference to {@link RegionServerServices} or null
611    * @deprecated Use other constructors.
612    */
613   @Deprecated
614   @VisibleForTesting
615   public HRegion(final Path tableDir, final WAL wal, final FileSystem fs,
616       final Configuration confParam, final HRegionInfo regionInfo,
617       final HTableDescriptor htd, final RegionServerServices rsServices) {
618     this(new HRegionFileSystem(confParam, fs, tableDir, regionInfo),
619       wal, confParam, htd, rsServices);
620   }
621
622   /**
623    * HRegion constructor. This constructor should only be used for testing and
624    * extensions.  Instances of HRegion should be instantiated with the
625    * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
626    *
627    * @param fs is the filesystem.
628    * @param wal The WAL is the outbound log for any updates to the HRegion.
629    * The wal file is a logfile from the previous execution that's
630    * custom-computed for this HRegion. The HRegionServer computes and sorts the
631    * appropriate wal info for this HRegion. If there is a previous wal file
632    * (implying that the HRegion has been written-to before), then read it from
633    * the supplied path.
634    * @param confParam is global configuration settings.
635    * @param htd the table descriptor
636    * @param rsServices reference to {@link RegionServerServices} or null
637    */
638   public HRegion(final HRegionFileSystem fs, final WAL wal, final Configuration confParam,
639       final HTableDescriptor htd, final RegionServerServices rsServices) {
640     if (htd == null) {
641       throw new IllegalArgumentException("Need table descriptor");
642     }
643
644     if (confParam instanceof CompoundConfiguration) {
645       throw new IllegalArgumentException("Need original base configuration");
646     }
647
648     this.wal = wal;
649     this.fs = fs;
650
651     // 'conf' renamed to 'confParam' b/c we use this.conf in the constructor
652     this.baseConf = confParam;
653     this.conf = new CompoundConfiguration()
654       .add(confParam)
655       .addStringMap(htd.getConfiguration())
656       .addBytesMap(htd.getValues());
657     this.flushCheckInterval = conf.getInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL,
658         DEFAULT_CACHE_FLUSH_INTERVAL);
659     this.flushPerChanges = conf.getLong(MEMSTORE_FLUSH_PER_CHANGES, DEFAULT_FLUSH_PER_CHANGES);
660     if (this.flushPerChanges > MAX_FLUSH_PER_CHANGES) {
661       throw new IllegalArgumentException(MEMSTORE_FLUSH_PER_CHANGES + " can not exceed "
662           + MAX_FLUSH_PER_CHANGES);
663     }
664     this.rowLockWaitDuration = conf.getInt("hbase.rowlock.wait.duration",
665                     DEFAULT_ROWLOCK_WAIT_DURATION);
666
667     this.isLoadingCfsOnDemandDefault = conf.getBoolean(LOAD_CFS_ON_DEMAND_CONFIG_KEY, true);
668     this.htableDescriptor = htd;
669     Set<byte[]> families = this.htableDescriptor.getFamiliesKeys();
670     for (byte[] family : families) {
671       if (!replicationScope.containsKey(family)) {
672         int scope = htd.getFamily(family).getScope();
673         // Only store those families that have NON-DEFAULT scope
674         if (scope != REPLICATION_SCOPE_LOCAL) {
675           // Do a copy before storing it here.
676           replicationScope.put(Bytes.copy(family), scope);
677         }
678       }
679     }
680     this.rsServices = rsServices;
681     this.threadWakeFrequency = conf.getLong(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000);
682     setHTableSpecificConf();
683     this.scannerReadPoints = new ConcurrentHashMap<RegionScanner, Long>();
684
685     this.busyWaitDuration = conf.getLong(
686       "hbase.busy.wait.duration", DEFAULT_BUSY_WAIT_DURATION);
687     this.maxBusyWaitMultiplier = conf.getInt("hbase.busy.wait.multiplier.max", 2);
688     if (busyWaitDuration * maxBusyWaitMultiplier <= 0L) {
689       throw new IllegalArgumentException("Invalid hbase.busy.wait.duration ("
690         + busyWaitDuration + ") or hbase.busy.wait.multiplier.max ("
691         + maxBusyWaitMultiplier + "). Their product should be positive");
692     }
693     this.maxBusyWaitDuration = conf.getLong("hbase.ipc.client.call.purge.timeout",
694       2 * HConstants.DEFAULT_HBASE_RPC_TIMEOUT);
695
696     /*
697      * timestamp.slop provides a server-side constraint on the timestamp. This
698      * assumes that you base your TS around currentTimeMillis(). In this case, we
699      * throw an error to the user if the user-specified TS is newer than now +
700      * slop. A value of LATEST_TIMESTAMP disables this check.
701      */
702     this.timestampSlop = conf.getLong(
703         "hbase.hregion.keyvalue.timestamp.slop.millisecs",
704         HConstants.LATEST_TIMESTAMP);
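    // Illustrative sketch; the value below is an assumption for demonstration only:
    // setting hbase.hregion.keyvalue.timestamp.slop.millisecs to 2000 would make the
    // server reject cells timestamped more than 2 seconds ahead of its clock, while the
    // default of LATEST_TIMESTAMP leaves the check disabled.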
705
706     /**
707      * Timeout for the process time in processRowsWithLocks().
708      * Use -1 to switch off time bound.
709      */
710     this.rowProcessorTimeout = conf.getLong(
711         "hbase.hregion.row.processor.timeout", DEFAULT_ROW_PROCESSOR_TIMEOUT);
712     this.durability = htd.getDurability() == Durability.USE_DEFAULT
713         ? DEFAULT_DURABILITY
714         : htd.getDurability();
715     if (rsServices != null) {
716       this.rsAccounting = this.rsServices.getRegionServerAccounting();
717       // don't initialize coprocessors if not running within a regionserver
718       // TODO: revisit if coprocessors should load in other cases
719       this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf);
720       this.metricsRegionWrapper = new MetricsRegionWrapperImpl(this);
721       this.metricsRegion = new MetricsRegion(this.metricsRegionWrapper);
722
723       Map<String, Region> recoveringRegions = rsServices.getRecoveringRegions();
724       String encodedName = getRegionInfo().getEncodedName();
725       if (recoveringRegions != null && recoveringRegions.containsKey(encodedName)) {
726         this.recovering = true;
727         recoveringRegions.put(encodedName, this);
728       }
729     } else {
730       this.metricsRegionWrapper = null;
731       this.metricsRegion = null;
732     }
733     if (LOG.isDebugEnabled()) {
734       // Write out region name as string and its encoded name.
735       LOG.debug("Instantiated " + this);
736     }
737
738     // by default, we allow writes against a region when it's in recovering
739     this.disallowWritesInRecovering =
740         conf.getBoolean(HConstants.DISALLOW_WRITES_IN_RECOVERING,
741           HConstants.DEFAULT_DISALLOW_WRITES_IN_RECOVERING_CONFIG);
742     configurationManager = Optional.absent();
743
744     // disable stats tracking for system tables, but check the config for everything else
745     this.regionStatsEnabled = htd.getTableName().getNamespaceAsString().equals(
746         NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR) ?
747           false :
748           conf.getBoolean(HConstants.ENABLE_CLIENT_BACKPRESSURE,
749               HConstants.DEFAULT_ENABLE_CLIENT_BACKPRESSURE);
750   }
751
752   void setHTableSpecificConf() {
753     if (this.htableDescriptor == null) return;
754     long flushSize = this.htableDescriptor.getMemStoreFlushSize();
755
756     if (flushSize <= 0) {
757       flushSize = conf.getLong(HConstants.HREGION_MEMSTORE_FLUSH_SIZE,
758         HTableDescriptor.DEFAULT_MEMSTORE_FLUSH_SIZE);
759     }
760     this.memstoreFlushSize = flushSize;
761     this.blockingMemStoreSize = this.memstoreFlushSize *
762         conf.getLong(HConstants.HREGION_MEMSTORE_BLOCK_MULTIPLIER,
763                 HConstants.DEFAULT_HREGION_MEMSTORE_BLOCK_MULTIPLIER);
764   }
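  // Illustrative arithmetic for setHTableSpecificConf() above, assuming, for demonstration,
  // a 128 MB memstore flush size and a block multiplier of 4: updates to this region would
  // start blocking once its memstore reaches roughly 128 MB * 4 = 512 MB.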
765
766   /**
767    * Initialize this region.
768    * Used only by tests and SplitTransaction to reopen the region.
769    * You should use createHRegion() or openHRegion()
770    * @return What the next sequence (edit) id should be.
771    * @throws IOException e
772    * @deprecated use HRegion.createHRegion() or HRegion.openHRegion()
773    */
774   @Deprecated
775   public long initialize() throws IOException {
776     return initialize(null);
777   }
778
779   /**
780    * Initialize this region.
781    *
782    * @param reporter Tickle every so often if initialize is taking a while.
783    * @return What the next sequence (edit) id should be.
784    * @throws IOException e
785    */
786   private long initialize(final CancelableProgressable reporter) throws IOException {
787
788     // Refuse to open the region if there is no column family in the table
789     if (htableDescriptor.getColumnFamilies().length == 0) {
790       throw new DoNotRetryIOException("Table " + htableDescriptor.getNameAsString() +
791           " should have at least one column family.");
792     }
793
794     MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this);
795     long nextSeqId = -1;
796     try {
797       nextSeqId = initializeRegionInternals(reporter, status);
798       return nextSeqId;
799     } finally {
800       // nextSeqId will be -1 if the initialization fails.
801       // Otherwise it will be at least 0.
802       if (nextSeqId == -1) {
803         status.abort("Exception during region " + getRegionInfo().getRegionNameAsString() +
804           " initialization.");
805       }
806     }
807   }
808
809   private long initializeRegionInternals(final CancelableProgressable reporter,
810       final MonitoredTask status) throws IOException {
811     if (coprocessorHost != null) {
812       status.setStatus("Running coprocessor pre-open hook");
813       coprocessorHost.preOpen();
814     }
815
816     // Write HRI to a file in case we need to recover hbase:meta
817     status.setStatus("Writing region info on filesystem");
818     fs.checkRegionInfoOnFilesystem();
819
820     // Initialize all the HStores
821     status.setStatus("Initializing all the Stores");
822     long maxSeqId = initializeStores(reporter, status);
823     this.mvcc.advanceTo(maxSeqId);
824     if (ServerRegionReplicaUtil.shouldReplayRecoveredEdits(this)) {
825       // Recover any edits if available.
826       maxSeqId = Math.max(maxSeqId,
827         replayRecoveredEditsIfAny(this.fs.getRegionDir(), maxSeqIdInStores, reporter, status));
828       // Make sure mvcc is up to max.
829       this.mvcc.advanceTo(maxSeqId);
830     }
831     this.lastReplayedOpenRegionSeqId = maxSeqId;
832
833     this.writestate.setReadOnly(ServerRegionReplicaUtil.isReadOnly(this));
834     this.writestate.flushRequested = false;
835     this.writestate.compacting.set(0);
836
837     if (this.writestate.writesEnabled) {
838       // Remove temporary data left over from old regions
839       status.setStatus("Cleaning up temporary data from old regions");
840       fs.cleanupTempDir();
841     }
842
843     if (this.writestate.writesEnabled) {
844       status.setStatus("Cleaning up detritus from prior splits");
845       // Get rid of any splits or merges that were lost in-progress.  Clean out
846       // these directories here on open.  We may be opening a region that was
847       // being split but we crashed in the middle of it all.
848       fs.cleanupAnySplitDetritus();
849       fs.cleanupMergesDir();
850     }
851
852     // Initialize split policy
853     this.splitPolicy = RegionSplitPolicy.create(this, conf);
854
855     // Initialize flush policy
856     this.flushPolicy = FlushPolicyFactory.create(this, conf);
857
858     long lastFlushTime = EnvironmentEdgeManager.currentTime();
859     for (Store store: stores.values()) {
860       this.lastStoreFlushTimeMap.put(store, lastFlushTime);
861     }
862
863     // Use the maximum of the log sequence id or that which was found in the stores
864     // (particularly if there are no recovered edits, the seqid will be -1).
865     long nextSeqid = maxSeqId;
866
867     // In distributedLogReplay mode, we don't know the last change sequence number because region
868     // is opened before recovery completes. So we add a safety bumper to avoid the new sequence
869     // numbers overlapping already-used sequence numbers.
870     if (this.writestate.writesEnabled) {
871       nextSeqid = WALSplitter.writeRegionSequenceIdFile(this.fs.getFileSystem(), this.fs
872           .getRegionDir(), nextSeqid, (this.recovering ? (this.flushPerChanges + 10000000) : 1));
873     } else {
874       nextSeqid++;
875     }
876
877     LOG.info("Onlined " + this.getRegionInfo().getShortNameToLog() +
878       "; next sequenceid=" + nextSeqid);
879
880     // A region can be reopened if it failed a split; reset flags
881     this.closing.set(false);
882     this.closed.set(false);
883
884     if (coprocessorHost != null) {
885       status.setStatus("Running coprocessor post-open hooks");
886       coprocessorHost.postOpen();
887     }
888
889     status.markComplete("Region opened successfully");
890     return nextSeqid;
891   }
892
893   /**
894    * Open all Stores.
895    * @param reporter
896    * @param status
897    * @return Highest sequenceId found in a Store.
898    * @throws IOException
899    */
900   private long initializeStores(final CancelableProgressable reporter, MonitoredTask status)
901   throws IOException {
902     // Load in all the HStores.
903
904     long maxSeqId = -1;
905     // initialized to -1 so that we pick up MemstoreTS from column families
906     long maxMemstoreTS = -1;
907
908     if (!htableDescriptor.getFamilies().isEmpty()) {
909       // initialize the thread pool for opening stores in parallel.
910       ThreadPoolExecutor storeOpenerThreadPool =
911         getStoreOpenAndCloseThreadPool("StoreOpener-" + this.getRegionInfo().getShortNameToLog());
912       CompletionService<HStore> completionService =
913         new ExecutorCompletionService<HStore>(storeOpenerThreadPool);
914
915       // initialize each store in parallel
916       for (final HColumnDescriptor family : htableDescriptor.getFamilies()) {
917         status.setStatus("Instantiating store for column family " + family);
918         completionService.submit(new Callable<HStore>() {
919           @Override
920           public HStore call() throws IOException {
921             return instantiateHStore(family);
922           }
923         });
924       }
925       boolean allStoresOpened = false;
926       boolean hasSloppyStores = false;
927       try {
928         for (int i = 0; i < htableDescriptor.getFamilies().size(); i++) {
929           Future<HStore> future = completionService.take();
930           HStore store = future.get();
931           this.stores.put(store.getFamily().getName(), store);
932           MemStore memStore = store.getMemStore();
933           if(memStore != null && memStore.isSloppy()) {
934             hasSloppyStores = true;
935           }
936
937           long storeMaxSequenceId = store.getMaxSequenceId();
938           maxSeqIdInStores.put(store.getColumnFamilyName().getBytes(),
939               storeMaxSequenceId);
940           if (maxSeqId == -1 || storeMaxSequenceId > maxSeqId) {
941             maxSeqId = storeMaxSequenceId;
942           }
943           long maxStoreMemstoreTS = store.getMaxMemstoreTS();
944           if (maxStoreMemstoreTS > maxMemstoreTS) {
945             maxMemstoreTS = maxStoreMemstoreTS;
946           }
947         }
948         allStoresOpened = true;
949         if(hasSloppyStores) {
950           htableDescriptor.setFlushPolicyClassName(FlushNonSloppyStoresFirstPolicy.class
951               .getName());
952           LOG.info("Setting FlushNonSloppyStoresFirstPolicy for the region=" + this);
953         }
954       } catch (InterruptedException e) {
955         throw (InterruptedIOException)new InterruptedIOException().initCause(e);
956       } catch (ExecutionException e) {
957         throw new IOException(e.getCause());
958       } finally {
959         storeOpenerThreadPool.shutdownNow();
960         if (!allStoresOpened) {
961           // something went wrong, close all opened stores
962           LOG.error("Could not initialize all stores for the region=" + this);
963           for (Store store : this.stores.values()) {
964             try {
965               store.close();
966             } catch (IOException e) {
967               LOG.warn(e.getMessage());
968             }
969           }
970         }
971       }
972     }
973     return Math.max(maxSeqId, maxMemstoreTS + 1);
974   }
975
976   private void initializeWarmup(final CancelableProgressable reporter) throws IOException {
977     MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this);
978     // Initialize all the HStores
979     status.setStatus("Warming up all the Stores");
980     try {
981       initializeStores(reporter, status);
982     } finally {
983       status.markComplete("Done warming up.");
984     }
985   }
986
987   /**
988    * @return Map of StoreFiles by column family
989    */
990   private NavigableMap<byte[], List<Path>> getStoreFiles() {
991     NavigableMap<byte[], List<Path>> allStoreFiles =
992       new TreeMap<byte[], List<Path>>(Bytes.BYTES_COMPARATOR);
993     for (Store store: getStores()) {
994       Collection<StoreFile> storeFiles = store.getStorefiles();
995       if (storeFiles == null) continue;
996       List<Path> storeFileNames = new ArrayList<Path>();
997       for (StoreFile storeFile: storeFiles) {
998         storeFileNames.add(storeFile.getPath());
999       }
1000       allStoreFiles.put(store.getFamily().getName(), storeFileNames);
1001     }
1002     return allStoreFiles;
1003   }
1004
1005   private void writeRegionOpenMarker(WAL wal, long openSeqId) throws IOException {
1006     Map<byte[], List<Path>> storeFiles = getStoreFiles();
1007     RegionEventDescriptor regionOpenDesc = ProtobufUtil.toRegionEventDescriptor(
1008       RegionEventDescriptor.EventType.REGION_OPEN, getRegionInfo(), openSeqId,
1009       getRegionServerServices().getServerName(), storeFiles);
1010     WALUtil.writeRegionEventMarker(wal, getReplicationScope(), getRegionInfo(), regionOpenDesc,
1011         mvcc);
1012   }
1013
1014   private void writeRegionCloseMarker(WAL wal) throws IOException {
1015     Map<byte[], List<Path>> storeFiles = getStoreFiles();
1016     RegionEventDescriptor regionEventDesc = ProtobufUtil.toRegionEventDescriptor(
1017       RegionEventDescriptor.EventType.REGION_CLOSE, getRegionInfo(), mvcc.getReadPoint(),
1018       getRegionServerServices().getServerName(), storeFiles);
1019     WALUtil.writeRegionEventMarker(wal, getReplicationScope(), getRegionInfo(), regionEventDesc,
1020         mvcc);
1021
1022     // Store SeqId in HDFS when a region closes
1023     // We check that the region folder exists because many tests delete the table folder while a
1024     // table is still online.
1025     if (this.fs.getFileSystem().exists(this.fs.getRegionDir())) {
1026       WALSplitter.writeRegionSequenceIdFile(this.fs.getFileSystem(), this.fs.getRegionDir(),
1027         mvcc.getReadPoint(), 0);
1028     }
1029   }
1030
1031   /**
1032    * @return True if this region has references.
1033    */
1034   public boolean hasReferences() {
1035     for (Store store : this.stores.values()) {
1036       if (store.hasReferences()) return true;
1037     }
1038     return false;
1039   }
1040
1041   public void blockUpdates() {
1042     this.updatesLock.writeLock().lock();
1043   }
1044
1045   public void unblockUpdates() {
1046     this.updatesLock.writeLock().unlock();
1047   }
1048
1049   @Override
1050   public HDFSBlocksDistribution getHDFSBlocksDistribution() {
1051     HDFSBlocksDistribution hdfsBlocksDistribution =
1052       new HDFSBlocksDistribution();
1053     synchronized (this.stores) {
1054       for (Store store : this.stores.values()) {
1055         Collection<StoreFile> storeFiles = store.getStorefiles();
1056         if (storeFiles == null) continue;
1057         for (StoreFile sf : storeFiles) {
1058           HDFSBlocksDistribution storeFileBlocksDistribution =
1059             sf.getHDFSBlockDistribution();
1060           hdfsBlocksDistribution.add(storeFileBlocksDistribution);
1061         }
1062       }
1063     }
1064     return hdfsBlocksDistribution;
1065   }
1066
1067   /**
1068    * This is a helper function to compute HDFS block distribution on demand
1069    * @param conf configuration
1070    * @param tableDescriptor HTableDescriptor of the table
1071    * @param regionInfo HRegionInfo that describes the region
1072    * @return The HDFS blocks distribution for the given region.
1073    * @throws IOException
1074    */
1075   public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf,
1076       final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo) throws IOException {
1077     Path tablePath = FSUtils.getTableDir(FSUtils.getRootDir(conf), tableDescriptor.getTableName());
1078     return computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo, tablePath);
1079   }
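  // Illustrative usage sketch, not part of the original source ("serverHostname" is a
  // hypothetical variable):
  //   HDFSBlocksDistribution dist =
  //       HRegion.computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo);
  //   float locality = dist.getBlockLocalityIndex(serverHostname);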
1080
1081   /**
1082    * This is a helper function to compute HDFS block distribution on demand
1083    * @param conf configuration
1084    * @param tableDescriptor HTableDescriptor of the table
1085    * @param regionInfo HRegionInfo that describes the region
1086    * @param tablePath the table directory
1087    * @return The HDFS blocks distribution for the given region.
1088    * @throws IOException
1089    */
1090   public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf,
1091       final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo,  Path tablePath)
1092       throws IOException {
1093     HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution();
1094     FileSystem fs = tablePath.getFileSystem(conf);
1095
1096     HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tablePath, regionInfo);
1097     for (HColumnDescriptor family: tableDescriptor.getFamilies()) {
1098       Collection<StoreFileInfo> storeFiles = regionFs.getStoreFiles(family.getNameAsString());
1099       if (storeFiles == null) continue;
1100       for (StoreFileInfo storeFileInfo : storeFiles) {
1101         try {
1102           hdfsBlocksDistribution.add(storeFileInfo.computeHDFSBlocksDistribution(fs));
1103         } catch (IOException ioe) {
1104           LOG.warn("Error getting hdfs block distribution for " + storeFileInfo);
1105         }
1106       }
1107     }
1108     return hdfsBlocksDistribution;
1109   }
1110
1111   /**
1112    * Increase the size of the memstore in this region and the size of the global
1113    * memstore.
1114    * @return the size of memstore in this region
1115    */
1116   public long addAndGetGlobalMemstoreSize(long memStoreSize) {
1117     if (this.rsAccounting != null) {
1118       rsAccounting.addAndGetGlobalMemstoreSize(memStoreSize);
1119     }
1120     return this.memstoreSize.addAndGet(memStoreSize);
1121   }
1122
1123   @Override
1124   public HRegionInfo getRegionInfo() {
1125     return this.fs.getRegionInfo();
1126   }
1127
1128   /**
1129    * @return Instance of {@link RegionServerServices} used by this HRegion.
1130    * Can be null.
1131    */
1132   RegionServerServices getRegionServerServices() {
1133     return this.rsServices;
1134   }
1135
1136   @Override
1137   public long getReadRequestsCount() {
1138     return readRequestsCount.get();
1139   }
1140
1141   @Override
1142   public void updateReadRequestsCount(long i) {
1143     readRequestsCount.add(i);
1144   }
1145
1146   @Override
1147   public long getFilteredReadRequestsCount() {
1148     return filteredReadRequestsCount.get();
1149   }
1150
1151   @Override
1152   public long getWriteRequestsCount() {
1153     return writeRequestsCount.get();
1154   }
1155
1156   @Override
1157   public void updateWriteRequestsCount(long i) {
1158     writeRequestsCount.add(i);
1159   }
1160
1161   @Override
1162   public long getMemstoreSize() {
1163     return memstoreSize.get();
1164   }
1165
1166   @Override
1167   public RegionServicesForStores getRegionServicesForStores() {
1168     return regionServicesForStores;
1169   }
1170
1171   @Override
1172   public long getNumMutationsWithoutWAL() {
1173     return numMutationsWithoutWAL.get();
1174   }
1175
1176   @Override
1177   public long getDataInMemoryWithoutWAL() {
1178     return dataInMemoryWithoutWAL.get();
1179   }
1180
1181   @Override
1182   public long getBlockedRequestsCount() {
1183     return blockedRequestsCount.get();
1184   }
1185
1186   @Override
1187   public long getCheckAndMutateChecksPassed() {
1188     return checkAndMutateChecksPassed.get();
1189   }
1190
1191   @Override
1192   public long getCheckAndMutateChecksFailed() {
1193     return checkAndMutateChecksFailed.get();
1194   }
1195
1196   @Override
1197   public MetricsRegion getMetrics() {
1198     return metricsRegion;
1199   }
1200
1201   @Override
1202   public boolean isClosed() {
1203     return this.closed.get();
1204   }
1205
1206   @Override
1207   public boolean isClosing() {
1208     return this.closing.get();
1209   }
1210
1211   @Override
1212   public boolean isReadOnly() {
1213     return this.writestate.isReadOnly();
1214   }
1215
1216   /**
1217    * Set the recovering state of the current region.
1218    */
1219   public void setRecovering(boolean newState) {
1220     boolean wasRecovering = this.recovering;
1221     // Before we flip the recovering switch (enabling reads) we should write the region open
1222     // event to WAL if needed
1223     if (wal != null && getRegionServerServices() != null && !writestate.readOnly
1224         && wasRecovering && !newState) {
1225
1226       // force a flush only if region replication is set up for this region. Otherwise no need.
1227       boolean forceFlush = getTableDesc().getRegionReplication() > 1;
1228
1229       MonitoredTask status = TaskMonitor.get().createStatus("Recovering region " + this);
1230
1231       try {
1232         // force a flush first
1233         if (forceFlush) {
1234           status.setStatus("Flushing region " + this + " because recovery is finished");
1235           internalFlushcache(status);
1236         }
1237
1238         status.setStatus("Writing region open event marker to WAL because recovery is finished");
1239         try {
1240           long seqId = openSeqNum;
1241           // obtain a new seqId because we possibly have writes and flushes on top of openSeqNum
1242           if (wal != null) {
1243             seqId = getNextSequenceId(wal);
1244           }
1245           writeRegionOpenMarker(wal, seqId);
1246         } catch (IOException e) {
1247           // We cannot rethrow this exception since we are being called from the zk thread. The
1248           // region has already opened. In this case we log the error, but continue
1249           LOG.warn(getRegionInfo().getEncodedName() + " : was not able to write region opening "
1250               + "event to WAL, continuing", e);
1251         }
1252       } catch (IOException ioe) {
1253         // Distributed log replay semantics does not necessarily require a flush, since the replayed
1254         // data is already written again in the WAL. So failed flush should be fine.
1255         LOG.warn(getRegionInfo().getEncodedName() + " : was not able to flush "
1256             + "event to WAL, continuing", ioe);
1257       } finally {
1258         status.cleanup();
1259       }
1260     }
1261
1262     this.recovering = newState;
1263     if (wasRecovering && !recovering) {
1264       // Call only when wal replay is over.
1265       coprocessorHost.postLogReplay();
1266     }
1267   }
1268
1269   @Override
1270   public boolean isRecovering() {
1271     return this.recovering;
1272   }
1273
1274   @Override
1275   public boolean isAvailable() {
1276     return !isClosed() && !isClosing();
1277   }
1278
1279   /** @return true if region is splittable */
1280   public boolean isSplittable() {
1281     return isAvailable() && !hasReferences();
1282   }
1283
1284   /**
1285    * @return true if region is mergeable
1286    */
1287   public boolean isMergeable() {
1288     if (!isAvailable()) {
1289       LOG.debug("Region " + getRegionInfo().getRegionNameAsString()
1290           + " is not mergeable because it is closing or closed");
1291       return false;
1292     }
1293     if (hasReferences()) {
1294       LOG.debug("Region " + getRegionInfo().getRegionNameAsString()
1295           + " is not mergeable because it has references");
1296       return false;
1297     }
1298
1299     return true;
1300   }
1301
1302   public boolean areWritesEnabled() {
1303     synchronized(this.writestate) {
1304       return this.writestate.writesEnabled;
1305     }
1306   }
1307
1308   @VisibleForTesting
1309   public MultiVersionConcurrencyControl getMVCC() {
1310     return mvcc;
1311   }
1312
1313   @Override
1314   public long getMaxFlushedSeqId() {
1315     return maxFlushedSeqId;
1316   }
1317
1318   @Override
1319   public long getReadPoint(IsolationLevel isolationLevel) {
1320     if (isolationLevel != null && isolationLevel == IsolationLevel.READ_UNCOMMITTED) {
1321       // This scan can read even uncommitted transactions
1322       return Long.MAX_VALUE;
1323     }
1324     return mvcc.getReadPoint();
1325   }
1326
1327   @Override
1328   public long getReadpoint(IsolationLevel isolationLevel) {
1329     return getReadPoint(isolationLevel);
1330   }
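       // Editor's note (illustrative sketch, not part of the original source): how the two
       // isolation levels map onto read points. "region" is a hypothetical reference to an
       // open HRegion; the values shown are what getReadPoint(...) above returns.
       //
       //   long committed = region.getReadPoint(IsolationLevel.READ_COMMITTED);     // mvcc read point
       //   long uncommitted = region.getReadPoint(IsolationLevel.READ_UNCOMMITTED); // Long.MAX_VALUE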
1331
1332   @Override
1333   public boolean isLoadingCfsOnDemandDefault() {
1334     return this.isLoadingCfsOnDemandDefault;
1335   }
1336
1337   /**
1338    * Close down this HRegion.  Flush the cache, shut down each HStore, don't
1339    * service any more calls.
1340    *
1341    * <p>This method could take some time to execute, so don't call it from a
1342    * time-sensitive thread.
1343    *
1344    * @return Map of all the storage files that the HRegion's component
1345    * HStores make use of, keyed by column family name. Returns null if the
1346    * region is already closed or if it is judged that it should not close.
1347    *
1348    * @throws IOException e
1349    * @throws DroppedSnapshotException Thrown when replay of wal is required
1350    * because a Snapshot was not properly persisted. The region is put in closing mode, and the
1351    * caller MUST abort after this.
1352    */
1353   public Map<byte[], List<StoreFile>> close() throws IOException {
1354     return close(false);
1355   }
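       // Editor's sketch (hypothetical helper, not part of the original source): one way a caller
       // could drive close() and report the store files that were released. Per the javadoc above,
       // a real caller must abort on DroppedSnapshotException; that handling is omitted for brevity.
       private static void exampleClose(HRegion region) throws IOException {
         Map<byte[], List<StoreFile>> closedFiles = region.close();
         if (closedFiles == null) {
           // the region was already closed, or it was judged that it should not close
           LOG.info("Region " + region + " was not closed by this call");
           return;
         }
         for (Map.Entry<byte[], List<StoreFile>> entry : closedFiles.entrySet()) {
           LOG.info("Released " + entry.getValue().size() + " store file(s) for family "
               + Bytes.toString(entry.getKey()));
         }
       }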
1356
1357   private final Object closeLock = new Object();
1358
1359   /** Conf key for the periodic flush interval */
1360   public static final String MEMSTORE_PERIODIC_FLUSH_INTERVAL =
1361       "hbase.regionserver.optionalcacheflushinterval";
1362   /** Default interval for the periodic memstore flush (one hour) */
1363   public static final int DEFAULT_CACHE_FLUSH_INTERVAL = 3600000;
1364   /** Default interval for System tables memstore flush */
1365   public static final int SYSTEM_CACHE_FLUSH_INTERVAL = 300000; // 5 minutes
1366
1367   /** Conf key to force a flush if there are already enough changes for one region in memstore */
1368   public static final String MEMSTORE_FLUSH_PER_CHANGES =
1369       "hbase.regionserver.flush.per.changes";
1370   public static final long DEFAULT_FLUSH_PER_CHANGES = 30000000; // 30 million
1371   /**
1372    * MAX_FLUSH_PER_CHANGES is large enough because each KeyValue carries 20+ bytes of overhead;
1373    * even 1G (one billion) empty KVs would therefore occupy at least 20GB of memstore in a single region.
1374    */
1375   public static final long MAX_FLUSH_PER_CHANGES = 1000000000; // 1G
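       // Editor's sketch (hypothetical values, not part of the original source): how the flush
       // related knobs above might be tuned through the Configuration. The numbers are examples
       // only; the keys are the constants defined above.
       private static void exampleFlushTuning(Configuration conf) {
         // flush memstores that have not flushed for 30 minutes instead of the one hour default
         conf.setInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL, 30 * 60 * 1000);
         // force a flush after ~10 million changes instead of the 30 million default,
         // staying well below MAX_FLUSH_PER_CHANGES
         conf.setLong(MEMSTORE_FLUSH_PER_CHANGES, 10000000L);
       }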
1376
1377   /**
1378    * Close down this HRegion.  Flush the cache unless the abort parameter is true,
1379    * shut down each HStore, and don't service any more calls.
1380    *
1381    * This method could take some time to execute, so don't call it from a
1382    * time-sensitive thread.
1383    *
1384    * @param abort true if server is aborting (only during testing)
1385    * @return Map of all the storage files that the HRegion's component
1386    * HStores make use of, keyed by column family name.  Can be null if
1387    * we are not to close at this time or we are already closed.
1388    *
1389    * @throws IOException e
1390    * @throws DroppedSnapshotException Thrown when replay of wal is required
1391    * because a Snapshot was not properly persisted. The region is put in closing mode, and the
1392    * caller MUST abort after this.
1393    */
1394   public Map<byte[], List<StoreFile>> close(final boolean abort) throws IOException {
1395     // Only allow one thread to close at a time. Serialize closers so that
1396     // concurrent close attempts block on one another instead of racing.
1397     MonitoredTask status = TaskMonitor.get().createStatus(
1398         "Closing region " + this +
1399         (abort ? " due to abort" : ""));
1400
1401     status.setStatus("Waiting for close lock");
1402     try {
1403       synchronized (closeLock) {
1404         return doClose(abort, status);
1405       }
1406     } finally {
1407       status.cleanup();
1408     }
1409   }
1410
1411   /**
1412    * Exposed for some very specific unit tests.
1413    */
1414   @VisibleForTesting
1415   public void setClosing(boolean closing) {
1416     this.closing.set(closing);
1417   }
1418
1419   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UL_UNRELEASED_LOCK_EXCEPTION_PATH",
1420       justification="I think FindBugs is confused")
1421   private Map<byte[], List<StoreFile>> doClose(final boolean abort, MonitoredTask status)
1422       throws IOException {
1423     if (isClosed()) {
1424       LOG.warn("Region " + this + " already closed");
1425       return null;
1426     }
1427
1428     if (coprocessorHost != null) {
1429       status.setStatus("Running coprocessor pre-close hooks");
1430       this.coprocessorHost.preClose(abort);
1431     }
1432
1433     status.setStatus("Disabling compactions and flushes for region");
1434     boolean canFlush = true;
1435     synchronized (writestate) {
1436       // Disable compacting and flushing by background threads for this
1437       // region.
1438       canFlush = !writestate.readOnly;
1439       writestate.writesEnabled = false;
1440       LOG.debug("Closing " + this + ": disabling compactions & flushes");
1441       waitForFlushesAndCompactions();
1442     }
1443     // If we were not just flushing, is it worth doing a preflush... one
1444     // that will clear out the bulk of the memstore before we put up
1445     // the close flag?
1446     if (!abort && worthPreFlushing() && canFlush) {
1447       status.setStatus("Pre-flushing region before close");
1448       LOG.info("Running close preflush of " + getRegionInfo().getRegionNameAsString());
1449       try {
1450         internalFlushcache(status);
1451       } catch (IOException ioe) {
1452         // Failed to flush the region. Keep going.
1453         status.setStatus("Failed pre-flush " + this + "; " + ioe.getMessage());
1454       }
1455     }
1456
1457     // block waiting for the lock for closing
1458     lock.writeLock().lock(); // FindBugs: Complains UL_UNRELEASED_LOCK_EXCEPTION_PATH but seems fine
1459     this.closing.set(true);
1460     status.setStatus("Disabling writes for close");
1461     try {
1462       if (this.isClosed()) {
1463         status.abort("Already closed by another process");
1464         // SplitTransaction handles the null
1465         return null;
1466       }
1467       LOG.debug("Updates disabled for region " + this);
1468       // Don't flush the cache if we are aborting
1469       if (!abort && canFlush) {
1470         int failedFlushCount = 0;
1471         int flushCount = 0;
1472         long tmp = 0;
1473         long remainingSize = this.memstoreSize.get();
1474         while (remainingSize > 0) {
1475           try {
1476             internalFlushcache(status);
1477             if (flushCount > 0) {
1478               LOG.info("Running extra flush, " + flushCount +
1479                   " (carrying snapshot?) " + this);
1480             }
1481             flushCount++;
1482             tmp = this.memstoreSize.get();
1483             if (tmp >= remainingSize) {
1484               failedFlushCount++;
1485             }
1486             remainingSize = tmp;
1487             if (failedFlushCount > 5) {
1488               // If we failed 5 times and are unable to clear memory, abort
1489               // so we do not lose data
1490               throw new DroppedSnapshotException("Failed clearing memory after " +
1491                   flushCount + " attempts on region: " +
1492                   Bytes.toStringBinary(getRegionInfo().getRegionName()));
1493             }
1494           } catch (IOException ioe) {
1495             status.setStatus("Failed flush " + this + ", putting online again");
1496             synchronized (writestate) {
1497               writestate.writesEnabled = true;
1498             }
1499             // Have to throw to upper layers.  I can't abort server from here.
1500             throw ioe;
1501           }
1502         }
1503       }
1504
1505       Map<byte[], List<StoreFile>> result =
1506         new TreeMap<byte[], List<StoreFile>>(Bytes.BYTES_COMPARATOR);
1507       if (!stores.isEmpty()) {
1508         // initialize the thread pool for closing stores in parallel.
1509         ThreadPoolExecutor storeCloserThreadPool =
1510           getStoreOpenAndCloseThreadPool("StoreCloserThread-" +
1511             getRegionInfo().getRegionNameAsString());
1512         CompletionService<Pair<byte[], Collection<StoreFile>>> completionService =
1513           new ExecutorCompletionService<Pair<byte[], Collection<StoreFile>>>(storeCloserThreadPool);
1514
1515         // close each store in parallel
1516         for (final Store store : stores.values()) {
1517           long flushableSize = store.getFlushableSize();
1518           if (!(abort || flushableSize == 0 || writestate.readOnly)) {
1519             if (getRegionServerServices() != null) {
1520               getRegionServerServices().abort("Assertion failed while closing store "
1521                 + getRegionInfo().getRegionNameAsString() + " " + store
1522                 + ". flushableSize expected=0, actual= " + flushableSize
1523                 + ". Current memstoreSize=" + getMemstoreSize() + ". Maybe a coprocessor "
1524                 + "operation failed and left the memstore in a partially updated state.", null);
1525             }
1526           }
1527           completionService
1528               .submit(new Callable<Pair<byte[], Collection<StoreFile>>>() {
1529                 @Override
1530                 public Pair<byte[], Collection<StoreFile>> call() throws IOException {
1531                   return new Pair<byte[], Collection<StoreFile>>(
1532                     store.getFamily().getName(), store.close());
1533                 }
1534               });
1535         }
1536         try {
1537           for (int i = 0; i < stores.size(); i++) {
1538             Future<Pair<byte[], Collection<StoreFile>>> future = completionService.take();
1539             Pair<byte[], Collection<StoreFile>> storeFiles = future.get();
1540             List<StoreFile> familyFiles = result.get(storeFiles.getFirst());
1541             if (familyFiles == null) {
1542               familyFiles = new ArrayList<StoreFile>();
1543               result.put(storeFiles.getFirst(), familyFiles);
1544             }
1545             familyFiles.addAll(storeFiles.getSecond());
1546           }
1547         } catch (InterruptedException e) {
1548           throw (InterruptedIOException)new InterruptedIOException().initCause(e);
1549         } catch (ExecutionException e) {
1550           throw new IOException(e.getCause());
1551         } finally {
1552           storeCloserThreadPool.shutdownNow();
1553         }
1554       }
1555
1556       status.setStatus("Writing region close event to WAL");
1557       if (!abort && wal != null && getRegionServerServices() != null && !writestate.readOnly) {
1558         writeRegionCloseMarker(wal);
1559       }
1560
1561       this.closed.set(true);
1562       if (!canFlush) {
1563         addAndGetGlobalMemstoreSize(-memstoreSize.get());
1564       } else if (memstoreSize.get() != 0) {
1565         LOG.error("Memstore size is " + memstoreSize.get());
1566       }
1567       if (coprocessorHost != null) {
1568         status.setStatus("Running coprocessor post-close hooks");
1569         this.coprocessorHost.postClose(abort);
1570       }
1571       if (this.metricsRegion != null) {
1572         this.metricsRegion.close();
1573       }
1574       if (this.metricsRegionWrapper != null) {
1575         Closeables.closeQuietly(this.metricsRegionWrapper);
1576       }
1577       // stop the Compacted hfile discharger
1578       if (this.compactedFileDischarger != null) this.compactedFileDischarger.cancel(true);
1579
1580       status.markComplete("Closed");
1581       LOG.info("Closed " + this);
1582       return result;
1583     } finally {
1584       lock.writeLock().unlock();
1585     }
1586   }
1587
1588   @Override
1589   public void waitForFlushesAndCompactions() {
1590     synchronized (writestate) {
1591       if (this.writestate.readOnly) {
1592         // we should not wait for replayed flushes if we are read only (for example, in case the
1593         // region is a secondary replica).
1594         return;
1595       }
1596       boolean interrupted = false;
1597       try {
1598         while (writestate.compacting.get() > 0 || writestate.flushing) {
1599           LOG.debug("waiting for " + writestate.compacting + " compactions"
1600             + (writestate.flushing ? " & cache flush" : "") + " to complete for region " + this);
1601           try {
1602             writestate.wait();
1603           } catch (InterruptedException iex) {
1604             // essentially ignore and propagate the interrupt back up
1605             LOG.warn("Interrupted while waiting");
1606             interrupted = true;
1607           }
1608         }
1609       } finally {
1610         if (interrupted) {
1611           Thread.currentThread().interrupt();
1612         }
1613       }
1614     }
1615   }
1616
1617   protected ThreadPoolExecutor getStoreOpenAndCloseThreadPool(
1618       final String threadNamePrefix) {
1619     int numStores = Math.max(1, this.htableDescriptor.getFamilies().size());
1620     int maxThreads = Math.min(numStores,
1621         conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1622             HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX));
1623     return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1624   }
1625
1626   protected ThreadPoolExecutor getStoreFileOpenAndCloseThreadPool(
1627       final String threadNamePrefix) {
1628     int numStores = Math.max(1, this.htableDescriptor.getFamilies().size());
1629     int maxThreads = Math.max(1,
1630         conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1631             HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX)
1632             / numStores);
1633     return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1634   }
1635
1636   static ThreadPoolExecutor getOpenAndCloseThreadPool(int maxThreads,
1637       final String threadNamePrefix) {
1638     return Threads.getBoundedCachedThreadPool(maxThreads, 30L, TimeUnit.SECONDS,
1639       new ThreadFactory() {
1640         private int count = 1;
1641
1642         @Override
1643         public Thread newThread(Runnable r) {
1644           return new Thread(r, threadNamePrefix + "-" + count++);
1645         }
1646       });
1647   }
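       // Editor's note (illustrative arithmetic, assumed values): with 4 column families and
       // HSTORE_OPEN_AND_CLOSE_THREADS_MAX configured to 10, the store open/close pool gets
       // min(4, 10) = 4 threads, while the per-store file pool gets max(1, 10 / 4) = 2 threads
       // for each store's files.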
1648
1649   /**
1650    * @return True if it's worth doing a flush before we put up the close flag.
1651    */
1652   private boolean worthPreFlushing() {
1653     return this.memstoreSize.get() >
1654       this.conf.getLong("hbase.hregion.preclose.flush.size", 1024 * 1024 * 5);
1655   }
1656
1657   //////////////////////////////////////////////////////////////////////////////
1658   // HRegion accessors
1659   //////////////////////////////////////////////////////////////////////////////
1660
1661   @Override
1662   public HTableDescriptor getTableDesc() {
1663     return this.htableDescriptor;
1664   }
1665
1666   /** @return WAL in use for this region */
1667   public WAL getWAL() {
1668     return this.wal;
1669   }
1670
1671   /**
1672    * @return split policy for this region.
1673    */
1674   public RegionSplitPolicy getSplitPolicy() {
1675     return this.splitPolicy;
1676   }
1677
1678   /**
1679    * A split takes the config from the parent region & passes it to the daughter
1680    * region's constructor. If 'conf' was passed, you would end up using the HTD
1681    * of the parent region in addition to the new daughter HTD. Pass 'baseConf'
1682    * to the daughter regions to avoid this tricky dedupe problem.
1683    * @return Configuration object
1684    */
1685   Configuration getBaseConf() {
1686     return this.baseConf;
1687   }
1688
1689   /** @return {@link FileSystem} being used by this region */
1690   public FileSystem getFilesystem() {
1691     return fs.getFileSystem();
1692   }
1693
1694   /** @return the {@link HRegionFileSystem} used by this region */
1695   public HRegionFileSystem getRegionFileSystem() {
1696     return this.fs;
1697   }
1698
1699   @Override
1700   public long getEarliestFlushTimeForAllStores() {
1701     return Collections.min(lastStoreFlushTimeMap.values());
1702   }
1703
1704   @Override
1705   public long getOldestHfileTs(boolean majorCompactionOnly) throws IOException {
1706     long result = Long.MAX_VALUE;
1707     for (Store store : getStores()) {
1708       Collection<StoreFile> storeFiles = store.getStorefiles();
1709       if (storeFiles == null) continue;
1710       for (StoreFile file : storeFiles) {
1711         StoreFileReader sfReader = file.getReader();
1712         if (sfReader == null) continue;
1713         HFile.Reader reader = sfReader.getHFileReader();
1714         if (reader == null) continue;
1715         if (majorCompactionOnly) {
1716           byte[] val = reader.loadFileInfo().get(StoreFile.MAJOR_COMPACTION_KEY);
1717           if (val == null) continue;
1718           if (!Bytes.toBoolean(val)) {
1719             continue;
1720           }
1721         }
1722         result = Math.min(result, reader.getFileContext().getFileCreateTime());
1723       }
1724     }
1725     return result == Long.MAX_VALUE ? 0 : result;
1726   }
1727
1728   RegionLoad.Builder setCompleteSequenceId(RegionLoad.Builder regionLoadBldr) {
1729     long lastFlushOpSeqIdLocal = this.lastFlushOpSeqId;
1730     byte[] encodedRegionName = this.getRegionInfo().getEncodedNameAsBytes();
1731     regionLoadBldr.clearStoreCompleteSequenceId();
1732     for (byte[] familyName : this.stores.keySet()) {
1733       long earliest = this.wal.getEarliestMemstoreSeqNum(encodedRegionName, familyName);
1734       // Subtract 1 to go earlier than the current oldest, unflushed edit in the memstore; this will
1735       // give us a sequence id that is for sure flushed. We want edit replay to start after this
1736       // sequence id in this region. If NO_SEQNUM, use the region's maximum flush id.
1737       long csid = (earliest == HConstants.NO_SEQNUM)? lastFlushOpSeqIdLocal: earliest - 1;
1738       regionLoadBldr.addStoreCompleteSequenceId(StoreSequenceId.
1739         newBuilder().setFamilyName(ByteString.copyFrom(familyName)).setSequenceId(csid).build());
1740     }
1741     return regionLoadBldr.setCompleteSequenceId(getMaxFlushedSeqId());
1742   }
1743
1744   //////////////////////////////////////////////////////////////////////////////
1745   // HRegion maintenance.
1746   //
1747   // These methods are meant to be called periodically by the HRegionServer for
1748   // upkeep.
1749   //////////////////////////////////////////////////////////////////////////////
1750
1751   /** @return the size of the largest HStore. */
1752   public long getLargestHStoreSize() {
1753     long size = 0;
1754     for (Store h : stores.values()) {
1755       long storeSize = h.getSize();
1756       if (storeSize > size) {
1757         size = storeSize;
1758       }
1759     }
1760     return size;
1761   }
1762
1763   /*
1764    * Do preparation for pending compaction.
1765    * @throws IOException
1766    */
1767   protected void doRegionCompactionPrep() throws IOException {
1768   }
1769
1770   @Override
1771   public void triggerMajorCompaction() throws IOException {
1772     for (Store s : getStores()) {
1773       s.triggerMajorCompaction();
1774     }
1775   }
1776
1777   @Override
1778   public void compact(final boolean majorCompaction) throws IOException {
1779     if (majorCompaction) {
1780       triggerMajorCompaction();
1781     }
1782     for (Store s : getStores()) {
1783       CompactionContext compaction = s.requestCompaction();
1784       if (compaction != null) {
1785         ThroughputController controller = null;
1786         if (rsServices != null) {
1787           controller = CompactionThroughputControllerFactory.create(rsServices, conf);
1788         }
1789         if (controller == null) {
1790           controller = NoLimitThroughputController.INSTANCE;
1791         }
1792         compact(compaction, s, controller, null);
1793       }
1794     }
1795   }
1796
1797   /**
1798    * This is a helper function that compacts all the stores synchronously.
1799    * It is used by utilities and testing.
1800    *
1801    * @throws IOException e
1802    */
1803   public void compactStores() throws IOException {
1804     for (Store s : getStores()) {
1805       CompactionContext compaction = s.requestCompaction();
1806       if (compaction != null) {
1807         compact(compaction, s, NoLimitThroughputController.INSTANCE, null);
1808       }
1809     }
1810   }
1811
1812   /**
1813    * This is a helper function that compacts the given store.
1814    * It is used by utilities and testing.
1815    *
1816    * @throws IOException e
1817    */
1818   @VisibleForTesting
1819   void compactStore(byte[] family, ThroughputController throughputController)
1820       throws IOException {
1821     Store s = getStore(family);
1822     CompactionContext compaction = s.requestCompaction();
1823     if (compaction != null) {
1824       compact(compaction, s, throughputController, null);
1825     }
1826   }
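       // Editor's sketch (hypothetical test usage, not part of the original source): compacting a
       // single, assumed "info" family with no throughput limiting, as a unit test might do.
       //
       //   region.compactStore(Bytes.toBytes("info"), NoLimitThroughputController.INSTANCE);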
1827
1828   /*
1829    * Called by compaction thread and after region is opened to compact the
1830    * HStores if necessary.
1831    *
1832    * <p>This operation could block for a long time, so don't call it from a
1833    * time-sensitive thread.
1834    *
1835    * Note that no locking is necessary at this level because compaction only
1836    * conflicts with a region split, and that cannot happen because the region
1837    * server does them sequentially and not in parallel.
1838    *
1839    * @param compaction Compaction details, obtained by requestCompaction()
1840    * @param throughputController
1841    * @return whether the compaction completed
1842    */
1843   public boolean compact(CompactionContext compaction, Store store,
1844       ThroughputController throughputController) throws IOException {
1845     return compact(compaction, store, throughputController, null);
1846   }
1847
1848   public boolean compact(CompactionContext compaction, Store store,
1849       ThroughputController throughputController, User user) throws IOException {
1850     assert compaction != null && compaction.hasSelection();
1851     assert !compaction.getRequest().getFiles().isEmpty();
1852     if (this.closing.get() || this.closed.get()) {
1853       LOG.debug("Skipping compaction on " + this + " because closing/closed");
1854       store.cancelRequestedCompaction(compaction);
1855       return false;
1856     }
1857     MonitoredTask status = null;
1858     boolean requestNeedsCancellation = true;
1859     /*
1860      * We are trying to remove / relax the region read lock for compaction.
1861      * Let's see what the potential race conditions are among the operations (user scan,
1862      * region split, region close and region bulk load).
1863      *
1864      *  user scan ---> region read lock
1865      *  region split --> region close first --> region write lock
1866      *  region close --> region write lock
1867      *  region bulk load --> region write lock
1868      *
1869      * read lock is compatible with read lock. ---> no problem with user scan/read
1870      * region bulk load does not cause a problem for compaction (no consistency problem, store lock
1871      *  will help the store file accounting).
1872      * They can run almost concurrently at the region level.
1873      *
1874      * The only remaining race condition is between the region close and compaction.
1875      * So we will evaluate, below, how region close interferes with compaction if compaction does
1876      * not acquire region read lock.
1877      *
1878      * Here are the steps for compaction:
1879      * 1. obtain list of StoreFile's
1880      * 2. create StoreFileScanner's based on list from #1
1881      * 3. perform compaction and save resulting files under tmp dir
1882      * 4. swap in compacted files
1883      *
1884      * #1 is guarded by store lock. This patch does not change this --> no worse or better
1885      * For #2, we obtain smallest read point (for region) across all the Scanners (for both default
1886      * compactor and stripe compactor).
1887      * The read points are for user scans. Region keeps the read points for all currently open
1888      * user scanners.
1889      * Compaction needs to know the smallest read point so that during re-write of the hfiles,
1890      * it can remove the mvcc points for the cells if their mvccs are older than the smallest
1891      * since they are not needed anymore.
1892      * This will not conflict with compaction.
1893      * For #3, it can be performed in parallel to other operations.
1894      * For #4 bulk load and compaction don't conflict with each other on the region level
1895      *   (for multi-family atomicity).
1896      * Region close and compaction are guarded pretty well by the 'writestate'.
1897      * In HRegion#doClose(), we have :
1898      * synchronized (writestate) {
1899      *   // Disable compacting and flushing by background threads for this
1900      *   // region.
1901      *   canFlush = !writestate.readOnly;
1902      *   writestate.writesEnabled = false;
1903      *   LOG.debug("Closing " + this + ": disabling compactions & flushes");
1904      *   waitForFlushesAndCompactions();
1905      * }
1906      * waitForFlushesAndCompactions() would wait for writestate.compacting to come down to 0.
1907      * and in HRegion.compact()
1908      *  try {
1909      *    synchronized (writestate) {
1910      *    if (writestate.writesEnabled) {
1911      *      wasStateSet = true;
1912      *      ++writestate.compacting;
1913      *    } else {
1914      *      String msg = "NOT compacting region " + this + ". Writes disabled.";
1915      *      LOG.info(msg);
1916      *      status.abort(msg);
1917      *      return false;
1918      *    }
1919      *  }
1920      * Also in compactor.performCompaction():
1921      * check periodically to see if a system stop is requested
1922      * if (closeCheckInterval > 0) {
1923      *   bytesWritten += len;
1924      *   if (bytesWritten > closeCheckInterval) {
1925      *     bytesWritten = 0;
1926      *     if (!store.areWritesEnabled()) {
1927      *       progress.cancel();
1928      *       return false;
1929      *     }
1930      *   }
1931      * }
1932      */
1933     try {
1934       byte[] cf = Bytes.toBytes(store.getColumnFamilyName());
1935       if (stores.get(cf) != store) {
1936         LOG.warn("Store " + store.getColumnFamilyName() + " on region " + this
1937             + " has been re-instantiated, cancel this compaction request. "
1938             + " It may be caused by the roll back of split transaction");
1939         return false;
1940       }
1941
1942       status = TaskMonitor.get().createStatus("Compacting " + store + " in " + this);
1943       if (this.closed.get()) {
1944         String msg = "Skipping compaction on " + this + " because closed";
1945         LOG.debug(msg);
1946         status.abort(msg);
1947         return false;
1948       }
1949       boolean wasStateSet = false;
1950       try {
1951         synchronized (writestate) {
1952           if (writestate.writesEnabled) {
1953             wasStateSet = true;
1954             writestate.compacting.incrementAndGet();
1955           } else {
1956             String msg = "NOT compacting region " + this + ". Writes disabled.";
1957             LOG.info(msg);
1958             status.abort(msg);
1959             return false;
1960           }
1961         }
1962         LOG.info("Starting compaction on " + store + " in region " + this
1963             + (compaction.getRequest().isOffPeak()?" as an off-peak compaction":""));
1964         doRegionCompactionPrep();
1965         try {
1966           status.setStatus("Compacting store " + store);
1967           // We no longer need to cancel the request on the way out of this
1968           // method because Store#compact will clean up unconditionally
1969           requestNeedsCancellation = false;
1970           store.compact(compaction, throughputController, user);
1971         } catch (InterruptedIOException iioe) {
1972           String msg = "compaction interrupted";
1973           LOG.info(msg, iioe);
1974           status.abort(msg);
1975           return false;
1976         }
1977       } finally {
1978         if (wasStateSet) {
1979           synchronized (writestate) {
1980             writestate.compacting.decrementAndGet();
1981             if (writestate.compacting.get() <= 0) {
1982               writestate.notifyAll();
1983             }
1984           }
1985         }
1986       }
1987       status.markComplete("Compaction complete");
1988       return true;
1989     } finally {
1990       if (requestNeedsCancellation) store.cancelRequestedCompaction(compaction);
1991       if (status != null) status.cleanup();
1992     }
1993   }
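       // Editor's sketch (hypothetical caller, not part of the original source): how the region
       // level compact(...) above is typically driven, mirroring compact(boolean) further up.
       //
       //   CompactionContext context = store.requestCompaction();
       //   if (context != null) {
       //     region.compact(context, store, NoLimitThroughputController.INSTANCE, null);
       //   }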
1994
1995   @Override
1996   public FlushResult flush(boolean force) throws IOException {
1997     return flushcache(force, false);
1998   }
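       // Editor's sketch (hypothetical helper, not part of the original source): forcing a flush
       // of every store and reacting to the result, e.g. from a maintenance utility. Assumes the
       // FlushResult#getResult() accessor exposed alongside the Result enum used below.
       private static void exampleForcedFlush(HRegion region) throws IOException {
         FlushResult result = region.flush(true);
         if (result.getResult() == FlushResult.Result.FLUSHED_COMPACTION_NEEDED) {
           // the flush produced enough store files that a compaction is worthwhile
           region.compactStores();
         }
       }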
1999
2000   /**
2001    * Flush the cache.
2002    *
2003    * When this method is called the cache will be flushed unless:
2004    * <ol>
2005    *   <li>the cache is empty</li>
2006    *   <li>the region is closed</li>
2007    *   <li>a flush is already in progress</li>
2008    *   <li>writes are disabled</li>
2009    * </ol>
2010    *
2011    * <p>This method may block for some time, so it should not be called from a
2012    * time-sensitive thread.
2013    * @param forceFlushAllStores whether we want to flush all stores
2014    * @param writeFlushRequestWalMarker whether to write the flush request marker to WAL
2015    * @return whether the flush succeeded and whether the region needs compacting
2016    *
2017    * @throws IOException general io exceptions
2018    * @throws DroppedSnapshotException Thrown when replay of wal is required
2019    * because a Snapshot was not properly persisted. The region is put in closing mode, and the
2020    * caller MUST abort after this.
2021    */
2022   public FlushResult flushcache(boolean forceFlushAllStores, boolean writeFlushRequestWalMarker)
2023       throws IOException {
2024     // fail-fast instead of waiting on the lock
2025     if (this.closing.get()) {
2026       String msg = "Skipping flush on " + this + " because closing";
2027       LOG.debug(msg);
2028       return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
2029     }
2030     MonitoredTask status = TaskMonitor.get().createStatus("Flushing " + this);
2031     status.setStatus("Acquiring readlock on region");
2032     // block waiting for the lock for flushing cache
2033     lock.readLock().lock();
2034     try {
2035       if (this.closed.get()) {
2036         String msg = "Skipping flush on " + this + " because closed";
2037         LOG.debug(msg);
2038         status.abort(msg);
2039         return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
2040       }
2041       if (coprocessorHost != null) {
2042         status.setStatus("Running coprocessor pre-flush hooks");
2043         coprocessorHost.preFlush();
2044       }
2045       // TODO: this should be managed within memstore with the snapshot, updated only after flush
2046       // successful
2047       if (numMutationsWithoutWAL.get() > 0) {
2048         numMutationsWithoutWAL.set(0);
2049         dataInMemoryWithoutWAL.set(0);
2050       }
2051       synchronized (writestate) {
2052         if (!writestate.flushing && writestate.writesEnabled) {
2053           this.writestate.flushing = true;
2054         } else {
2055           if (LOG.isDebugEnabled()) {
2056             LOG.debug("NOT flushing memstore for region " + this
2057                 + ", flushing=" + writestate.flushing + ", writesEnabled="
2058                 + writestate.writesEnabled);
2059           }
2060           String msg = "Not flushing since "
2061               + (writestate.flushing ? "already flushing"
2062               : "writes not enabled");
2063           status.abort(msg);
2064           return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
2065         }
2066       }
2067
2068       try {
2069         Collection<Store> specificStoresToFlush =
2070             forceFlushAllStores ? stores.values() : flushPolicy.selectStoresToFlush();
2071         FlushResult fs = internalFlushcache(specificStoresToFlush,
2072           status, writeFlushRequestWalMarker);
2073
2074         if (coprocessorHost != null) {
2075           status.setStatus("Running post-flush coprocessor hooks");
2076           coprocessorHost.postFlush();
2077         }
2078
2079         status.markComplete("Flush successful");
2080         return fs;
2081       } finally {
2082         synchronized (writestate) {
2083           writestate.flushing = false;
2084           this.writestate.flushRequested = false;
2085           writestate.notifyAll();
2086         }
2087       }
2088     } finally {
2089       lock.readLock().unlock();
2090       status.cleanup();
2091     }
2092   }
2093
2094   /**
2095    * Should the store be flushed because it is old enough.
2096    * <p>
2097    * Every FlushPolicy should call this to determine whether a store is old enough to flush
2098    * (unless the policy always flushes all stores). Otherwise shouldFlush() will always
2099    * return true, which will generate a lot of flush requests.
2100    */
2101   boolean shouldFlushStore(Store store) {
2102     long earliest = this.wal.getEarliestMemstoreSeqNum(getRegionInfo().getEncodedNameAsBytes(),
2103       store.getFamily().getName()) - 1;
2104     if (earliest > 0 && earliest + flushPerChanges < mvcc.getReadPoint()) {
2105       if (LOG.isDebugEnabled()) {
2106         LOG.debug("Flush column family " + store.getColumnFamilyName() + " of " +
2107           getRegionInfo().getEncodedName() + " because unflushed sequenceid=" + earliest +
2108           " is > " + this.flushPerChanges + " from current=" + mvcc.getReadPoint());
2109       }
2110       return true;
2111     }
2112     if (this.flushCheckInterval <= 0) {
2113       return false;
2114     }
2115     long now = EnvironmentEdgeManager.currentTime();
2116     if (store.timeOfOldestEdit() < now - this.flushCheckInterval) {
2117       if (LOG.isDebugEnabled()) {
2118         LOG.debug("Flush column family: " + store.getColumnFamilyName() + " of " +
2119           getRegionInfo().getEncodedName() + " because time of oldest edit=" +
2120             store.timeOfOldestEdit() + " is > " + this.flushCheckInterval + " from now =" + now);
2121       }
2122       return true;
2123     }
2124     return false;
2125   }
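       // Editor's sketch (hypothetical policy, not part of the original source): how a custom
       // FlushPolicy might use shouldFlushStore(...) to pick only the stores that are old enough,
       // assuming the base class exposes the owning region and the selectStoresToFlush() hook that
       // flushcache() above calls.
       //
       //   public class AgeBasedFlushPolicy extends FlushPolicy {
       //     @Override
       //     public Collection<Store> selectStoresToFlush() {
       //       List<Store> picked = new ArrayList<Store>();
       //       for (Store store : region.getStores()) {
       //         if (region.shouldFlushStore(store)) {
       //           picked.add(store);
       //         }
       //       }
       //       return picked;
       //     }
       //   }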
2126
2127   /**
2128    * Should the memstore be flushed now
2129    */
2130   boolean shouldFlush(final StringBuffer whyFlush) {
2131     whyFlush.setLength(0);
2132     // This is a rough measure.
2133     if (this.maxFlushedSeqId > 0
2134           && (this.maxFlushedSeqId + this.flushPerChanges < this.mvcc.getReadPoint())) {
2135       whyFlush.append("more than max edits, " + this.flushPerChanges + ", since last flush");
2136       return true;
2137     }
2138     long modifiedFlushCheckInterval = flushCheckInterval;
2139     if (getRegionInfo().isSystemTable() &&
2140         getRegionInfo().getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
2141       modifiedFlushCheckInterval = SYSTEM_CACHE_FLUSH_INTERVAL;
2142     }
2143     if (modifiedFlushCheckInterval <= 0) { //disabled
2144       return false;
2145     }
2146     long now = EnvironmentEdgeManager.currentTime();
2147     // if we flushed in the recent past, we don't need to do it again now
2148     if ((now - getEarliestFlushTimeForAllStores() < modifiedFlushCheckInterval)) {
2149       return false;
2150     }
2151     //since we didn't flush in the recent past, flush now if certain conditions
2152     //are met. Return true on first such memstore hit.
2153     for (Store s : getStores()) {
2154       if (s.timeOfOldestEdit() < now - modifiedFlushCheckInterval) {
2155         // we have an old enough edit in the memstore, flush
2156         whyFlush.append(s.toString() + " has an old edit so flush to free WALs");
2157         return true;
2158       }
2159     }
2160     return false;
2161   }
2162
2163   /**
2164    * Flushing all stores.
2165    *
2166    * @see #internalFlushcache(Collection, MonitoredTask, boolean)
2167    */
2168   private FlushResult internalFlushcache(MonitoredTask status)
2169       throws IOException {
2170     return internalFlushcache(stores.values(), status, false);
2171   }
2172
2173   /**
2174    * Flushing given stores.
2175    *
2176    * @see #internalFlushcache(WAL, long, Collection, MonitoredTask, boolean)
2177    */
2178   private FlushResult internalFlushcache(final Collection<Store> storesToFlush,
2179       MonitoredTask status, boolean writeFlushWalMarker) throws IOException {
2180     return internalFlushcache(this.wal, HConstants.NO_SEQNUM, storesToFlush,
2181         status, writeFlushWalMarker);
2182   }
2183
2184   /**
2185    * Flush the memstore. Flushing the memstore is a little tricky. We have a lot
2186    * of updates in the memstore, all of which have also been written to the wal.
2187    * We need to write those updates in the memstore out to disk, while being
2188    * able to process reads/writes as much as possible during the flush
2189    * operation.
2190    * <p>
2191    * This method may block for some time. Every time you call it, we up the
2192    * region's sequence id even if we don't flush; i.e. the returned sequence id
2193    * will be at least one larger than that of the last edit applied to this region. The
2194    * returned id does not refer to an actual edit. The returned id can be used
2195    * for, say, installing a bulk loaded file just ahead of the last hfile that was
2196    * the result of this flush, etc.
2197    *
2198    * @param wal Null if we're NOT to go via wal.
2199    * @param myseqid The seqid to use if <code>wal</code> is null writing out flush file.
2200    * @param storesToFlush The list of stores to flush.
2201    * @return object describing the flush's state
2202    * @throws IOException general io exceptions
2203    * @throws DroppedSnapshotException Thrown when replay of WAL is required.
2204    */
2205   protected FlushResult internalFlushcache(final WAL wal, final long myseqid,
2206       final Collection<Store> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker)
2207           throws IOException {
2208     PrepareFlushResult result
2209       = internalPrepareFlushCache(wal, myseqid, storesToFlush, status, writeFlushWalMarker);
2210     if (result.result == null) {
2211       return internalFlushCacheAndCommit(wal, status, result, storesToFlush);
2212     } else {
2213       return result.result; // early exit due to failure from prepare stage
2214     }
2215   }
2216
2217   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="DLS_DEAD_LOCAL_STORE",
2218       justification="FindBugs seems confused about trxId")
2219   protected PrepareFlushResult internalPrepareFlushCache(final WAL wal, final long myseqid,
2220       final Collection<Store> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker)
2221   throws IOException {
2222     if (this.rsServices != null && this.rsServices.isAborted()) {
2223       // Don't flush when server aborting, it's unsafe
2224       throw new IOException("Aborting flush because server is aborted...");
2225     }
2226     final long startTime = EnvironmentEdgeManager.currentTime();
2227     // If nothing to flush, return, but return with a valid unused sequenceId.
2228     // It's needed by bulk upload IIRC. It flushes until there are no edits in memory so it can
2229     // insert a bulk loaded file between memory and the existing hfiles. It wants a good sequenceId,
2230     // one that belongs to no other edit, that it can use to associate with the bulk load. Hence
2231     // this little dance below to go get one.
2232     if (this.memstoreSize.get() <= 0) {
2233       // Take an update lock so no edits can come into memory just yet.
2234       this.updatesLock.writeLock().lock();
2235       WriteEntry writeEntry = null;
2236       try {
2237         if (this.memstoreSize.get() <= 0) {
2238           // Presume that if there are still no edits in the memstore, then there are no edits for
2239           // this region out in the WAL subsystem so no need to do any trickery clearing out
2240           // edits in the WAL sub-system. Up the sequence number so the resulting flush id is for
2241           // sure just beyond the last appended region edit and not associated with any edit
2242           // (useful as marker when bulk loading, etc.).
2243           FlushResult flushResult = null;
2244           if (wal != null) {
2245             writeEntry = mvcc.begin();
2246             long flushOpSeqId = writeEntry.getWriteNumber();
2247             flushResult = new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY,
2248               flushOpSeqId, "Nothing to flush",
2249             writeFlushRequestMarkerToWAL(wal, writeFlushWalMarker));
2250             mvcc.completeAndWait(writeEntry);
2251             // Set to null so we don't complete it again down in finally block.
2252             writeEntry = null;
2253             return new PrepareFlushResult(flushResult, myseqid);
2254           } else {
2255             return new PrepareFlushResult(new FlushResultImpl(
2256               FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, "Nothing to flush", false), myseqid);
2257           }
2258         }
2259       } finally {
2260         if (writeEntry != null) {
2261           // If writeEntry is non-null, this operation failed; the mvcc transaction failed...
2262           // but complete it anyway so it doesn't block the mvcc queue.
2263           mvcc.complete(writeEntry);
2264         }
2265         this.updatesLock.writeLock().unlock();
2266       }
2267     }
2268     logFatLineOnFlush(storesToFlush, myseqid);
2269     // Stop updates while we snapshot the memstores of all of this region's stores. We only have
2270     // to do this for a moment.  It is quick. We also set the memstore size to zero here before we
2271     // allow updates again so its value will represent the size of the updates received
2272     // during the flush.
2273
2274     // We have to take an update lock during the snapshot, or else a write could end up in both the
2275     // snapshot and the memstore (which would make it difficult to keep rows atomic).
2276     status.setStatus("Obtaining lock to block concurrent updates");
2277     // block waiting for the lock for internal flush
2278     this.updatesLock.writeLock().lock();
2279     status.setStatus("Preparing flush snapshotting stores in " + getRegionInfo().getEncodedName());
2280     long totalFlushableSizeOfFlushableStores = 0;
2281
2282     Set<byte[]> flushedFamilyNames = new HashSet<byte[]>();
2283     for (Store store: storesToFlush) {
2284       flushedFamilyNames.add(store.getFamily().getName());
2285     }
2286
2287     TreeMap<byte[], StoreFlushContext> storeFlushCtxs
2288       = new TreeMap<byte[], StoreFlushContext>(Bytes.BYTES_COMPARATOR);
2289     TreeMap<byte[], List<Path>> committedFiles = new TreeMap<byte[], List<Path>>(
2290         Bytes.BYTES_COMPARATOR);
2291     TreeMap<byte[], Long> storeFlushableSize
2292         = new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
2293     // The sequence id of this flush operation which is used to log FlushMarker and pass to
2294     // createFlushContext to use as the store file's sequence id. It can be in advance of edits
2295     // still in the memstore, edits that are in other column families yet to be flushed.
2296     long flushOpSeqId = HConstants.NO_SEQNUM;
2297     // The max flushed sequence id after this flush operation completes. All edits in memstore
2298     // will be in advance of this sequence id.
2299     long flushedSeqId = HConstants.NO_SEQNUM;
2300     byte[] encodedRegionName = getRegionInfo().getEncodedNameAsBytes();
2301     try {
2302       if (wal != null) {
2303         Long earliestUnflushedSequenceIdForTheRegion =
2304             wal.startCacheFlush(encodedRegionName, flushedFamilyNames);
2305         if (earliestUnflushedSequenceIdForTheRegion == null) {
2306           // This should never happen. This is how startCacheFlush signals flush cannot proceed.
2307           String msg = this.getRegionInfo().getEncodedName() + " flush aborted; WAL closing.";
2308           status.setStatus(msg);
2309           return new PrepareFlushResult(
2310               new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false),
2311               myseqid);
2312         }
2313         flushOpSeqId = getNextSequenceId(wal);
2314         // Back up by 1: subtract 1 from the oldest sequence id in the memstore to get the last 'flushed' edit
2315         flushedSeqId =
2316             earliestUnflushedSequenceIdForTheRegion.longValue() == HConstants.NO_SEQNUM?
2317                 flushOpSeqId: earliestUnflushedSequenceIdForTheRegion.longValue() - 1;
2318       } else {
2319         // use the provided sequence Id as WAL is not being used for this flush.
2320         flushedSeqId = flushOpSeqId = myseqid;
2321       }
2322
2323       for (Store s : storesToFlush) {
2324         totalFlushableSizeOfFlushableStores += s.getFlushableSize();
2325         storeFlushCtxs.put(s.getFamily().getName(), s.createFlushContext(flushOpSeqId));
2326         committedFiles.put(s.getFamily().getName(), null); // for writing stores to WAL
2327         storeFlushableSize.put(s.getFamily().getName(), s.getFlushableSize());
2328       }
2329
2330       // write the snapshot start to WAL
2331       if (wal != null && !writestate.readOnly) {
2332         FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.START_FLUSH,
2333             getRegionInfo(), flushOpSeqId, committedFiles);
2334         // No sync. The sync happens below, where we no longer hold the updates lock and we do FlushAction.COMMIT_FLUSH
2335         WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, false,
2336             mvcc);
2337       }
2338
2339       // Prepare flush (take a snapshot)
2340       for (StoreFlushContext flush : storeFlushCtxs.values()) {
2341         flush.prepare();
2342       }
2343     } catch (IOException ex) {
2344       doAbortFlushToWAL(wal, flushOpSeqId, committedFiles);
2345       throw ex;
2346     } finally {
2347       this.updatesLock.writeLock().unlock();
2348     }
2349     String s = "Finished memstore snapshotting " + this + ", syncing WAL and waiting on mvcc, " +
2350         "flushsize=" + totalFlushableSizeOfFlushableStores;
2351     status.setStatus(s);
2352     doSyncOfUnflushedWALChanges(wal, getRegionInfo());
2353     return new PrepareFlushResult(storeFlushCtxs, committedFiles, storeFlushableSize, startTime,
2354         flushOpSeqId, flushedSeqId, totalFlushableSizeOfFlushableStores);
2355   }
2356
2357   /**
2358    * Utility method broken out of internalPrepareFlushCache so that method is smaller.
2359    */
2360   private void logFatLineOnFlush(final Collection<Store> storesToFlush, final long sequenceId) {
2361     if (!LOG.isInfoEnabled()) {
2362       return;
2363     }
2364     // Log a fat line detailing what is being flushed.
2365     StringBuilder perCfExtras = null;
2366     if (!isAllFamilies(storesToFlush)) {
2367       perCfExtras = new StringBuilder();
2368       for (Store store: storesToFlush) {
2369         perCfExtras.append("; ").append(store.getColumnFamilyName());
2370         perCfExtras.append("=").append(StringUtils.byteDesc(store.getMemStoreSize()));
2371       }
2372     }
2373     LOG.info("Flushing " + + storesToFlush.size() + "/" + stores.size() +
2374         " column families, memstore=" + StringUtils.byteDesc(this.memstoreSize.get()) +
2375         ((perCfExtras != null && perCfExtras.length() > 0)? perCfExtras.toString(): "") +
2376         ((wal != null) ? "" : "; WAL is null, using passed sequenceid=" + sequenceId));
2377   }
2378
2379   private void doAbortFlushToWAL(final WAL wal, final long flushOpSeqId,
2380       final Map<byte[], List<Path>> committedFiles) {
2381     if (wal == null) return;
2382     try {
2383       FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH,
2384           getRegionInfo(), flushOpSeqId, committedFiles);
2385       WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, false,
2386           mvcc);
2387     } catch (Throwable t) {
2388       LOG.warn("Received unexpected exception trying to write ABORT_FLUSH marker to WAL:" +
2389           StringUtils.stringifyException(t));
2390       // ignore this since we will be aborting the RS with DSE.
2391     }
2392     // we have called wal.startCacheFlush(), now we have to abort it
2393     wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2394   }
2395
2396   /**
2397    * Sync unflushed WAL changes. See HBASE-8208 for details
2398    */
2399   private static void doSyncOfUnflushedWALChanges(final WAL wal, final HRegionInfo hri)
2400   throws IOException {
2401     if (wal == null) {
2402       return;
2403     }
2404     try {
2405       wal.sync(); // ensure that flush marker is sync'ed
2406     } catch (IOException ioe) {
2407       wal.abortCacheFlush(hri.getEncodedNameAsBytes());
2408       throw ioe;
2409     }
2410   }
2411
2412   /**
2413    * @return True if passed Set is all families in the region.
2414    */
2415   private boolean isAllFamilies(final Collection<Store> families) {
2416     return families == null || this.stores.size() == families.size();
2417   }
2418
2419   /**
2420    * Writes a marker to WAL indicating a flush is requested but cannot be completed due to
2421    * various reasons. Ignores exceptions from WAL. Returns whether the write succeeded.
2422    * @param wal the WAL to write the marker to
2423    * @return whether WAL write was successful
2424    */
2425   private boolean writeFlushRequestMarkerToWAL(WAL wal, boolean writeFlushWalMarker) {
2426     if (writeFlushWalMarker && wal != null && !writestate.readOnly) {
2427       FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.CANNOT_FLUSH,
2428         getRegionInfo(), -1, new TreeMap<byte[], List<Path>>(Bytes.BYTES_COMPARATOR));
2429       try {
2430         WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, true,
2431             mvcc);
2432         return true;
2433       } catch (IOException e) {
2434         LOG.warn(getRegionInfo().getEncodedName() + " : "
2435             + "Received exception while trying to write the flush request to wal", e);
2436       }
2437     }
2438     return false;
2439   }
2440
2441   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY",
2442       justification="Intentional; notify is about completed flush")
2443   protected FlushResult internalFlushCacheAndCommit(
2444         final WAL wal, MonitoredTask status, final PrepareFlushResult prepareResult,
2445         final Collection<Store> storesToFlush)
2446     throws IOException {
2447
2448     // prepare flush context is carried via PrepareFlushResult
2449     TreeMap<byte[], StoreFlushContext> storeFlushCtxs = prepareResult.storeFlushCtxs;
2450     TreeMap<byte[], List<Path>> committedFiles = prepareResult.committedFiles;
2451     long startTime = prepareResult.startTime;
2452     long flushOpSeqId = prepareResult.flushOpSeqId;
2453     long flushedSeqId = prepareResult.flushedSeqId;
2454     long totalFlushableSizeOfFlushableStores = prepareResult.totalFlushableSize;
2455
2456     String s = "Flushing stores of " + this;
2457     status.setStatus(s);
2458     if (LOG.isTraceEnabled()) LOG.trace(s);
2459
2460     // Any failure from here on out will be catastrophic, requiring a server
2461     // restart so WAL content can be replayed and put back into the memstore.
2462     // Otherwise, the snapshot content, while backed up in the WAL, will not
2463     // be part of the currently running server's state.
2464     boolean compactionRequested = false;
2465     long flushedOutputFileSize = 0;
2466     try {
2467       // A.  Flush memstore to all the HStores.
2468       // Keep running vector of all store files that includes both old and the
2469       // just-made new flush store file. The new flushed file is still in the
2470       // tmp directory.
2471
2472       for (StoreFlushContext flush : storeFlushCtxs.values()) {
2473         flush.flushCache(status);
2474       }
2475
2476       // Switch snapshot (in memstore) -> new hfile (thus causing
2477       // all the store scanners to reset/reseek).
2478       Iterator<Store> it = storesToFlush.iterator();
2479       // storesToFlush and storeFlushCtxs have the same order
2480       for (StoreFlushContext flush : storeFlushCtxs.values()) {
2481         boolean needsCompaction = flush.commit(status);
2482         if (needsCompaction) {
2483           compactionRequested = true;
2484         }
2485         byte[] storeName = it.next().getFamily().getName();
2486         List<Path> storeCommittedFiles = flush.getCommittedFiles();
2487         committedFiles.put(storeName, storeCommittedFiles);
2488         // The flush committed no files, indicating the flush was empty or was canceled
2489         if (storeCommittedFiles == null || storeCommittedFiles.isEmpty()) {
2490           totalFlushableSizeOfFlushableStores -= prepareResult.storeFlushableSize.get(storeName);
2491         }
2492         flushedOutputFileSize += flush.getOutputFileSize();
2493       }
2494       storeFlushCtxs.clear();
2495
2496       // Set down the memstore size by the amount of this flush.
2497       this.addAndGetGlobalMemstoreSize(-totalFlushableSizeOfFlushableStores);
2498
2499       if (wal != null) {
2500         // write flush marker to WAL. If fail, we should throw DroppedSnapshotException
2501         FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.COMMIT_FLUSH,
2502           getRegionInfo(), flushOpSeqId, committedFiles);
2503         WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, true,
2504             mvcc);
2505       }
2506     } catch (Throwable t) {
2507       // An exception here means that the snapshot was not persisted.
2508       // The wal needs to be replayed so its content is restored to memstore.
2509       // Currently, only a server restart will do this.
2510       // We used to only catch IOEs but it's possible that we'd get other
2511       // exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch
2512       // all and sundry.
2513       if (wal != null) {
2514         try {
2515           FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH,
2516             getRegionInfo(), flushOpSeqId, committedFiles);
2517           WALUtil.writeFlushMarker(wal, this.replicationScope, getRegionInfo(), desc, false, mvcc);
2518         } catch (Throwable ex) {
2519           LOG.warn(getRegionInfo().getEncodedName() + " : "
2520               + "failed writing ABORT_FLUSH marker to WAL", ex);
2521           // ignore this since we will be aborting the RS with DSE.
2522         }
2523         wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2524       }
2525       DroppedSnapshotException dse = new DroppedSnapshotException("region: " +
2526           Bytes.toStringBinary(getRegionInfo().getRegionName()));
2527       dse.initCause(t);
2528       status.abort("Flush failed: " + StringUtils.stringifyException(t));
2529
2530       // Callers for flushcache() should catch DroppedSnapshotException and abort the region server.
2531       // However, since we may have the region read lock, we cannot call close(true) here since
2532       // we cannot promote to a write lock. Instead we are setting closing so that all other region
2533       // operations except for close will be rejected.
2534       this.closing.set(true);
2535
2536       if (rsServices != null) {
2537         // This is a safeguard against the case where the caller fails to explicitly handle aborting
2538         rsServices.abort("Replay of WAL required. Forcing server shutdown", dse);
2539       }
2540
2541       throw dse;
2542     }
2543
2544     // If we get to here, the HStores have been written.
2545     for (Store storeToFlush : storesToFlush) {
2546       storeToFlush.finalizeFlush();
2547     }
2548     if (wal != null) {
2549       wal.completeCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2550     }
2551
2552     // Record latest flush time
2553     for (Store store: storesToFlush) {
2554       this.lastStoreFlushTimeMap.put(store, startTime);
2555     }
2556
2557     this.maxFlushedSeqId = flushedSeqId;
2558     this.lastFlushOpSeqId = flushOpSeqId;
2559
2560     // C. Finally notify anyone waiting on memstore to clear:
2561     // e.g. checkResources().
2562     synchronized (this) {
2563       notifyAll(); // FindBugs NN_NAKED_NOTIFY
2564     }
2565
2566     long time = EnvironmentEdgeManager.currentTime() - startTime;
2567     long memstoresize = this.memstoreSize.get();
2568     String msg = "Finished memstore flush of ~"
2569         + StringUtils.byteDesc(totalFlushableSizeOfFlushableStores) + "/"
2570         + totalFlushableSizeOfFlushableStores + ", currentsize="
2571         + StringUtils.byteDesc(memstoresize) + "/" + memstoresize
2572         + " for region " + this + " in " + time + "ms, sequenceid="
2573         + flushOpSeqId +  ", compaction requested=" + compactionRequested
2574         + ((wal == null) ? "; wal=null" : "");
2575     LOG.info(msg);
2576     status.setStatus(msg);
2577
2578     if (rsServices != null && rsServices.getMetrics() != null) {
2579       rsServices.getMetrics().updateFlush(time,
2580         totalFlushableSizeOfFlushableStores, flushedOutputFileSize);
2581     }
2582
2583     return new FlushResultImpl(compactionRequested ?
2584         FlushResult.Result.FLUSHED_COMPACTION_NEEDED :
2585           FlushResult.Result.FLUSHED_NO_COMPACTION_NEEDED, flushOpSeqId);
2586   }
2587
2588   /**
2589    * Method to safely get the next sequence number.
2590    * @return Next sequence number unassociated with any actual edit.
2591    * @throws IOException
2592    */
2593   @VisibleForTesting
2594   protected long getNextSequenceId(final WAL wal) throws IOException {
2595     WriteEntry we = mvcc.begin();
2596     mvcc.completeAndWait(we);
2597     return we.getWriteNumber();
2598   }
2599
2600   //////////////////////////////////////////////////////////////////////////////
2601   // get() methods for client use.
2602   //////////////////////////////////////////////////////////////////////////////
2603
2604   @Override
2605   public RegionScanner getScanner(Scan scan) throws IOException {
2606    return getScanner(scan, null);
2607   }
2608
2609   @Override
2610   public RegionScanner getScanner(Scan scan, List<KeyValueScanner> additionalScanners)
2611       throws IOException {
2612     startRegionOperation(Operation.SCAN);
2613     try {
2614       // Verify families are all valid
2615       if (!scan.hasFamilies()) {
2616         // Adding all families to scanner
2617         for (byte[] family : this.htableDescriptor.getFamiliesKeys()) {
2618           scan.addFamily(family);
2619         }
2620       } else {
2621         for (byte[] family : scan.getFamilyMap().keySet()) {
2622           checkFamily(family);
2623         }
2624       }
2625       return instantiateRegionScanner(scan, additionalScanners);
2626     } finally {
2627       closeRegionOperation(Operation.SCAN);
2628     }
2629   }
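
  /**
   * Illustrative sketch only; not part of the original source. Shows how a caller might drive the
   * getScanner(Scan) API above to walk every row of one column family. The family name "cf" is a
   * made-up example value.
   */
  private void exampleScanOneFamily() throws IOException {
    Scan scan = new Scan();
    scan.addFamily(Bytes.toBytes("cf"));   // hypothetical family; getScanner validates it
    RegionScanner scanner = getScanner(scan);
    try {
      List<Cell> cells = new ArrayList<Cell>();
      boolean moreRows;
      do {
        moreRows = scanner.next(cells);    // fills 'cells' with the next row's cells
        // ... process 'cells' for this row ...
        cells.clear();
      } while (moreRows);
    } finally {
      scanner.close();
    }
  }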
2630
2631   protected RegionScanner instantiateRegionScanner(Scan scan,
2632       List<KeyValueScanner> additionalScanners) throws IOException {
2633     if (scan.isReversed()) {
2634       if (scan.getFilter() != null) {
2635         scan.getFilter().setReversed(true);
2636       }
2637       return new ReversedRegionScannerImpl(scan, additionalScanners, this);
2638     }
2639     return new RegionScannerImpl(scan, additionalScanners, this);
2640   }
2641
2642   @Override
2643   public void prepareDelete(Delete delete) throws IOException {
2644     // Check to see if this is a deleteRow insert
2645     if(delete.getFamilyCellMap().isEmpty()){
2646       for(byte [] family : this.htableDescriptor.getFamiliesKeys()){
2647         // Don't eat the timestamp
2648         delete.addFamily(family, delete.getTimeStamp());
2649       }
2650     } else {
2651       for(byte [] family : delete.getFamilyCellMap().keySet()) {
2652         if(family == null) {
2653           throw new NoSuchColumnFamilyException("Empty family is invalid");
2654         }
2655         checkFamily(family);
2656       }
2657     }
2658   }
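
  /**
   * Illustrative sketch only; not part of the original source. Shows the two Delete shapes that
   * prepareDelete(Delete) above distinguishes: an "empty" Delete that stands for the whole row
   * (every table family gets added at the Delete's timestamp) versus one scoped to a single
   * family. The row and family names are made-up example values.
   */
  private static void exampleDeleteShapes() {
    byte[] row = Bytes.toBytes("row-1");
    // Whole-row delete: no families specified, so prepareDelete() fills in every table family.
    Delete wholeRow = new Delete(row);
    // Family-scoped delete: prepareDelete() only validates that the family exists.
    Delete oneFamily = new Delete(row);
    oneFamily.addFamily(Bytes.toBytes("cf"));
  }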
2659
2660   @Override
2661   public void delete(Delete delete) throws IOException {
2662     checkReadOnly();
2663     checkResources();
2664     startRegionOperation(Operation.DELETE);
2665     try {
2666       delete.getRow();
2667       // All edits for the given row (across all column families) must happen atomically.
2668       doBatchMutate(delete);
2669     } finally {
2670       closeRegionOperation(Operation.DELETE);
2671     }
2672   }
2673
2674   /**
2675    * Row key used only by the unit-test delete method below.
2676    */
2677   private static final byte [] FOR_UNIT_TESTS_ONLY = Bytes.toBytes("ForUnitTestsOnly");
2678
2679   /**
2680    * This is used only by unit tests. Not required to be a public API.
2681    * @param familyMap map of family to edits for the given family.
2682    * @throws IOException
2683    */
2684   void delete(NavigableMap<byte[], List<Cell>> familyMap,
2685       Durability durability) throws IOException {
2686     Delete delete = new Delete(FOR_UNIT_TESTS_ONLY);
2687     delete.setFamilyCellMap(familyMap);
2688     delete.setDurability(durability);
2689     doBatchMutate(delete);
2690   }
2691
2692   @Override
2693   public void prepareDeleteTimestamps(Mutation mutation, Map<byte[], List<Cell>> familyMap,
2694       byte[] byteNow) throws IOException {
2695     for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
2696
2697       byte[] family = e.getKey();
2698       List<Cell> cells = e.getValue();
2699       assert cells instanceof RandomAccess;
2700
2701       Map<byte[], Integer> kvCount = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
2702       int listSize = cells.size();
2703       for (int i=0; i < listSize; i++) {
2704         Cell cell = cells.get(i);
2705         //  Check if time is LATEST, change to time of most recent addition if so
2706         //  This is expensive.
2707         if (cell.getTimestamp() == HConstants.LATEST_TIMESTAMP && CellUtil.isDeleteType(cell)) {
2708           byte[] qual = CellUtil.cloneQualifier(cell);
2709           if (qual == null) qual = HConstants.EMPTY_BYTE_ARRAY;
2710
2711           Integer count = kvCount.get(qual);
2712           if (count == null) {
2713             kvCount.put(qual, 1);
2714           } else {
2715             kvCount.put(qual, count + 1);
2716           }
2717           count = kvCount.get(qual);
2718
2719           Get get = new Get(CellUtil.cloneRow(cell));
2720           get.setMaxVersions(count);
2721           get.addColumn(family, qual);
2722           if (coprocessorHost != null) {
2723             if (!coprocessorHost.prePrepareTimeStampForDeleteVersion(mutation, cell,
2724                 byteNow, get)) {
2725               updateDeleteLatestVersionTimeStamp(cell, get, count, byteNow);
2726             }
2727           } else {
2728             updateDeleteLatestVersionTimeStamp(cell, get, count, byteNow);
2729           }
2730         } else {
2731           CellUtil.updateLatestStamp(cell, byteNow, 0);
2732         }
2733       }
2734     }
2735   }
2736
2737   void updateDeleteLatestVersionTimeStamp(Cell cell, Get get, int count, byte[] byteNow)
2738       throws IOException {
2739     List<Cell> result = get(get, false);
2740
2741     if (result.size() < count) {
2742       // Nothing to delete
2743       CellUtil.updateLatestStamp(cell, byteNow, 0);
2744       return;
2745     }
2746     if (result.size() > count) {
2747       throw new RuntimeException("Unexpected size: " + result.size());
2748     }
2749     Cell getCell = result.get(count - 1);
2750     CellUtil.setTimestamp(cell, getCell.getTimestamp());
2751   }
2752
2753   @Override
2754   public void put(Put put) throws IOException {
2755     checkReadOnly();
2756
2757     // Do a rough check that we have resources to accept a write.  The check is
2758     // 'rough' in that between the resource check and the call to obtain a
2759     // read lock, resources may run out.  For now, the thought is that this
2760     // will be extremely rare; we'll deal with it when it happens.
2761     checkResources();
2762     startRegionOperation(Operation.PUT);
2763     try {
2764       // All edits for the given row (across all column families) must happen atomically.
2765       doBatchMutate(put);
2766     } finally {
2767       closeRegionOperation(Operation.PUT);
2768     }
2769   }
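
  /**
   * Illustrative sketch only; not part of the original source. Shows how a caller might build a
   * Put and hand it to the put(Put) API above, optionally relaxing durability. The row, family,
   * qualifier and value are made-up example values.
   */
  private void examplePut() throws IOException {
    Put p = new Put(Bytes.toBytes("row-1"));
    p.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("v"));
    // Optional: skip the WAL for this edit; the data is lost if the server dies before a flush.
    p.setDurability(Durability.SKIP_WAL);
    put(p);   // all edits for the row are applied atomically via doBatchMutate
  }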
2770
2771   /**
2772    * Struct-like class that tracks the progress of a batch operation,
2773    * accumulating status codes and tracking the index at which processing
2774    * is proceeding.
2775    */
2776   private abstract static class BatchOperation<T> {
2777     T[] operations;
2778     int nextIndexToProcess = 0;
2779     OperationStatus[] retCodeDetails;
2780     WALEdit[] walEditsFromCoprocessors;
2781
2782     public BatchOperation(T[] operations) {
2783       this.operations = operations;
2784       this.retCodeDetails = new OperationStatus[operations.length];
2785       this.walEditsFromCoprocessors = new WALEdit[operations.length];
2786       Arrays.fill(this.retCodeDetails, OperationStatus.NOT_RUN);
2787     }
2788
2789     public abstract Mutation getMutation(int index);
2790     public abstract long getNonceGroup(int index);
2791     public abstract long getNonce(int index);
2792     /** This method is potentially expensive and should only be used for non-replay CP path. */
2793     public abstract Mutation[] getMutationsForCoprocs();
2794     public abstract boolean isInReplay();
2795     public abstract long getReplaySequenceId();
2796
2797     public boolean isDone() {
2798       return nextIndexToProcess == operations.length;
2799     }
2800   }
2801
2802   private static class MutationBatch extends BatchOperation<Mutation> {
2803     private long nonceGroup;
2804     private long nonce;
2805     public MutationBatch(Mutation[] operations, long nonceGroup, long nonce) {
2806       super(operations);
2807       this.nonceGroup = nonceGroup;
2808       this.nonce = nonce;
2809     }
2810
2811     @Override
2812     public Mutation getMutation(int index) {
2813       return this.operations[index];
2814     }
2815
2816     @Override
2817     public long getNonceGroup(int index) {
2818       return nonceGroup;
2819     }
2820
2821     @Override
2822     public long getNonce(int index) {
2823       return nonce;
2824     }
2825
2826     @Override
2827     public Mutation[] getMutationsForCoprocs() {
2828       return this.operations;
2829     }
2830
2831     @Override
2832     public boolean isInReplay() {
2833       return false;
2834     }
2835
2836     @Override
2837     public long getReplaySequenceId() {
2838       return 0;
2839     }
2840   }
2841
2842   private static class ReplayBatch extends BatchOperation<MutationReplay> {
2843     private long replaySeqId = 0;
2844     public ReplayBatch(MutationReplay[] operations, long seqId) {
2845       super(operations);
2846       this.replaySeqId = seqId;
2847     }
2848
2849     @Override
2850     public Mutation getMutation(int index) {
2851       return this.operations[index].mutation;
2852     }
2853
2854     @Override
2855     public long getNonceGroup(int index) {
2856       return this.operations[index].nonceGroup;
2857     }
2858
2859     @Override
2860     public long getNonce(int index) {
2861       return this.operations[index].nonce;
2862     }
2863
2864     @Override
2865     public Mutation[] getMutationsForCoprocs() {
2866       assert false;
2867       throw new RuntimeException("Should not be called for replay batch");
2868     }
2869
2870     @Override
2871     public boolean isInReplay() {
2872       return true;
2873     }
2874
2875     @Override
2876     public long getReplaySequenceId() {
2877       return this.replaySeqId;
2878     }
2879   }
2880
2881   @Override
2882   public OperationStatus[] batchMutate(Mutation[] mutations, long nonceGroup, long nonce)
2883       throws IOException {
2884     // As it stands, this is used for the following:
2885     //  * batchMutate with single mutation - put/delete, separate or from checkAndMutate.
2886     //  * coprocessor calls (see ex. BulkDeleteEndpoint).
2887     // So nonces are not really ever used by HBase. They could be by coprocs, and checkAnd...
2888     return batchMutate(new MutationBatch(mutations, nonceGroup, nonce));
2889   }
2890
2891   public OperationStatus[] batchMutate(Mutation[] mutations) throws IOException {
2892     return batchMutate(mutations, HConstants.NO_NONCE, HConstants.NO_NONCE);
2893   }
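
  /**
   * Illustrative sketch only; not part of the original source. Shows how a caller might inspect
   * the per-operation results of batchMutate(Mutation[]) above, handling the same failure codes
   * that doBatchMutate() below turns into exceptions. The mutations array is assumed to be
   * prepared by the caller.
   */
  private void exampleBatchMutate(Mutation[] mutations) throws IOException {
    OperationStatus[] statuses = batchMutate(mutations);
    for (int i = 0; i < statuses.length; i++) {
      switch (statuses[i].getOperationStatusCode()) {
        case SUCCESS:
          break;                                   // applied
        case BAD_FAMILY:
          LOG.warn("Mutation " + i + " targeted a missing column family");
          break;
        case SANITY_CHECK_FAILURE:
          LOG.warn("Mutation " + i + " failed a sanity check: " + statuses[i].getExceptionMsg());
          break;
        default:
          LOG.warn("Mutation " + i + " not applied: " + statuses[i].getOperationStatusCode());
      }
    }
  }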
2894
2895   @Override
2896   public OperationStatus[] batchReplay(MutationReplay[] mutations, long replaySeqId)
2897       throws IOException {
2898     if (!RegionReplicaUtil.isDefaultReplica(getRegionInfo())
2899         && replaySeqId < lastReplayedOpenRegionSeqId) {
2900       // if it is a secondary replica we should ignore these entries silently
2901       // since they are coming out of order
2902       if (LOG.isTraceEnabled()) {
2903         LOG.trace(getRegionInfo().getEncodedName() + " : "
2904           + "Skipping " + mutations.length + " mutations with replaySeqId=" + replaySeqId
2905           + " which is < than lastReplayedOpenRegionSeqId=" + lastReplayedOpenRegionSeqId);
2906         for (MutationReplay mut : mutations) {
2907           LOG.trace(getRegionInfo().getEncodedName() + " : Skipping : " + mut.mutation);
2908         }
2909       }
2910
2911       OperationStatus[] statuses = new OperationStatus[mutations.length];
2912       for (int i = 0; i < statuses.length; i++) {
2913         statuses[i] = OperationStatus.SUCCESS;
2914       }
2915       return statuses;
2916     }
2917     return batchMutate(new ReplayBatch(mutations, replaySeqId));
2918   }
2919
2920   /**
2921    * Perform a batch of mutations.
2922    * It supports only Put and Delete mutations and will ignore other types passed.
2923    * @param batchOp contains the list of mutations
2924    * @return an array of OperationStatus which internally contains the
2925    *         OperationStatusCode and the exceptionMessage if any.
2926    * @throws IOException
2927    */
2928   OperationStatus[] batchMutate(BatchOperation<?> batchOp) throws IOException {
2929     boolean initialized = false;
2930     Operation op = batchOp.isInReplay() ? Operation.REPLAY_BATCH_MUTATE : Operation.BATCH_MUTATE;
2931     startRegionOperation(op);
2932     try {
2933       while (!batchOp.isDone()) {
2934         if (!batchOp.isInReplay()) {
2935           checkReadOnly();
2936         }
2937         checkResources();
2938
2939         if (!initialized) {
2940           this.writeRequestsCount.add(batchOp.operations.length);
2941           if (!batchOp.isInReplay()) {
2942             doPreBatchMutateHook(batchOp);
2943           }
2944           initialized = true;
2945         }
2946         long addedSize = doMiniBatchMutate(batchOp);
2947         long newSize = this.addAndGetGlobalMemstoreSize(addedSize);
2948         requestFlushIfNeeded(newSize);
2949       }
2950     } finally {
2951       closeRegionOperation(op);
2952     }
2953     return batchOp.retCodeDetails;
2954   }
2955
2956   private void doPreBatchMutateHook(BatchOperation<?> batchOp)
2957       throws IOException {
2958     /* Run coprocessor pre hook outside of locks to avoid deadlock */
2959     WALEdit walEdit = new WALEdit();
2960     if (coprocessorHost != null) {
2961       for (int i = 0 ; i < batchOp.operations.length; i++) {
2962         Mutation m = batchOp.getMutation(i);
2963         if (m instanceof Put) {
2964           if (coprocessorHost.prePut((Put) m, walEdit, m.getDurability())) {
2965             // pre hook says skip this Put
2966             // mark as success and skip in doMiniBatchMutation
2967             batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
2968           }
2969         } else if (m instanceof Delete) {
2970           Delete curDel = (Delete) m;
2971           if (curDel.getFamilyCellMap().isEmpty()) {
2972             // handle deleting a row case
2973             prepareDelete(curDel);
2974           }
2975           if (coprocessorHost.preDelete(curDel, walEdit, m.getDurability())) {
2976             // pre hook says skip this Delete
2977             // mark as success and skip in doMiniBatchMutation
2978             batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
2979           }
2980         } else {
2981           // In case of passing Append mutations along with the Puts and Deletes in batchMutate
2982           // mark the operation return code as failure so that it will not be considered in
2983           // the doMiniBatchMutation
2984           batchOp.retCodeDetails[i] = new OperationStatus(OperationStatusCode.FAILURE,
2985               "Put/Delete mutations only supported in batchMutate() now");
2986         }
2987         if (!walEdit.isEmpty()) {
2988           batchOp.walEditsFromCoprocessors[i] = walEdit;
2989           walEdit = new WALEdit();
2990         }
2991       }
2992     }
2993   }
2994
2995   /**
2996    * Called to do a piece of the batch that came in to {@link #batchMutate(Mutation[], long, long)}
2997    * In here we also handle replay of edits on region recover.
2998    * @return Change in size brought about by applying <code>batchOp</code>
2999    */
3000   @SuppressWarnings("unchecked")
3001   // TODO: This needs a rewrite. Doesn't have to be this long. St.Ack 20160120
3002   private long doMiniBatchMutate(BatchOperation<?> batchOp) throws IOException {
3003     boolean replay = batchOp.isInReplay();
3004     // Variable to note if all Put items are for the same CF -- metrics related
3005     boolean putsCfSetConsistent = true;
3006     // Variable to note if all Delete items are for the same CF -- metrics related
3007     boolean deletesCfSetConsistent = true;
3008     // The set of columnFamilies first seen for Put.
3009     Set<byte[]> putsCfSet = null;
3010     // The set of columnFamilies first seen for Delete.
3011     Set<byte[]> deletesCfSet = null;
3012     long currentNonceGroup = HConstants.NO_NONCE;
3013     long currentNonce = HConstants.NO_NONCE;
3014     WALEdit walEdit = null;
3015     boolean locked = false;
3016     // reference family maps directly so coprocessors can mutate them if desired
3017     Map<byte[], List<Cell>>[] familyMaps = new Map[batchOp.operations.length];
3018     // We try to set up a batch in the range [firstIndex,lastIndexExclusive)
3019     int firstIndex = batchOp.nextIndexToProcess;
3020     int lastIndexExclusive = firstIndex;
3021     boolean success = false;
3022     int noOfPuts = 0;
3023     int noOfDeletes = 0;
3024     WriteEntry writeEntry = null;
3025     int cellCount = 0;
3026     /** Keep track of the locks we hold so we can release them in finally clause */
3027     List<RowLock> acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.operations.length);
3028     try {
3029       // STEP 1. Try to acquire as many locks as we can, and ensure we acquire at least one.
3030       int numReadyToWrite = 0;
3031       long now = EnvironmentEdgeManager.currentTime();
3032       while (lastIndexExclusive < batchOp.operations.length) {
3033         if (checkBatchOp(batchOp, lastIndexExclusive, familyMaps, now)) {
3034           lastIndexExclusive++;
3035           continue;
3036         }
3037         Mutation mutation = batchOp.getMutation(lastIndexExclusive);
3038         // If we haven't got any rows in our batch, we should block to get the next one.
3039         RowLock rowLock = null;
3040         try {
3041           rowLock = getRowLockInternal(mutation.getRow(), true);
3042         } catch (IOException ioe) {
3043           LOG.warn("Failed getting lock, row=" + Bytes.toStringBinary(mutation.getRow()), ioe);
3044         }
3045         if (rowLock == null) {
3046           // We failed to grab another lock
3047           break; // Stop acquiring more rows for this batch
3048         } else {
3049           acquiredRowLocks.add(rowLock);
3050         }
3051
3052         lastIndexExclusive++;
3053         numReadyToWrite++;
3054         if (replay) {
3055           for (List<Cell> cells : mutation.getFamilyCellMap().values()) {
3056             cellCount += cells.size();
3057           }
3058         }
3059         if (mutation instanceof Put) {
3060           // If Column Families stay consistent throughout all of the
3061           // individual puts then metrics can be reported as a multiput across
3062           // column families in the first put.
3063           if (putsCfSet == null) {
3064             putsCfSet = mutation.getFamilyCellMap().keySet();
3065           } else {
3066             putsCfSetConsistent = putsCfSetConsistent
3067                 && mutation.getFamilyCellMap().keySet().equals(putsCfSet);
3068           }
3069         } else {
3070           if (deletesCfSet == null) {
3071             deletesCfSet = mutation.getFamilyCellMap().keySet();
3072           } else {
3073             deletesCfSetConsistent = deletesCfSetConsistent
3074                 && mutation.getFamilyCellMap().keySet().equals(deletesCfSet);
3075           }
3076         }
3077       }
3078
3079       // We've now grabbed as many mutations off the list as we can
3080
3081       // STEP 2. Update any LATEST_TIMESTAMP timestamps
3082       // We should record the timestamp only after we have acquired the rowLock,
3083       // otherwise, newer puts/deletes are not guaranteed to have a newer timestamp
3084       now = EnvironmentEdgeManager.currentTime();
3085       byte[] byteNow = Bytes.toBytes(now);
3086
3087       // Nothing to put/delete -- an exception in the above such as NoSuchColumnFamily?
3088       if (numReadyToWrite <= 0) {
3089         return 0L;
3090       }
3091
3092       for (int i = firstIndex; !replay && i < lastIndexExclusive; i++) {
3093         // skip invalid
3094         if (batchOp.retCodeDetails[i].getOperationStatusCode()
3095             != OperationStatusCode.NOT_RUN) {
3096           // lastIndexExclusive was incremented above.
3097           continue;
3098         }
3099
3100         Mutation mutation = batchOp.getMutation(i);
3101         if (mutation instanceof Put) {
3102           updateCellTimestamps(familyMaps[i].values(), byteNow);
3103           noOfPuts++;
3104         } else {
3105           prepareDeleteTimestamps(mutation, familyMaps[i], byteNow);
3106           noOfDeletes++;
3107         }
3108         rewriteCellTags(familyMaps[i], mutation);
3109         WALEdit fromCP = batchOp.walEditsFromCoprocessors[i];
3110         if (fromCP != null) {
3111           cellCount += fromCP.size();
3112         }
3113         for (List<Cell> cells : familyMaps[i].values()) {
3114           cellCount += cells.size();
3115         }
3116       }
3117       walEdit = new WALEdit(cellCount, replay);
3118       lock(this.updatesLock.readLock(), numReadyToWrite);
3119       locked = true;
3120
3121       // calling the pre CP hook for batch mutation
3122       if (!replay && coprocessorHost != null) {
3123         MiniBatchOperationInProgress<Mutation> miniBatchOp =
3124           new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
3125           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
3126         if (coprocessorHost.preBatchMutate(miniBatchOp)) {
3127           return 0L;
3128         }
3129       }
3130
3131       // STEP 3. Build WAL edit
3132       Durability durability = Durability.USE_DEFAULT;
3133       for (int i = firstIndex; i < lastIndexExclusive; i++) {
3134         // Skip puts that were determined to be invalid during preprocessing
3135         if (batchOp.retCodeDetails[i].getOperationStatusCode() != OperationStatusCode.NOT_RUN) {
3136           continue;
3137         }
3138
3139         Mutation m = batchOp.getMutation(i);
3140         Durability tmpDur = getEffectiveDurability(m.getDurability());
3141         if (tmpDur.ordinal() > durability.ordinal()) {
3142           durability = tmpDur;
3143         }
3144         if (tmpDur == Durability.SKIP_WAL) {
3145           recordMutationWithoutWal(m.getFamilyCellMap());
3146           continue;
3147         }
3148
3149         long nonceGroup = batchOp.getNonceGroup(i);
3150         long nonce = batchOp.getNonce(i);
3151         // In replay, the batch may contain multiple nonces. If so, write WALEdit for each.
3152         // Given how nonces are originally written, these should be contiguous.
3153         // They don't have to be, it will still work, just write more WALEdits than needed.
3154         if (nonceGroup != currentNonceGroup || nonce != currentNonce) {
3155           // Write what we have so far for nonces out to WAL
3156           appendCurrentNonces(m, replay, walEdit, now, currentNonceGroup, currentNonce);
3157           walEdit = new WALEdit(cellCount, replay);
3158           currentNonceGroup = nonceGroup;
3159           currentNonce = nonce;
3160         }
3161
3162         // Add WAL edits by CP
3163         WALEdit fromCP = batchOp.walEditsFromCoprocessors[i];
3164         if (fromCP != null) {
3165           for (Cell cell : fromCP.getCells()) {
3166             walEdit.add(cell);
3167           }
3168         }
3169         addFamilyMapToWALEdit(familyMaps[i], walEdit);
3170       }
3171
3172       // STEP 4. Append the final edit to WAL and sync.
3173       Mutation mutation = batchOp.getMutation(firstIndex);
3174       WALKey walKey = null;
3175       if (replay) {
3176         // use wal key from the original
3177         walKey = new ReplayHLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
3178           this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now,
3179           mutation.getClusterIds(), currentNonceGroup, currentNonce, mvcc);
3180         walKey.setOrigLogSeqNum(batchOp.getReplaySequenceId());
3181       }
3182       // Not sure what is going on here when replay is going on... does the below append get
3183       // called for replayed edits? Am afraid to change it without test.
3184       if (!walEdit.isEmpty()) {
3185         if (!replay) {
3186           // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
3187           walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
3188               this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now,
3189               mutation.getClusterIds(), currentNonceGroup, currentNonce, mvcc,
3190               this.getReplicationScope());
3191         }
3192         // TODO: Use the doAppend methods below... complicated by the replay stuff above.
3193         try {
3194           long txid = this.wal.append(this.getRegionInfo(), walKey,
3195               walEdit, true);
3196           if (txid != 0) sync(txid, durability);
3197           writeEntry = walKey.getWriteEntry();
3198         } catch (IOException ioe) {
3199           if (walKey != null) mvcc.complete(walKey.getWriteEntry());
3200           throw ioe;
3201         }
3202       }
3203       if (walKey == null) {
3204         // No walKey means we skipped the WAL; still begin an mvcc transaction so we get a sequenceid.
3205         writeEntry = mvcc.begin();
3206       }
3207
3208       // STEP 5. Write back to memstore
3209       long addedSize = 0;
3210       for (int i = firstIndex; i < lastIndexExclusive; i++) {
3211         if (batchOp.retCodeDetails[i].getOperationStatusCode() != OperationStatusCode.NOT_RUN) {
3212           continue;
3213         }
3214         addedSize += applyFamilyMapToMemstore(familyMaps[i], replay,
3215             replay? batchOp.getReplaySequenceId(): writeEntry.getWriteNumber());
3216       }
3217
3218       // STEP 6. Complete mvcc.
3219       if (replay) {
3220         this.mvcc.advanceTo(batchOp.getReplaySequenceId());
3221       } else if (writeEntry != null/*Can be null if in replay mode*/) {
3222         mvcc.completeAndWait(writeEntry);
3223         writeEntry = null;
3224       }
3225
3226       // STEP 7. Release row locks, etc.
3227       if (locked) {
3228         this.updatesLock.readLock().unlock();
3229         locked = false;
3230       }
3231       releaseRowLocks(acquiredRowLocks);
3232
3233       // calling the post CP hook for batch mutation
3234       if (!replay && coprocessorHost != null) {
3235         MiniBatchOperationInProgress<Mutation> miniBatchOp =
3236           new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
3237           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
3238         coprocessorHost.postBatchMutate(miniBatchOp);
3239       }
3240
3241       for (int i = firstIndex; i < lastIndexExclusive; i ++) {
3242         if (batchOp.retCodeDetails[i] == OperationStatus.NOT_RUN) {
3243           batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
3244         }
3245       }
3246
3247       // STEP 8. Run coprocessor post hooks. This should be done after the wal is
3248       // synced so that the coprocessor contract is adhered to.
3249       if (!replay && coprocessorHost != null) {
3250         for (int i = firstIndex; i < lastIndexExclusive; i++) {
3251           // only for successful puts
3252           if (batchOp.retCodeDetails[i].getOperationStatusCode()
3253               != OperationStatusCode.SUCCESS) {
3254             continue;
3255           }
3256           Mutation m = batchOp.getMutation(i);
3257           if (m instanceof Put) {
3258             coprocessorHost.postPut((Put) m, walEdit, m.getDurability());
3259           } else {
3260             coprocessorHost.postDelete((Delete) m, walEdit, m.getDurability());
3261           }
3262         }
3263       }
3264
3265       success = true;
3266       return addedSize;
3267     } finally {
3268       // Call complete rather than completeAndWait because we probably had error if walKey != null
3269       if (writeEntry != null) mvcc.complete(writeEntry);
3270       if (locked) {
3271         this.updatesLock.readLock().unlock();
3272       }
3273       releaseRowLocks(acquiredRowLocks);
3274
3275       // See if the column families were consistent through the whole thing.
3276       // if they were then keep them. If they were not then pass a null.
3277       // null will be treated as unknown.
3278       // Total time taken might be involving Puts and Deletes.
3279       // Split the time for puts and deletes based on the total number of Puts and Deletes.
3280
3281       if (noOfPuts > 0) {
3282         // There were some Puts in the batch.
3283         if (this.metricsRegion != null) {
3284           this.metricsRegion.updatePut();
3285         }
3286       }
3287       if (noOfDeletes > 0) {
3288         // There were some Deletes in the batch.
3289         if (this.metricsRegion != null) {
3290           this.metricsRegion.updateDelete();
3291         }
3292       }
3293       if (!success) {
3294         for (int i = firstIndex; i < lastIndexExclusive; i++) {
3295           if (batchOp.retCodeDetails[i].getOperationStatusCode() == OperationStatusCode.NOT_RUN) {
3296             batchOp.retCodeDetails[i] = OperationStatus.FAILURE;
3297           }
3298         }
3299       }
3300       if (coprocessorHost != null && !batchOp.isInReplay()) {
3301         // call the coprocessor hook to do any finalization steps
3302         // after the put is done
3303         MiniBatchOperationInProgress<Mutation> miniBatchOp =
3304             new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
3305                 batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex,
3306                 lastIndexExclusive);
3307         coprocessorHost.postBatchMutateIndispensably(miniBatchOp, success);
3308       }
3309
3310       batchOp.nextIndexToProcess = lastIndexExclusive;
3311     }
3312   }
3313
3314   private void appendCurrentNonces(final Mutation mutation, final boolean replay,
3315       final WALEdit walEdit, final long now, final long currentNonceGroup, final long currentNonce)
3316   throws IOException {
3317     if (walEdit.isEmpty()) return;
3318     if (!replay) throw new IOException("Multiple nonces per batch and not in replay");
3319     WALKey walKey = new WALKey(this.getRegionInfo().getEncodedNameAsBytes(),
3320         this.htableDescriptor.getTableName(), now, mutation.getClusterIds(),
3321         currentNonceGroup, currentNonce, mvcc, this.getReplicationScope());
3322     this.wal.append(this.getRegionInfo(), walKey, walEdit, true);
3323     // Complete the mvcc transaction started down in append else it will block others
3324     this.mvcc.complete(walKey.getWriteEntry());
3325   }
3326
3327   private boolean checkBatchOp(BatchOperation<?> batchOp, final int lastIndexExclusive,
3328       final Map<byte[], List<Cell>>[] familyMaps, final long now)
3329   throws IOException {
3330     boolean skip = false;
3331     // Skip anything that "ran" already
3332     if (batchOp.retCodeDetails[lastIndexExclusive].getOperationStatusCode()
3333         != OperationStatusCode.NOT_RUN) {
3334       return true;
3335     }
3336     Mutation mutation = batchOp.getMutation(lastIndexExclusive);
3337     Map<byte[], List<Cell>> familyMap = mutation.getFamilyCellMap();
3338     // store the family map reference to allow for mutations
3339     familyMaps[lastIndexExclusive] = familyMap;
3340
3341     try {
3342       if (mutation instanceof Put) {
3343         // Check the families in the put. If bad, skip this one.
3344         if (batchOp.isInReplay()) {
3345           removeNonExistentColumnFamilyForReplay(familyMap);
3346         } else {
3347           checkFamilies(familyMap.keySet());
3348         }
3349         checkTimestamps(mutation.getFamilyCellMap(), now);
3350       } else {
3351         prepareDelete((Delete)mutation);
3352       }
3353       checkRow(mutation.getRow(), "doMiniBatchMutation");
3354     } catch (NoSuchColumnFamilyException nscf) {
3355       LOG.warn("No such column family in batch mutation", nscf);
3356       batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
3357           OperationStatusCode.BAD_FAMILY, nscf.getMessage());
3358       skip = true;
3359     } catch (FailedSanityCheckException fsce) {
3360       LOG.warn("Batch Mutation did not pass sanity check", fsce);
3361       batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
3362           OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage());
3363       skip = true;
3364     } catch (WrongRegionException we) {
3365       LOG.warn("Batch mutation had a row that does not belong to this region", we);
3366       batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
3367           OperationStatusCode.SANITY_CHECK_FAILURE, we.getMessage());
3368       skip = true;
3369     }
3370     return skip;
3371   }
3372
3373   /**
3374    * During replay, there may be column families that were removed between the region server
3375    * failure and the replay.
3376    */
3377   private void removeNonExistentColumnFamilyForReplay(final Map<byte[], List<Cell>> familyMap) {
3378     List<byte[]> nonExistentList = null;
3379     for (byte[] family : familyMap.keySet()) {
3380       if (!this.htableDescriptor.hasFamily(family)) {
3381         if (nonExistentList == null) {
3382           nonExistentList = new ArrayList<byte[]>();
3383         }
3384         nonExistentList.add(family);
3385       }
3386     }
3387     if (nonExistentList != null) {
3388       for (byte[] family : nonExistentList) {
3389         // Perhaps schema was changed between crash and replay
3390         LOG.info("No family for " + Bytes.toString(family) + ", omitting it from replay.");
3391         familyMap.remove(family);
3392       }
3393     }
3394   }
3395
3396   /**
3397    * Returns effective durability from the passed durability and
3398    * the table descriptor.
3399    */
3400   protected Durability getEffectiveDurability(Durability d) {
3401     return d == Durability.USE_DEFAULT ? this.durability : d;
3402   }
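
  /**
   * Illustrative sketch only; not part of the original source. Spells out the resolution rule in
   * getEffectiveDurability(Durability) above: an explicit per-mutation durability wins, while
   * USE_DEFAULT falls back to the table's configured durability.
   */
  private void exampleEffectiveDurability() {
    // An explicit setting on the mutation is honored as-is.
    Durability explicit = getEffectiveDurability(Durability.SKIP_WAL);     // == Durability.SKIP_WAL
    // USE_DEFAULT resolves to this.durability, i.e. whatever the table descriptor configured.
    Durability fallback = getEffectiveDurability(Durability.USE_DEFAULT);
  }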
3403
3404   @Override
3405   public boolean checkAndMutate(byte [] row, byte [] family, byte [] qualifier,
3406       CompareOp compareOp, ByteArrayComparable comparator, Mutation mutation,
3407       boolean writeToWAL)
3408   throws IOException{
3409     checkMutationType(mutation, row);
3410     return doCheckAndRowMutate(row, family, qualifier, compareOp, comparator, null,
3411       mutation, writeToWAL);
3412   }
3413
3414   @Override
3415   public boolean checkAndRowMutate(byte [] row, byte [] family, byte [] qualifier,
3416       CompareOp compareOp, ByteArrayComparable comparator, RowMutations rm,
3417       boolean writeToWAL)
3418   throws IOException {
3419     return doCheckAndRowMutate(row, family, qualifier, compareOp, comparator, rm, null,
3420       writeToWAL);
3421   }
3422
3423   /**
3424    * checkAndMutate and checkAndRowMutate are 90% the same. Rather than copy/paste, below has
3425    * switches in the few places where there is deviation.
3426    */
3427   private boolean doCheckAndRowMutate(byte [] row, byte [] family, byte [] qualifier,
3428       CompareOp compareOp, ByteArrayComparable comparator, RowMutations rowMutations,
3429       Mutation mutation, boolean writeToWAL)
3430   throws IOException {
3431     // Could do the below checks but seems wacky with two callers only. Just comment out for now.
3432     // One caller passes a Mutation, the other passes RowMutation. Presume all good so we don't
3433     // need these commented out checks.
3434     // if (rowMutations == null && mutation == null) throw new DoNotRetryIOException("Both null");
3435     // if (rowMutations != null && mutation != null) throw new DoNotRetryIOException("Both set");
3436     checkReadOnly();
3437     // TODO, add check for value length also move this check to the client
3438     checkResources();
3439     startRegionOperation();
3440     try {
3441       Get get = new Get(row);
3442       checkFamily(family);
3443       get.addColumn(family, qualifier);
3444       // Lock row - note that doBatchMutate will relock this row if called
3445       checkRow(row, "doCheckAndRowMutate");
3446       RowLock rowLock = getRowLockInternal(get.getRow(), false);
3447       try {
3448         if (mutation != null && this.getCoprocessorHost() != null) {
3449           // Call coprocessor.
3450           Boolean processed = null;
3451           if (mutation instanceof Put) {
3452             processed = this.getCoprocessorHost().preCheckAndPutAfterRowLock(row, family,
3453                 qualifier, compareOp, comparator, (Put)mutation);
3454           } else if (mutation instanceof Delete) {
3455             processed = this.getCoprocessorHost().preCheckAndDeleteAfterRowLock(row, family,
3456                 qualifier, compareOp, comparator, (Delete)mutation);
3457           }
3458           if (processed != null) {
3459             return processed;
3460           }
3461         }
3462         // NOTE: We used to wait here until mvcc caught up:  mvcc.await();
3463         // Supposition is that now all changes are done under row locks, then when we go to read,
3464         // we'll get the latest on this row.
3465         List<Cell> result = get(get, false);
3466         boolean valueIsNull = comparator.getValue() == null || comparator.getValue().length == 0;
3467         boolean matches = false;
3468         long cellTs = 0;
3469         if (result.size() == 0 && valueIsNull) {
3470           matches = true;
3471         } else if (result.size() > 0 && result.get(0).getValueLength() == 0 && valueIsNull) {
3472           matches = true;
3473           cellTs = result.get(0).getTimestamp();
3474         } else if (result.size() == 1 && !valueIsNull) {
3475           Cell kv = result.get(0);
3476           cellTs = kv.getTimestamp();
3477           int compareResult = CellComparator.compareValue(kv, comparator);
3478           matches = matches(compareOp, compareResult);
3479         }
3480         // If matches put the new put or delete the new delete
3481         if (matches) {
3482           // We have acquired the row lock already. If the system clock is NOT monotonically
3483           // non-decreasing (see HBASE-14070) we should make sure that the mutation has a
3484           // larger timestamp than what was observed via Get. doBatchMutate already does this, but
3485           // there is no way to pass the cellTs. See HBASE-14054.
3486           long now = EnvironmentEdgeManager.currentTime();
3487           long ts = Math.max(now, cellTs); // ensure write is not eclipsed
3488           byte[] byteTs = Bytes.toBytes(ts);
3489           if (mutation != null) {
3490             if (mutation instanceof Put) {
3491               updateCellTimestamps(mutation.getFamilyCellMap().values(), byteTs);
3492             }
3493             // And else 'delete' is not needed since it already does a second get, and sets the
3494             // timestamp from get (see prepareDeleteTimestamps).
3495           } else {
3496             for (Mutation m: rowMutations.getMutations()) {
3497               if (m instanceof Put) {
3498                 updateCellTimestamps(m.getFamilyCellMap().values(), byteTs);
3499               }
3500             }
3501             // And else 'delete' is not needed since it already does a second get, and sets the
3502             // timestamp from get (see prepareDeleteTimestamps).
3503           }
3504           // All edits for the given row (across all column families) must happen atomically.
3505           if (mutation != null) {
3506             doBatchMutate(mutation);
3507           } else {
3508             mutateRow(rowMutations);
3509           }
3510           this.checkAndMutateChecksPassed.increment();
3511           return true;
3512         }
3513         this.checkAndMutateChecksFailed.increment();
3514         return false;
3515       } finally {
3516         rowLock.release();
3517       }
3518     } finally {
3519       closeRegionOperation();
3520     }
3521   }
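
  /**
   * Illustrative sketch only; not part of the original source. Shows how a caller might use the
   * checkAndMutate API above to apply a Put only when the currently stored value equals an
   * expected value. The row/family/qualifier/value names are made-up example values.
   */
  private boolean exampleCheckAndPut() throws IOException {
    byte[] row = Bytes.toBytes("row-1");
    byte[] family = Bytes.toBytes("cf");
    byte[] qualifier = Bytes.toBytes("q");
    Put update = new Put(row);
    update.addColumn(family, qualifier, Bytes.toBytes("new-value"));
    // Apply 'update' only if the cell currently holds "old-value"; returns whether it was applied.
    return checkAndMutate(row, family, qualifier, CompareOp.EQUAL,
        new org.apache.hadoop.hbase.filter.BinaryComparator(Bytes.toBytes("old-value")),
        update, true);
  }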
3522
3523   private void checkMutationType(final Mutation mutation, final byte [] row)
3524   throws DoNotRetryIOException {
3525     boolean isPut = mutation instanceof Put;
3526     if (!isPut && !(mutation instanceof Delete)) {
3527       throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action must be Put or Delete");
3528     }
3529     if (!Bytes.equals(row, mutation.getRow())) {
3530       throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action's getRow must match");
3531     }
3532   }
3533
3534   private boolean matches(final CompareOp compareOp, final int compareResult) {
3535     boolean matches = false;
3536     switch (compareOp) {
3537       case LESS:
3538         matches = compareResult < 0;
3539         break;
3540       case LESS_OR_EQUAL:
3541         matches = compareResult <= 0;
3542         break;
3543       case EQUAL:
3544         matches = compareResult == 0;
3545         break;
3546       case NOT_EQUAL:
3547         matches = compareResult != 0;
3548         break;
3549       case GREATER_OR_EQUAL:
3550         matches = compareResult >= 0;
3551         break;
3552       case GREATER:
3553         matches = compareResult > 0;
3554         break;
3555       default:
3556         throw new RuntimeException("Unknown Compare op " + compareOp.name());
3557     }
3558     return matches;
3559   }
3560
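
  /**
   * Illustrative sketch only; not part of the original source. Shows how matches(CompareOp, int)
   * above pairs with a raw byte comparison, analogous to what doCheckAndRowMutate() does via
   * CellComparator.compareValue.
   */
  private boolean exampleMatches(byte[] storedValue, byte[] expectedValue) {
    int compareResult = Bytes.compareTo(storedValue, expectedValue);
    // EQUAL passes only when compareResult == 0; GREATER when storedValue sorts after expected.
    return matches(CompareOp.EQUAL, compareResult);
  }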
3561
3562   private void doBatchMutate(Mutation mutation) throws IOException {
3563     // Currently this is only called for puts and deletes, so no nonces.
3564     OperationStatus[] batchMutate = this.batchMutate(new Mutation[]{mutation});
3565     if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.SANITY_CHECK_FAILURE)) {
3566       throw new FailedSanityCheckException(batchMutate[0].getExceptionMsg());
3567     } else if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.BAD_FAMILY)) {
3568       throw new NoSuchColumnFamilyException(batchMutate[0].getExceptionMsg());
3569     }
3570   }
3571
3572   /**
3573    * Complete taking the snapshot on the region. Writes the region info and adds references to the
3574    * working snapshot directory.
3575    *
3576    * TODO for api consistency, consider adding another version with no {@link ForeignExceptionSnare}
3577    * arg.  (In the future other cancellable HRegion methods could eventually add a
3578    * {@link ForeignExceptionSnare}, or we could do something fancier).
3579    *
3580    * @param desc snapshot description object
3581    * @param exnSnare ForeignExceptionSnare that captures external exceptions in case we need to
3582    *   bail out.  This is allowed to be null and will just be ignored in that case.
3583    * @throws IOException if there is an external or internal error causing the snapshot to fail
3584    */
3585   public void addRegionToSnapshot(SnapshotDescription desc,
3586       ForeignExceptionSnare exnSnare) throws IOException {
3587     Path rootDir = FSUtils.getRootDir(conf);
3588     Path snapshotDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(desc, rootDir);
3589
3590     SnapshotManifest manifest = SnapshotManifest.create(conf, getFilesystem(),
3591             snapshotDir, desc, exnSnare);
3592     manifest.addRegion(this);
3593
3594     // The regionserver holding the first region of the table is responsible for taking the
3595     // manifest of the mob dir.
3596     if (!Bytes.equals(getRegionInfo().getStartKey(), HConstants.EMPTY_START_ROW))
3597       return;
3598
3599     // if any cf is mob-enabled, add the "mob region" to the manifest.
3600     List<Store> stores = getStores();
3601     for (Store store : stores) {
3602       boolean hasMobStore = store.getFamily().isMobEnabled();
3603       if (hasMobStore) {
3604         // use the .mob as the start key and 0 as the regionid
3605         HRegionInfo mobRegionInfo = MobUtils.getMobRegionInfo(this.getTableDesc().getTableName());
3606         mobRegionInfo.setOffline(true);
3607         manifest.addMobRegion(mobRegionInfo, this.getTableDesc().getColumnFamilies());
3608         return;
3609       }
3610     }
3611   }
3612
3613   @Override
3614   public void updateCellTimestamps(final Iterable<List<Cell>> cellItr, final byte[] now)
3615       throws IOException {
3616     for (List<Cell> cells: cellItr) {
3617       if (cells == null) continue;
3618       assert cells instanceof RandomAccess;
3619       int listSize = cells.size();
3620       for (int i = 0; i < listSize; i++) {
3621         CellUtil.updateLatestStamp(cells.get(i), now, 0);
3622       }
3623     }
3624   }
3625
3626   /**
3627    * Possibly rewrite incoming cell tags.
3628    */
3629   void rewriteCellTags(Map<byte[], List<Cell>> familyMap, final Mutation m) {
3630     // Check if we have any work to do and early out otherwise
3631     // Update these checks as more logic is added here
3632     if (m.getTTL() == Long.MAX_VALUE) {
3633       return;
3634     }
3635
3636     // From this point we know we have some work to do
3637     for (Map.Entry<byte[], List<Cell>> e: familyMap.entrySet()) {
3638       List<Cell> cells = e.getValue();
3639       assert cells instanceof RandomAccess;
3640       int listSize = cells.size();
3641       for (int i = 0; i < listSize; i++) {
3642         Cell cell = cells.get(i);
3643         List<Tag> newTags = TagUtil.carryForwardTags(null, cell);
3644         newTags = TagUtil.carryForwardTTLTag(newTags, m.getTTL());
3645         // Rewrite the cell with the updated set of tags
3646         cells.set(i, new TagRewriteCell(cell, TagUtil.fromList(newTags)));
3647       }
3648     }
3649   }
3650
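
  /**
   * Illustrative sketch only; not part of the original source. Shows the kind of mutation that
   * gives rewriteCellTags(Map, Mutation) above something to do: a Put carrying a mutation-level
   * TTL, which gets rewritten into a per-cell TTL tag. The one-hour TTL is an arbitrary example.
   */
  private static Put examplePutWithTtl(byte[] row, byte[] family, byte[] qualifier, byte[] value) {
    Put p = new Put(row);
    p.addColumn(family, qualifier, value);
    p.setTTL(60L * 60L * 1000L);   // one hour, in milliseconds; Long.MAX_VALUE means "no TTL"
    return p;
  }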
3651   /*
3652    * Check whether we have the resources to support an update.
3653    *
3654    * We throw RegionTooBusyException if above memstore limit
3655    * and expect client to retry using some kind of backoff
3656   */
3657   private void checkResources() throws RegionTooBusyException {
3658     // If catalog region, do not impose resource constraints or block updates.
3659     if (this.getRegionInfo().isMetaRegion()) return;
3660
3661     if (this.memstoreSize.get() > this.blockingMemStoreSize) {
3662       blockedRequestsCount.increment();
3663       requestFlush();
3664       throw new RegionTooBusyException("Above memstore limit, " +
3665           "regionName=" + (this.getRegionInfo() == null ? "unknown" :
3666           this.getRegionInfo().getRegionNameAsString()) +
3667           ", server=" + (this.getRegionServerServices() == null ? "unknown" :
3668           this.getRegionServerServices().getServerName()) +
3669           ", memstoreSize=" + memstoreSize.get() +
3670           ", blockingMemStoreSize=" + blockingMemStoreSize);
3671     }
3672   }
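
  /**
   * Illustrative sketch only; not part of the original source. Shows one way a caller could react
   * to the RegionTooBusyException thrown by checkResources() above: retry the write with a simple
   * backoff. The retry count and sleep values are arbitrary example choices.
   */
  private void examplePutWithBackoff(Put put) throws IOException, InterruptedException {
    for (int attempt = 0; attempt < 3; attempt++) {
      try {
        put(put);
        return;
      } catch (RegionTooBusyException e) {
        // The memstore is above the blocking limit; give the flush a chance to catch up.
        Thread.sleep(100L * (attempt + 1));
      }
    }
    throw new IOException("Region still above memstore limit after retries");
  }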
3673
3674   /**
3675    * @throws IOException Throws exception if region is in read-only mode.
3676    */
3677   protected void checkReadOnly() throws IOException {
3678     if (isReadOnly()) {
3679       throw new DoNotRetryIOException("region is read only");
3680     }
3681   }
3682
3683   protected void checkReadsEnabled() throws IOException {
3684     if (!this.writestate.readsEnabled) {
3685       throw new IOException(getRegionInfo().getEncodedName()
3686         + ": The region's reads are disabled. Cannot serve the request");
3687     }
3688   }
3689
3690   public void setReadsEnabled(boolean readsEnabled) {
3691    if (readsEnabled && !this.writestate.readsEnabled) {
3692      LOG.info(getRegionInfo().getEncodedName() + " : Enabling reads for region.");
3693     }
3694     this.writestate.setReadsEnabled(readsEnabled);
3695   }
3696
3697   /**
3698    * Add updates first to the wal and then add values to memstore.
3699    * Warning: Assumption is caller has lock on passed in row.
3700    * @param edits Cell updates by column
3701    * @throws IOException
3702    */
3703   private void put(final byte [] row, byte [] family, List<Cell> edits)
3704   throws IOException {
3705     NavigableMap<byte[], List<Cell>> familyMap;
3706     familyMap = new TreeMap<byte[], List<Cell>>(Bytes.BYTES_COMPARATOR);
3707
3708     familyMap.put(family, edits);
3709     Put p = new Put(row);
3710     p.setFamilyCellMap(familyMap);
3711     doBatchMutate(p);
3712   }
3713
3714   /**
3715    * Atomically apply the given map of family->edits to the memstore.
3716    * This handles the consistency control on its own, but the caller
3717    * should already have locked updatesLock.readLock(). This also does
3718    * <b>not</b> check the families for validity.
3719    *
3720    * @param familyMap Map of Cells by family
3721    * @return the additional memory usage of the memstore caused by the new entries.
3722    */
3723   private long applyFamilyMapToMemstore(Map<byte[], List<Cell>> familyMap, boolean replay,
3724       long sequenceId)
3725   throws IOException {
3726     long size = 0;
3727     for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
3728       byte[] family = e.getKey();
3729       List<Cell> cells = e.getValue();
3730       assert cells instanceof RandomAccess;
3731       size += applyToMemstore(getStore(family), cells, false, replay, sequenceId);
3732     }
3733     return size;
3734   }
3735
3736   /**
3737    * @param delta If we are doing delta changes -- e.g. increment/append -- then this flag will be
3738    *  set; when set we will run operations that make sense in the increment/append scenario but
3739    *  that do not make sense otherwise.
3740    * @return Memstore change in size on insert of these Cells.
3741    * @see #applyToMemstore(Store, Cell, long)
3742    */
3743   private long applyToMemstore(final Store store, final List<Cell> cells,
3744       final boolean delta, boolean replay, long sequenceId)
3745   throws IOException {
3746     // Any change in how we update Store/MemStore needs to also be done in other applyToMemstore!!!!
3747     long size = 0;
3748     boolean upsert = delta && store.getFamily().getMaxVersions() == 1;
3749     int count = cells.size();
3750     if (upsert) {
3751       size += store.upsert(cells, getSmallestReadPoint());
3752     } else {
3753       for (int i = 0; i < count; i++) {
3754         Cell cell = cells.get(i);
3755         // TODO: This looks wrong.. checking for sequenceid of zero is expensive!!!!! St.Ack
3756         // When is it zero anyways? When replay? Then just rely on that flag.
3757         if (cell.getSequenceId() == 0 || replay) {
3758           CellUtil.setSequenceId(cell, sequenceId);
3759         }
3760         size += store.add(cell);
3761       }
3762     }
3763     return size;
3764   }
3765
3766   /**
3767    * @return Memstore change in size on insert of these Cells.
3768    * @see #applyToMemstore(Store, List, boolean, boolean, long)
3769    */
3770   private long applyToMemstore(final Store store, final Cell cell, long sequenceId)
3771   throws IOException {
3772     // Any change in how we update Store/MemStore needs to also be done in other applyToMemstore!!!!
3773     if (store == null) {
3774       checkFamily(CellUtil.cloneFamily(cell));
3775       // Unreachable because checkFamily will throw exception
3776     }
3777     return store.add(cell);
3778   }
3779
3780   @Override
3781   public void checkFamilies(Collection<byte[]> families) throws NoSuchColumnFamilyException {
3782     for (byte[] family : families) {
3783       checkFamily(family);
3784     }
3785   }
3786
3787   @Override
3788   public void checkTimestamps(final Map<byte[], List<Cell>> familyMap, long now)
3789       throws FailedSanityCheckException {
3790     if (timestampSlop == HConstants.LATEST_TIMESTAMP) {
3791       return;
3792     }
3793     long maxTs = now + timestampSlop;
3794     for (List<Cell> kvs : familyMap.values()) {
3795       assert kvs instanceof RandomAccess;
3796       int listSize  = kvs.size();
3797       for (int i=0; i < listSize; i++) {
3798         Cell cell = kvs.get(i);
3799         // see if the user-side TS is out of range. latest = server-side
3800         long ts = cell.getTimestamp();
3801         if (ts != HConstants.LATEST_TIMESTAMP && ts > maxTs) {
3802           throw new FailedSanityCheckException("Timestamp for KV out of range "
3803               + cell + " (too.new=" + timestampSlop + ")");
3804         }
3805       }
3806     }
3807   }
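
  /**
   * Illustrative sketch only; not part of the original source. Restates the slop check in
   * checkTimestamps(Map, long) above with concrete numbers so the arithmetic is explicit.
   */
  private static boolean exampleWithinTimestampSlop(long cellTs, long now, long slop) {
    long maxTs = now + slop;
    // e.g. now=10000, slop=2000: a cell stamped 13000 is "too new" and fails the sanity check,
    // while LATEST_TIMESTAMP always passes because the server rewrites it to the current time.
    return cellTs == HConstants.LATEST_TIMESTAMP || cellTs <= maxTs;
  }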
3808
3809   /**
3810    * Append the given map of family->edits to a WALEdit data structure.
3811    * This does not write to the WAL itself.
3812    * @param familyMap map of family->edits
3813    * @param walEdit the destination entry to append into
3814    */
3815   private void addFamilyMapToWALEdit(Map<byte[], List<Cell>> familyMap,
3816       WALEdit walEdit) {
3817     for (List<Cell> edits : familyMap.values()) {
3818       assert edits instanceof RandomAccess;
3819       int listSize = edits.size();
3820       for (int i=0; i < listSize; i++) {
3821         Cell cell = edits.get(i);
3822         walEdit.add(cell);
3823       }
3824     }
3825   }
3826
3827   private void requestFlushIfNeeded(long memstoreTotalSize) throws RegionTooBusyException {
3828     if(memstoreTotalSize > this.getMemstoreFlushSize()) {
3829       requestFlush();
3830     }
3831   }
3832
3833   private void requestFlush() {
3834     if (this.rsServices == null) {
3835       return;
3836     }
3837     synchronized (writestate) {
3838       if (this.writestate.isFlushRequested()) {
3839         return;
3840       }
3841       writestate.flushRequested = true;
3842     }
3843     // Make request outside of synchronize block; HBASE-818.
3844     this.rsServices.getFlushRequester().requestFlush(this, false);
3845     if (LOG.isDebugEnabled()) {
3846       LOG.debug("Flush requested on " + this.getRegionInfo().getEncodedName());
3847     }
3848   }
3849
3850   /*
3851    * @param size
3852    * @return True if size is over the flush threshold
3853    */
3854   private boolean isFlushSize(final long size) {
3855     return size > this.memstoreFlushSize;
3856   }
3857
3858   /**
3859    * Read the edits put under this region by wal splitting process.  Put
3860    * the recovered edits back up into this region.
3861    *
3862    * <p>We can ignore any wal message that has a sequence ID that's equal to or
3863    * lower than minSeqId.  (Because we know such messages are already
3864    * reflected in the HFiles.)
3865    *
3866    * <p>While this is running we are putting pressure on memory yet we are
3867    * outside of our usual accounting because we are not yet an onlined region
3868    * (this stuff is being run as part of Region initialization).  This means
3869    * that if we're up against global memory limits, we'll not be flagged to flush
3870    * because we are not online. We can't be flushed by the usual mechanisms anyway;
3871    * we're not yet online so our relative sequenceids are not yet aligned with
3872    * WAL sequenceids -- not till we come up online, post processing of split
3873    * edits.
3874    *
3875    * <p>But to help relieve memory pressure, we at least manage our own heap size by
3876    * flushing if we are in excess of per-region limits.  When flushing, though, we have
3877    * to be careful to avoid using the regionserver/wal sequenceid.  It runs on a
3878    * different track from what is going on here in this region context, so if we
3879    * crashed replaying these edits, but in the midst had a flush that used the
3880    * regionserver wal with a sequenceid in excess of what is going on here
3881    * in this region and its split editlogs, then we could miss edits the
3882    * next time we go to recover. So, we have to flush inline, using seqids that
3883    * make sense in this single region context only -- until we come online.
3884    *
3885    * @param maxSeqIdInStores Any edit found in split editlogs needs to be in excess of
3886    * the maxSeqId for the store to be applied, else it is skipped.
3887    * @return the sequence id of the last edit added to this region out of the
3888    * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
3889    * @throws IOException
3890    */
3891   protected long replayRecoveredEditsIfAny(final Path regiondir,
3892       Map<byte[], Long> maxSeqIdInStores,
3893       final CancelableProgressable reporter, final MonitoredTask status)
3894       throws IOException {
3895     long minSeqIdForTheRegion = -1;
3896     for (Long maxSeqIdInStore : maxSeqIdInStores.values()) {
3897       if (maxSeqIdInStore < minSeqIdForTheRegion || minSeqIdForTheRegion == -1) {
3898         minSeqIdForTheRegion = maxSeqIdInStore;
3899       }
3900     }
3901     long seqid = minSeqIdForTheRegion;
3902
3903     FileSystem fs = this.fs.getFileSystem();
3904     NavigableSet<Path> files = WALSplitter.getSplitEditFilesSorted(fs, regiondir);
3905     if (LOG.isDebugEnabled()) {
3906       LOG.debug("Found " + (files == null ? 0 : files.size())
3907         + " recovered edits file(s) under " + regiondir);
3908     }
3909
3910     if (files == null || files.isEmpty()) return seqid;
3911
3912     for (Path edits: files) {
3913       if (edits == null || !fs.exists(edits)) {
3914         LOG.warn("Null or non-existent edits file: " + edits);
3915         continue;
3916       }
3917       if (isZeroLengthThenDelete(fs, edits)) continue;
3918
3919       long maxSeqId;
3920       String fileName = edits.getName();
3921       maxSeqId = Math.abs(Long.parseLong(fileName));
3922       if (maxSeqId <= minSeqIdForTheRegion) {
3923         if (LOG.isDebugEnabled()) {
3924           String msg = "Maximum sequenceid for this wal is " + maxSeqId
3925             + " and minimum sequenceid for the region is " + minSeqIdForTheRegion
3926             + ", skipped the whole file, path=" + edits;
3927           LOG.debug(msg);
3928         }
3929         continue;
3930       }
3931
3932       try {
3933         // replay the edits. Replay can return -1 if everything is skipped, only update
3934         // if seqId is greater
3935         seqid = Math.max(seqid, replayRecoveredEdits(edits, maxSeqIdInStores, reporter));
3936       } catch (IOException e) {
3937         boolean skipErrors = conf.getBoolean(
3938             HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS,
3939             conf.getBoolean(
3940                 "hbase.skip.errors",
3941                 HConstants.DEFAULT_HREGION_EDITS_REPLAY_SKIP_ERRORS));
3942         if (conf.get("hbase.skip.errors") != null) {
3943           LOG.warn(
3944               "The property 'hbase.skip.errors' has been deprecated. Please use " +
3945               HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + " instead.");
3946         }
3947         if (skipErrors) {
3948           Path p = WALSplitter.moveAsideBadEditsFile(fs, edits);
3949           LOG.error(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS
3950               + "=true so continuing. Renamed " + edits +
3951               " as " + p, e);
3952         } else {
3953           throw e;
3954         }
3955       }
3956     }
3957     // The edits size added into rsAccounting during this replaying will not
3958     // be required any more. So just clear it.
3959     if (this.rsAccounting != null) {
3960       this.rsAccounting.clearRegionReplayEditsSize(getRegionInfo().getRegionName());
3961     }
3962     if (seqid > minSeqIdForTheRegion) {
3963       // Then we added some edits to memory. Flush and cleanup split edit files.
3964       internalFlushcache(null, seqid, stores.values(), status, false);
3965     }
3966     // Now delete the content of recovered edits.  We're done w/ them.
3967     if (files.size() > 0 && this.conf.getBoolean("hbase.region.archive.recovered.edits", false)) {
3968       // For debugging data loss issues!
3969       // If this flag is set, make use of the hfile archiving by making recovered.edits a fake
3970       // column family. Have to fake out file type too by casting our recovered.edits as storefiles
3971       String fakeFamilyName = WALSplitter.getRegionDirRecoveredEditsDir(regiondir).getName();
3972       Set<StoreFile> fakeStoreFiles = new HashSet<StoreFile>(files.size());
3973       for (Path file: files) {
3974         fakeStoreFiles.add(new StoreFile(getRegionFileSystem().getFileSystem(), file, this.conf,
3975           null, null));
3976       }
3977       getRegionFileSystem().removeStoreFiles(fakeFamilyName, fakeStoreFiles);
3978     } else {
3979       for (Path file: files) {
3980         if (!fs.delete(file, false)) {
3981           LOG.error("Failed delete of " + file);
3982         } else {
3983           LOG.debug("Deleted recovered.edits file=" + file);
3984         }
3985       }
3986     }
3987     return seqid;
3988   }
3989
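  // Illustrative sketch (hypothetical helper, not HBase API): the file-selection rule used by
  // replayRecoveredEditsIfAny above. The region-wide floor is the smallest of the per-store
  // maximum sequence ids, and a recovered-edits file (whose name encodes the highest seqid it
  // contains) can be skipped entirely when that maximum is at or below the floor.
  private static boolean shouldReplayRecoveredEditsFile(String editsFileName,
      Map<byte[], Long> maxSeqIdInStores) {
    long minSeqIdForTheRegion = -1;
    for (Long maxSeqIdInStore : maxSeqIdInStores.values()) {
      if (minSeqIdForTheRegion == -1 || maxSeqIdInStore < minSeqIdForTheRegion) {
        minSeqIdForTheRegion = maxSeqIdInStore;
      }
    }
    long maxSeqIdInFile = Math.abs(Long.parseLong(editsFileName));
    return maxSeqIdInFile > minSeqIdForTheRegion;
  }
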
3990   /*
3991    * @param edits File of recovered edits.
3992    * @param maxSeqIdInStores Maximum sequenceid found in each store.  Edits in wal
3993    * must be larger than this to be replayed for each store.
3994    * @param reporter
3995    * @return the sequence id of the last edit added to this region out of the
3996    * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
3997    * @throws IOException
3998    */
3999   private long replayRecoveredEdits(final Path edits,
4000       Map<byte[], Long> maxSeqIdInStores, final CancelableProgressable reporter)
4001     throws IOException {
4002     String msg = "Replaying edits from " + edits;
4003     LOG.info(msg);
4004     MonitoredTask status = TaskMonitor.get().createStatus(msg);
4005     FileSystem fs = this.fs.getFileSystem();
4006
4007     status.setStatus("Opening recovered edits");
4008     WAL.Reader reader = null;
4009     try {
4010       reader = WALFactory.createReader(fs, edits, conf);
4011       long currentEditSeqId = -1;
4012       long currentReplaySeqId = -1;
4013       long firstSeqIdInLog = -1;
4014       long skippedEdits = 0;
4015       long editsCount = 0;
4016       long intervalEdits = 0;
4017       WAL.Entry entry;
4018       Store store = null;
4019       boolean reported_once = false;
4020       ServerNonceManager ng = this.rsServices == null ? null : this.rsServices.getNonceManager();
4021
4022       try {
4023         // How many edits seen before we check elapsed time
4024         int interval = this.conf.getInt("hbase.hstore.report.interval.edits", 2000);
4025         // How often to send a progress report (default 1/2 master timeout)
4026         int period = this.conf.getInt("hbase.hstore.report.period", 300000);
4027         long lastReport = EnvironmentEdgeManager.currentTime();
4028
4029         while ((entry = reader.next()) != null) {
4030           WALKey key = entry.getKey();
4031           WALEdit val = entry.getEdit();
4032
4033           if (ng != null) { // ng is null in some tests, or when nonces are disabled
4034             ng.reportOperationFromWal(key.getNonceGroup(), key.getNonce(), key.getWriteTime());
4035           }
4036
4037           if (reporter != null) {
4038             intervalEdits += val.size();
4039             if (intervalEdits >= interval) {
4040               // Number of edits interval reached
4041               intervalEdits = 0;
4042               long cur = EnvironmentEdgeManager.currentTime();
4043               if (lastReport + period <= cur) {
4044                 status.setStatus("Replaying edits..." +
4045                     " skipped=" + skippedEdits +
4046                     " edits=" + editsCount);
4047                 // Timeout reached
4048                 if (!reporter.progress()) {
4049                   msg = "Progressable reporter failed, stopping replay";
4050                   LOG.warn(msg);
4051                   status.abort(msg);
4052                   throw new IOException(msg);
4053                 }
4054                 reported_once = true;
4055                 lastReport = cur;
4056               }
4057             }
4058           }
4059
4060           if (firstSeqIdInLog == -1) {
4061             firstSeqIdInLog = key.getLogSeqNum();
4062           }
4063           if (currentEditSeqId > key.getLogSeqNum()) {
4064             // when this condition is true, it means we have a serious defect because we need to
4065             // maintain increasing SeqId for WAL edits per region
4066             LOG.error(getRegionInfo().getEncodedName() + " : "
4067                  + "Found decreasing SeqId. PreId=" + currentEditSeqId + " key=" + key
4068                 + "; edit=" + val);
4069           } else {
4070             currentEditSeqId = key.getLogSeqNum();
4071           }
4072           currentReplaySeqId = (key.getOrigLogSeqNum() > 0) ?
4073             key.getOrigLogSeqNum() : currentEditSeqId;
4074
4075           // Start coprocessor replay here. The coprocessor is for each WALEdit
4076           // instead of a KeyValue.
4077           if (coprocessorHost != null) {
4078             status.setStatus("Running pre-WAL-restore hook in coprocessors");
4079             if (coprocessorHost.preWALRestore(this.getRegionInfo(), key, val)) {
4080               // if bypass this wal entry, ignore it ...
4081               continue;
4082             }
4083           }
4084           boolean checkRowWithinBoundary = false;
4085           // Check this edit is for this region.
4086           if (!Bytes.equals(key.getEncodedRegionName(),
4087               this.getRegionInfo().getEncodedNameAsBytes())) {
4088             checkRowWithinBoundary = true;
4089           }
4090
4091           boolean flush = false;
4092           for (Cell cell: val.getCells()) {
4093             // Check this edit is for me. Also, guard against writing the special
4094             // METACOLUMN info such as HBASE::CACHEFLUSH entries
4095             if (CellUtil.matchingFamily(cell, WALEdit.METAFAMILY)) {
4096               // if region names don't match, skip replaying compaction marker
4097               if (!checkRowWithinBoundary) {
4098                 //this is a special edit, we should handle it
4099                 CompactionDescriptor compaction = WALEdit.getCompaction(cell);
4100                 if (compaction != null) {
4101                   //replay the compaction
4102                   replayWALCompactionMarker(compaction, false, true, Long.MAX_VALUE);
4103                 }
4104               }
4105               skippedEdits++;
4106               continue;
4107             }
4108             // Figure which store the edit is meant for.
4109             if (store == null || !CellUtil.matchingFamily(cell, store.getFamily().getName())) {
4110               store = getStore(cell);
4111             }
4112             if (store == null) {
4113               // This should never happen.  Perhaps schema was changed between
4114               // crash and redeploy?
4115               LOG.warn("No family for " + cell);
4116               skippedEdits++;
4117               continue;
4118             }
4119             if (checkRowWithinBoundary && !rowIsInRange(this.getRegionInfo(),
4120               cell.getRowArray(), cell.getRowOffset(), cell.getRowLength())) {
4121               LOG.warn("Row of " + cell + " is not within region boundary");
4122               skippedEdits++;
4123               continue;
4124             }
4125             // Now, figure if we should skip this edit.
4126             if (key.getLogSeqNum() <= maxSeqIdInStores.get(store.getFamily()
4127                 .getName())) {
4128               skippedEdits++;
4129               continue;
4130             }
4131             CellUtil.setSequenceId(cell, currentReplaySeqId);
4132
4133             // Once we are over the limit, restoreEdit will keep returning true to
4134             // flush -- but don't flush until we've played all the kvs that make up
4135             // the WALEdit.
4136             flush |= restoreEdit(store, cell);
4137             editsCount++;
4138           }
4139           if (flush) {
4140             internalFlushcache(null, currentEditSeqId, stores.values(), status, false);
4141           }
4142
4143           if (coprocessorHost != null) {
4144             coprocessorHost.postWALRestore(this.getRegionInfo(), key, val);
4145           }
4146         }
4147       } catch (EOFException eof) {
4148         Path p = WALSplitter.moveAsideBadEditsFile(fs, edits);
4149         msg = "Encountered EOF. Most likely due to Master failure during " +
4150             "wal splitting, so we have this data in another edit.  " +
4151             "Continuing, but renaming " + edits + " as " + p;
4152         LOG.warn(msg, eof);
4153         status.abort(msg);
4154       } catch (IOException ioe) {
4155         // If the IOE resulted from bad file format,
4156         // then this problem is idempotent and retrying won't help
4157         if (ioe.getCause() instanceof ParseException) {
4158           Path p = WALSplitter.moveAsideBadEditsFile(fs, edits);
4159           msg = "File corruption encountered!  " +
4160               "Continuing, but renaming " + edits + " as " + p;
4161           LOG.warn(msg, ioe);
4162           status.setStatus(msg);
4163         } else {
4164           status.abort(StringUtils.stringifyException(ioe));
4165           // other IO errors may be transient (bad network connection,
4166           // checksum exception on one datanode, etc).  throw & retry
4167           throw ioe;
4168         }
4169       }
4170       if (reporter != null && !reported_once) {
4171         reporter.progress();
4172       }
4173       msg = "Applied " + editsCount + ", skipped " + skippedEdits +
4174         ", firstSequenceIdInLog=" + firstSeqIdInLog +
4175         ", maxSequenceIdInLog=" + currentEditSeqId + ", path=" + edits;
4176       status.markComplete(msg);
4177       LOG.debug(msg);
4178       return currentEditSeqId;
4179     } finally {
4180       status.cleanup();
4181       if (reader != null) {
4182          reader.close();
4183       }
4184     }
4185   }
4186
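  // Illustrative sketch (hypothetical class, not HBase API): the progress-report throttle used in
  // the replay loop above. A report is attempted only after a batch of edits has been seen
  // ("interval") AND at least "period" milliseconds have elapsed since the last report, so neither
  // a slow trickle of edits nor a fast burst floods the reporter.
  private static final class ReplayProgressThrottle {
    private final int interval;   // edits to see before we even check the clock
    private final long periodMs;  // minimum time between reports
    private long pendingEdits = 0;
    private long lastReportMs;

    ReplayProgressThrottle(int interval, long periodMs, long nowMs) {
      this.interval = interval;
      this.periodMs = periodMs;
      this.lastReportMs = nowMs;
    }

    /** @return true if the caller should send a progress report now. */
    boolean onEdits(int editCount, long nowMs) {
      pendingEdits += editCount;
      if (pendingEdits < interval) {
        return false;
      }
      pendingEdits = 0;
      if (lastReportMs + periodMs <= nowMs) {
        lastReportMs = nowMs;
        return true;
      }
      return false;
    }
  }
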
4187   /**
4188    * Call to complete a compaction. It's for the case where we find in the WAL a compaction
4189    * that was not finished.  We could find one recovering a WAL after a regionserver crash.
4190    * See HBASE-2331.
4191    */
4192   void replayWALCompactionMarker(CompactionDescriptor compaction, boolean pickCompactionFiles,
4193       boolean removeFiles, long replaySeqId)
4194       throws IOException {
4195     try {
4196       checkTargetRegion(compaction.getEncodedRegionName().toByteArray(),
4197         "Compaction marker from WAL ", compaction);
4198     } catch (WrongRegionException wre) {
4199       if (RegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4200         // skip the compaction marker since it is not for this region
4201         return;
4202       }
4203       throw wre;
4204     }
4205
4206     synchronized (writestate) {
4207       if (replaySeqId < lastReplayedOpenRegionSeqId) {
4208         LOG.warn(getRegionInfo().getEncodedName() + " : "
4209             + "Skipping replaying compaction event :" + TextFormat.shortDebugString(compaction)
4210             + " because its sequence id " + replaySeqId + " is smaller than this regions "
4211             + "lastReplayedOpenRegionSeqId of " + lastReplayedOpenRegionSeqId);
4212         return;
4213       }
4214       if (replaySeqId < lastReplayedCompactionSeqId) {
4215         LOG.warn(getRegionInfo().getEncodedName() + " : "
4216             + "Skipping replaying compaction event :" + TextFormat.shortDebugString(compaction)
4217             + " because its sequence id " + replaySeqId + " is smaller than this regions "
4218             + "lastReplayedCompactionSeqId of " + lastReplayedCompactionSeqId);
4219         return;
4220       } else {
4221         lastReplayedCompactionSeqId = replaySeqId;
4222       }
4223
4224       if (LOG.isDebugEnabled()) {
4225         LOG.debug(getRegionInfo().getEncodedName() + " : "
4226             + "Replaying compaction marker " + TextFormat.shortDebugString(compaction)
4227             + " with seqId=" + replaySeqId + " and lastReplayedOpenRegionSeqId="
4228             + lastReplayedOpenRegionSeqId);
4229       }
4230
4231       startRegionOperation(Operation.REPLAY_EVENT);
4232       try {
4233         Store store = this.getStore(compaction.getFamilyName().toByteArray());
4234         if (store == null) {
4235           LOG.warn(getRegionInfo().getEncodedName() + " : "
4236               + "Found Compaction WAL edit for deleted family:"
4237               + Bytes.toString(compaction.getFamilyName().toByteArray()));
4238           return;
4239         }
4240         store.replayCompactionMarker(compaction, pickCompactionFiles, removeFiles);
4241         logRegionFiles();
4242       } catch (FileNotFoundException ex) {
4243         LOG.warn(getRegionInfo().getEncodedName() + " : "
4244             + "At least one of the store files in compaction: "
4245             + TextFormat.shortDebugString(compaction)
4246             + " doesn't exist any more. Skip loading the file(s)", ex);
4247       } finally {
4248         closeRegionOperation(Operation.REPLAY_EVENT);
4249       }
4250     }
4251   }
4252
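  // Illustrative sketch (hypothetical helper, not HBase API): the ordering guard applied by
  // replayWALCompactionMarker above. A replayed event is dropped when its sequence id is behind
  // either the last region-open event or the last event of the same kind that was already applied;
  // otherwise the per-kind watermark advances to the event's sequence id.
  private static long acceptReplayEventOrSkip(long replaySeqId, long lastOpenRegionSeqId,
      long lastAppliedSeqIdOfSameKind) {
    if (replaySeqId < lastOpenRegionSeqId || replaySeqId < lastAppliedSeqIdOfSameKind) {
      return lastAppliedSeqIdOfSameKind; // stale event: keep the current watermark, skip the event
    }
    return replaySeqId; // accepted: the new watermark for this kind of event
  }
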
4253   void replayWALFlushMarker(FlushDescriptor flush, long replaySeqId) throws IOException {
4254     checkTargetRegion(flush.getEncodedRegionName().toByteArray(),
4255       "Flush marker from WAL ", flush);
4256
4257     if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4258       return; // if primary nothing to do
4259     }
4260
4261     if (LOG.isDebugEnabled()) {
4262       LOG.debug(getRegionInfo().getEncodedName() + " : "
4263           + "Replaying flush marker " + TextFormat.shortDebugString(flush));
4264     }
4265
4266     startRegionOperation(Operation.REPLAY_EVENT); // use region close lock to guard against close
4267     try {
4268       FlushAction action = flush.getAction();
4269       switch (action) {
4270       case START_FLUSH:
4271         replayWALFlushStartMarker(flush);
4272         break;
4273       case COMMIT_FLUSH:
4274         replayWALFlushCommitMarker(flush);
4275         break;
4276       case ABORT_FLUSH:
4277         replayWALFlushAbortMarker(flush);
4278         break;
4279       case CANNOT_FLUSH:
4280         replayWALFlushCannotFlushMarker(flush, replaySeqId);
4281         break;
4282       default:
4283         LOG.warn(getRegionInfo().getEncodedName() + " : " +
4284           "Received a flush event with unknown action, ignoring. " +
4285           TextFormat.shortDebugString(flush));
4286         break;
4287       }
4288
4289       logRegionFiles();
4290     } finally {
4291       closeRegionOperation(Operation.REPLAY_EVENT);
4292     }
4293   }
4294
4295   /** Replay the flush marker from primary region by creating a corresponding snapshot of
4296    * the store memstores, only if the memstores do not have a higher seqId from an earlier wal
4297    * edit (because the events may be coming out of order).
4298    */
4299   @VisibleForTesting
4300   PrepareFlushResult replayWALFlushStartMarker(FlushDescriptor flush) throws IOException {
4301     long flushSeqId = flush.getFlushSequenceNumber();
4302
4303     HashSet<Store> storesToFlush = new HashSet<Store>();
4304     for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) {
4305       byte[] family = storeFlush.getFamilyName().toByteArray();
4306       Store store = getStore(family);
4307       if (store == null) {
4308         LOG.warn(getRegionInfo().getEncodedName() + " : "
4309           + "Received a flush start marker from primary, but the family is not found. Ignoring"
4310           + " StoreFlushDescriptor:" + TextFormat.shortDebugString(storeFlush));
4311         continue;
4312       }
4313       storesToFlush.add(store);
4314     }
4315
4316     MonitoredTask status = TaskMonitor.get().createStatus("Preparing flush " + this);
4317
4318     // we will use writestate as a coarse-grain lock for all the replay events
4319     // (flush, compaction, region open etc)
4320     synchronized (writestate) {
4321       try {
4322         if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) {
4323           LOG.warn(getRegionInfo().getEncodedName() + " : "
4324               + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
4325               + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId "
4326               + " of " + lastReplayedOpenRegionSeqId);
4327           return null;
4328         }
4329         if (numMutationsWithoutWAL.get() > 0) {
4330           numMutationsWithoutWAL.set(0);
4331           dataInMemoryWithoutWAL.set(0);
4332         }
4333
4334         if (!writestate.flushing) {
4335           // we do not have an active snapshot and a corresponding this.prepareFlushResult. This means
4336           // we can just snapshot our memstores and continue as normal.
4337
4338           // invoke internalPrepareFlushCache. Send a null wal since we do not want the flush events in the wal
4339           PrepareFlushResult prepareResult = internalPrepareFlushCache(null,
4340             flushSeqId, storesToFlush, status, false);
4341           if (prepareResult.result == null) {
4342             // save the PrepareFlushResult so that we can use it later from commit flush
4343             this.writestate.flushing = true;
4344             this.prepareFlushResult = prepareResult;
4345             status.markComplete("Flush prepare successful");
4346             if (LOG.isDebugEnabled()) {
4347               LOG.debug(getRegionInfo().getEncodedName() + " : "
4348                   + " Prepared flush with seqId:" + flush.getFlushSequenceNumber());
4349             }
4350           } else {
4351             // special case empty memstore. We will still save the flush result in this case, since
4352             // our memstore is empty, but the primary is still flushing
4353             if (prepareResult.getResult().getResult() ==
4354                   FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) {
4355               this.writestate.flushing = true;
4356               this.prepareFlushResult = prepareResult;
4357               if (LOG.isDebugEnabled()) {
4358                 LOG.debug(getRegionInfo().getEncodedName() + " : "
4359                   + " Prepared empty flush with seqId:" + flush.getFlushSequenceNumber());
4360               }
4361             }
4362             status.abort("Flush prepare failed with " + prepareResult.result);
4363             // nothing much to do; prepare flush failed for some reason.
4364           }
4365           return prepareResult;
4366         } else {
4367           // we already have an active snapshot.
4368           if (flush.getFlushSequenceNumber() == this.prepareFlushResult.flushOpSeqId) {
4369             // They define the same flush. Log and continue.
4370             LOG.warn(getRegionInfo().getEncodedName() + " : "
4371                 + "Received a flush prepare marker with the same seqId: " +
4372                 + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4373                 + prepareFlushResult.flushOpSeqId + ". Ignoring");
4374             // ignore
4375           } else if (flush.getFlushSequenceNumber() < this.prepareFlushResult.flushOpSeqId) {
4376             // We received a flush with a smaller seqNum than what we have prepared. We can only
4377             // ignore this prepare flush request.
4378             LOG.warn(getRegionInfo().getEncodedName() + " : "
4379                 + "Received a flush prepare marker with a smaller seqId: " +
4380                 + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4381                 + prepareFlushResult.flushOpSeqId + ". Ignoring");
4382             // ignore
4383           } else {
4384             // We received a flush with a larger seqNum than what we have prepared
4385             LOG.warn(getRegionInfo().getEncodedName() + " : "
4386                 + "Received a flush prepare marker with a larger seqId: " +
4387                 + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4388                 + prepareFlushResult.flushOpSeqId + ". Ignoring");
4389             // We do not have multiple active snapshots in the memstore or a way to merge current
4390             // memstore snapshot with the contents and resnapshot for now. We cannot take
4391             // another snapshot and drop the previous one because that will cause temporary
4392             // data loss in the secondary. So we ignore this for now, deferring the resolution
4393             // to happen when we see the corresponding flush commit marker. If we have a memstore
4394             // snapshot with x, and later received another prepare snapshot with y (where x < y),
4395             // when we see flush commit for y, we will drop snapshot for x, and can also drop all
4396             // the memstore edits if everything in memstore is < y. This is the usual case for
4397             // RS crash + recovery where we might see consecutive prepare flush wal markers.
4398             // Otherwise, this will cause more memory to be used in the secondary replica until a
4399             // further prepare + commit flush is seen and replayed.
4400           }
4401         }
4402       } finally {
4403         status.cleanup();
4404         writestate.notifyAll();
4405       }
4406     }
4407     return null;
4408   }
4409
4410   @VisibleForTesting
4411   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY",
4412     justification="Intentional; post memstore flush")
4413   void replayWALFlushCommitMarker(FlushDescriptor flush) throws IOException {
4414     MonitoredTask status = TaskMonitor.get().createStatus("Committing flush " + this);
4415
4416     // check whether we have the memstore snapshot with the corresponding seqId. Replays to
4417     // secondary region replicas are in order, except for when the region moves or when the
4418     // region server crashes. In those cases, we may receive replay requests out of order from
4419     // the original seqIds.
4420     synchronized (writestate) {
4421       try {
4422         if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) {
4423           LOG.warn(getRegionInfo().getEncodedName() + " : "
4424             + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
4425             + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId "
4426             + " of " + lastReplayedOpenRegionSeqId);
4427           return;
4428         }
4429
4430         if (writestate.flushing) {
4431           PrepareFlushResult prepareFlushResult = this.prepareFlushResult;
4432           if (flush.getFlushSequenceNumber() == prepareFlushResult.flushOpSeqId) {
4433             if (LOG.isDebugEnabled()) {
4434               LOG.debug(getRegionInfo().getEncodedName() + " : "
4435                   + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber()
4436                   + " and a previous prepared snapshot was found");
4437             }
4438             // This is the regular case where we received commit flush after prepare flush
4439             // corresponding to the same seqId.
4440             replayFlushInStores(flush, prepareFlushResult, true);
4441
4442             // Set down the memstore size by amount of flush.
4443             this.addAndGetGlobalMemstoreSize(-prepareFlushResult.totalFlushableSize);
4444
4445             this.prepareFlushResult = null;
4446             writestate.flushing = false;
4447           } else if (flush.getFlushSequenceNumber() < prepareFlushResult.flushOpSeqId) {
4448             // This should not happen normally. However, let's be safe and guard against these cases
4449             // we received a flush commit with a smaller seqId than what we have prepared
4450             // we will pick the flush file up from this commit (if we have not seen it), but we
4451             // will not drop the memstore
4452             LOG.warn(getRegionInfo().getEncodedName() + " : "
4453                 + "Received a flush commit marker with smaller seqId: "
4454                 + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: "
4455                 + prepareFlushResult.flushOpSeqId + ". Picking up new file, but not dropping"
4456                 +"  prepared memstore snapshot");
4457             replayFlushInStores(flush, prepareFlushResult, false);
4458
4459             // snapshot is not dropped, so memstore sizes should not be decremented
4460             // we still have the prepared snapshot, flushing should still be true
4461           } else {
4462             // This should not happen normally. However, let's be safe and guard against these cases
4463             // we received a flush commit with a larger seqId than what we have prepared
4464             // we will pick the flush file for this. We will also obtain the updates lock and
4465             // look for contents of the memstore to see whether we have edits after this seqId.
4466             // If not, we will drop all the memstore edits and the snapshot as well.
4467             LOG.warn(getRegionInfo().getEncodedName() + " : "
4468                 + "Received a flush commit marker with larger seqId: "
4469                 + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: " +
4470                 prepareFlushResult.flushOpSeqId + ". Picking up new file and dropping prepared"
4471                 +" memstore snapshot");
4472
4473             replayFlushInStores(flush, prepareFlushResult, true);
4474
4475             // Set down the memstore size by amount of flush.
4476             this.addAndGetGlobalMemstoreSize(-prepareFlushResult.totalFlushableSize);
4477
4478             // Inspect the memstore contents to see whether the memstore contains only edits
4479             // with seqId smaller than the flush seqId. If so, we can discard those edits.
4480             dropMemstoreContentsForSeqId(flush.getFlushSequenceNumber(), null);
4481
4482             this.prepareFlushResult = null;
4483             writestate.flushing = false;
4484           }
4485           // If we were waiting to observe a flush or region opening event so as not to show
4486           // partial data after a secondary region crash, we can allow reads now. We can only make
4487           // sure that we are not showing partial data (for example skipping some previous edits)
4488           // until we observe a full flush start and flush commit. So if we were not able to find
4489           // a previous flush we will not enable reads now.
4490           this.setReadsEnabled(true);
4491         } else {
4492           LOG.warn(getRegionInfo().getEncodedName() + " : "
4493               + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber()
4494               + ", but no previous prepared snapshot was found");
4495           // There is no corresponding prepare snapshot from before.
4496           // We will pick up the new flushed file
4497           replayFlushInStores(flush, null, false);
4498
4499           // Inspect the memstore contents to see whether the memstore contains only edits
4500           // with seqId smaller than the flush seqId. If so, we can discard those edits.
4501           dropMemstoreContentsForSeqId(flush.getFlushSequenceNumber(), null);
4502         }
4503
4504         status.markComplete("Flush commit successful");
4505
4506         // Update the last flushed sequence id for region.
4507         this.maxFlushedSeqId = flush.getFlushSequenceNumber();
4508
4509         // advance the mvcc read point so that the new flushed file is visible.
4510         mvcc.advanceTo(flush.getFlushSequenceNumber());
4511
4512       } catch (FileNotFoundException ex) {
4513         LOG.warn(getRegionInfo().getEncodedName() + " : "
4514             + "At least one of the store files in flush: " + TextFormat.shortDebugString(flush)
4515             + " doesn't exist any more. Skip loading the file(s)", ex);
4516       }
4517       finally {
4518         status.cleanup();
4519         writestate.notifyAll();
4520       }
4521     }
4522
4523     // C. Finally notify anyone waiting on memstore to clear:
4524     // e.g. checkResources().
4525     synchronized (this) {
4526       notifyAll(); // FindBugs NN_NAKED_NOTIFY
4527     }
4528   }
4529
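  // Illustrative sketch (hypothetical types, not HBase API): the three-way decision made by
  // replayWALFlushCommitMarker above when a prepared snapshot exists. Equal seqids are the normal
  // case (pick up the flush files and drop the snapshot); a smaller commit only picks up files and
  // keeps the snapshot; a larger commit picks up files, drops the snapshot, and also drops any
  // memstore edits at or below the commit seqid.
  private enum FlushCommitAction { REPLAY_AND_DROP_SNAPSHOT, REPLAY_ONLY, REPLAY_DROP_SNAPSHOT_AND_MEMSTORE }

  private static FlushCommitAction decideFlushCommit(long commitSeqId, long preparedSeqId) {
    if (commitSeqId == preparedSeqId) {
      return FlushCommitAction.REPLAY_AND_DROP_SNAPSHOT;
    } else if (commitSeqId < preparedSeqId) {
      return FlushCommitAction.REPLAY_ONLY;
    } else {
      return FlushCommitAction.REPLAY_DROP_SNAPSHOT_AND_MEMSTORE;
    }
  }
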
4530   /**
4531    * Replays the given flush descriptor by opening the flush files in stores and dropping the
4532    * memstore snapshots if requested.
4533    * @param flush
4534    * @param prepareFlushResult
4535    * @param dropMemstoreSnapshot
4536    * @throws IOException
4537    */
4538   private void replayFlushInStores(FlushDescriptor flush, PrepareFlushResult prepareFlushResult,
4539       boolean dropMemstoreSnapshot)
4540       throws IOException {
4541     for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) {
4542       byte[] family = storeFlush.getFamilyName().toByteArray();
4543       Store store = getStore(family);
4544       if (store == null) {
4545         LOG.warn(getRegionInfo().getEncodedName() + " : "
4546             + "Received a flush commit marker from primary, but the family is not found."
4547             + "Ignoring StoreFlushDescriptor:" + storeFlush);
4548         continue;
4549       }
4550       List<String> flushFiles = storeFlush.getFlushOutputList();
4551       StoreFlushContext ctx = null;
4552       long startTime = EnvironmentEdgeManager.currentTime();
4553       if (prepareFlushResult == null || prepareFlushResult.storeFlushCtxs == null) {
4554         ctx = store.createFlushContext(flush.getFlushSequenceNumber());
4555       } else {
4556         ctx = prepareFlushResult.storeFlushCtxs.get(family);
4557         startTime = prepareFlushResult.startTime;
4558       }
4559
4560       if (ctx == null) {
4561         LOG.warn(getRegionInfo().getEncodedName() + " : "
4562             + "Unexpected: flush commit marker received from store "
4563             + Bytes.toString(family) + " but no associated flush context. Ignoring");
4564         continue;
4565       }
4566
4567       ctx.replayFlush(flushFiles, dropMemstoreSnapshot); // replay the flush
4568
4569       // Record latest flush time
4570       this.lastStoreFlushTimeMap.put(store, startTime);
4571     }
4572   }
4573
4574   /**
4575    * Drops the memstore contents after replaying a flush descriptor or region open event,
4576    * if the memstore edits have seqNums smaller than the given seq id.
4577    * @throws IOException
4578    */
4579   private long dropMemstoreContentsForSeqId(long seqId, Store store) throws IOException {
4580     long totalFreedSize = 0;
4581     this.updatesLock.writeLock().lock();
4582     try {
4583
4584       long currentSeqId = mvcc.getReadPoint();
4585       if (seqId >= currentSeqId) {
4586         // then we can drop the memstore contents since everything is below this seqId
4587         LOG.info(getRegionInfo().getEncodedName() + " : "
4588             + "Dropping memstore contents as well since replayed flush seqId: "
4589             + seqId + " is greater than current seqId:" + currentSeqId);
4590
4591         // Prepare flush (take a snapshot) and then abort (drop the snapshot)
4592         if (store == null) {
4593           for (Store s : stores.values()) {
4594             totalFreedSize += doDropStoreMemstoreContentsForSeqId(s, currentSeqId);
4595           }
4596         } else {
4597           totalFreedSize += doDropStoreMemstoreContentsForSeqId(store, currentSeqId);
4598         }
4599       } else {
4600         LOG.info(getRegionInfo().getEncodedName() + " : "
4601             + "Not dropping memstore contents since replayed flush seqId: "
4602             + seqId + " is smaller than current seqId:" + currentSeqId);
4603       }
4604     } finally {
4605       this.updatesLock.writeLock().unlock();
4606     }
4607     return totalFreedSize;
4608   }
4609
4610   private long doDropStoreMemstoreContentsForSeqId(Store s, long currentSeqId) throws IOException {
4611     long snapshotSize = s.getFlushableSize();
4612     this.addAndGetGlobalMemstoreSize(-snapshotSize);
4613     StoreFlushContext ctx = s.createFlushContext(currentSeqId);
4614     ctx.prepare();
4615     ctx.abort();
4616     return snapshotSize;
4617   }
4618
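  // Illustrative sketch (hypothetical names, not HBase API): the accounting done by
  // dropMemstoreContentsForSeqId / doDropStoreMemstoreContentsForSeqId above. Contents are dropped
  // only when the replayed flush covers everything currently in the memstores (flushSeqId >= the
  // mvcc read point); the freed bytes are the sum of the per-store snapshot sizes, which the
  // caller then subtracts from the global memstore size.
  private static long freedBytesIfDroppable(long flushSeqId, long mvccReadPoint,
      List<Long> perStoreSnapshotSizes) {
    if (flushSeqId < mvccReadPoint) {
      return 0L; // memstore may contain newer edits; nothing can be dropped
    }
    long freed = 0L;
    for (long size : perStoreSnapshotSizes) {
      freed += size;
    }
    return freed;
  }
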
4619   private void replayWALFlushAbortMarker(FlushDescriptor flush) {
4620     // nothing to do for now. A flush abort will cause a RS abort which means that the region
4621     // will be opened somewhere else later. We will see the region open event soon, and replaying
4622     // that will drop the snapshot
4623   }
4624
4625   private void replayWALFlushCannotFlushMarker(FlushDescriptor flush, long replaySeqId) {
4626     synchronized (writestate) {
4627       if (this.lastReplayedOpenRegionSeqId > replaySeqId) {
4628         LOG.warn(getRegionInfo().getEncodedName() + " : "
4629           + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
4630           + " because its sequence id " + replaySeqId + " is smaller than this regions "
4631           + "lastReplayedOpenRegionSeqId of " + lastReplayedOpenRegionSeqId);
4632         return;
4633       }
4634
4635       // If we were waiting to observe a flush or region opening event so as not to show partial
4636       // data after a secondary region crash, we can allow reads now. This event means that the
4637       // primary was not able to flush because the memstore was empty when we requested the flush. By the
4638       // time we observe this, we are guaranteed to have up to date seqId with our previous
4639       // assignment.
4640       this.setReadsEnabled(true);
4641     }
4642   }
4643
4644   @VisibleForTesting
4645   PrepareFlushResult getPrepareFlushResult() {
4646     return prepareFlushResult;
4647   }
4648
4649   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY",
4650       justification="Intentional; cleared the memstore")
4651   void replayWALRegionEventMarker(RegionEventDescriptor regionEvent) throws IOException {
4652     checkTargetRegion(regionEvent.getEncodedRegionName().toByteArray(),
4653       "RegionEvent marker from WAL ", regionEvent);
4654
4655     startRegionOperation(Operation.REPLAY_EVENT);
4656     try {
4657       if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4658         return; // if primary nothing to do
4659       }
4660
4661       if (regionEvent.getEventType() == EventType.REGION_CLOSE) {
4662         // nothing to do on REGION_CLOSE for now.
4663         return;
4664       }
4665       if (regionEvent.getEventType() != EventType.REGION_OPEN) {
4666         LOG.warn(getRegionInfo().getEncodedName() + " : "
4667             + "Unknown region event received, ignoring :"
4668             + TextFormat.shortDebugString(regionEvent));
4669         return;
4670       }
4671
4672       if (LOG.isDebugEnabled()) {
4673         LOG.debug(getRegionInfo().getEncodedName() + " : "
4674           + "Replaying region open event marker " + TextFormat.shortDebugString(regionEvent));
4675       }
4676
4677       // we will use writestate as a coarse-grain lock for all the replay events
4678       synchronized (writestate) {
4679         // Replication can deliver events out of order when primary region moves or the region
4680         // server crashes, since there is no coordination between replication of different wal files
4681         // belonging to different region servers. We have to safeguard against this case by using
4682         // region open event's seqid. Since this is the first event that the region puts (after
4683         // possibly flushing recovered.edits), after seeing this event, we can ignore every edit
4684         // smaller than this seqId
4685         if (this.lastReplayedOpenRegionSeqId <= regionEvent.getLogSequenceNumber()) {
4686           this.lastReplayedOpenRegionSeqId = regionEvent.getLogSequenceNumber();
4687         } else {
4688           LOG.warn(getRegionInfo().getEncodedName() + " : "
4689             + "Skipping replaying region event :" + TextFormat.shortDebugString(regionEvent)
4690             + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId "
4691             + " of " + lastReplayedOpenRegionSeqId);
4692           return;
4693         }
4694
4695         // region open lists all the files that the region has at the time of the opening. Just pick
4696         // all the files and drop prepared flushes and empty memstores
4697         for (StoreDescriptor storeDescriptor : regionEvent.getStoresList()) {
4698           // stores of primary may be different now
4699           byte[] family = storeDescriptor.getFamilyName().toByteArray();
4700           Store store = getStore(family);
4701           if (store == null) {
4702             LOG.warn(getRegionInfo().getEncodedName() + " : "
4703                 + "Received a region open marker from primary, but the family is not found. "
4704                 + "Ignoring. StoreDescriptor:" + storeDescriptor);
4705             continue;
4706           }
4707
4708           long storeSeqId = store.getMaxSequenceId();
4709           List<String> storeFiles = storeDescriptor.getStoreFileList();
4710           try {
4711             store.refreshStoreFiles(storeFiles); // replace the files with the new ones
4712           } catch (FileNotFoundException ex) {
4713             LOG.warn(getRegionInfo().getEncodedName() + " : "
4714                     + "At least one of the store files: " + storeFiles
4715                     + " doesn't exist any more. Skip loading the file(s)", ex);
4716             continue;
4717           }
4718           if (store.getMaxSequenceId() != storeSeqId) {
4719             // Record latest flush time if we picked up new files
4720             lastStoreFlushTimeMap.put(store, EnvironmentEdgeManager.currentTime());
4721           }
4722
4723           if (writestate.flushing) {
4724             // only drop memstore snapshots if they are smaller than last flush for the store
4725             if (this.prepareFlushResult.flushOpSeqId <= regionEvent.getLogSequenceNumber()) {
4726               StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null ?
4727                   null : this.prepareFlushResult.storeFlushCtxs.get(family);
4728               if (ctx != null) {
4729                 long snapshotSize = store.getFlushableSize();
4730                 ctx.abort();
4731                 this.addAndGetGlobalMemstoreSize(-snapshotSize);
4732                 this.prepareFlushResult.storeFlushCtxs.remove(family);
4733               }
4734             }
4735           }
4736
4737           // Drop the memstore contents if they are now smaller than the latest seen flushed file
4738           dropMemstoreContentsForSeqId(regionEvent.getLogSequenceNumber(), store);
4739           if (storeSeqId > this.maxFlushedSeqId) {
4740             this.maxFlushedSeqId = storeSeqId;
4741           }
4742         }
4743
4744         // if all stores ended up dropping their snapshots, we can safely drop the
4745         // prepareFlushResult
4746         dropPrepareFlushIfPossible();
4747
4748         // advance the mvcc read point so that the new flushed file is visible.
4749         mvcc.await();
4750
4751         // If we were waiting to observe a flush or region opening event so as not to show partial
4752         // data after a secondary region crash, we can allow reads now.
4753         this.setReadsEnabled(true);
4754
4755         // C. Finally notify anyone waiting on memstore to clear:
4756         // e.g. checkResources().
4757         synchronized (this) {
4758           notifyAll(); // FindBugs NN_NAKED_NOTIFY
4759         }
4760       }
4761       logRegionFiles();
4762     } finally {
4763       closeRegionOperation(Operation.REPLAY_EVENT);
4764     }
4765   }
4766
4767   void replayWALBulkLoadEventMarker(WALProtos.BulkLoadDescriptor bulkLoadEvent) throws IOException {
4768     checkTargetRegion(bulkLoadEvent.getEncodedRegionName().toByteArray(),
4769       "BulkLoad marker from WAL ", bulkLoadEvent);
4770
4771     if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4772       return; // if primary nothing to do
4773     }
4774
4775     if (LOG.isDebugEnabled()) {
4776       LOG.debug(getRegionInfo().getEncodedName() + " : "
4777               +  "Replaying bulkload event marker " + TextFormat.shortDebugString(bulkLoadEvent));
4778     }
4779     // check if multiple families involved
4780     boolean multipleFamilies = false;
4781     byte[] family = null;
4782     for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) {
4783       byte[] fam = storeDescriptor.getFamilyName().toByteArray();
4784       if (family == null) {
4785         family = fam;
4786       } else if (!Bytes.equals(family, fam)) {
4787         multipleFamilies = true;
4788         break;
4789       }
4790     }
4791
4792     startBulkRegionOperation(multipleFamilies);
4793     try {
4794       // we will use writestate as a coarse-grain lock for all the replay events
4795       synchronized (writestate) {
4796         // Replication can deliver events out of order when primary region moves or the region
4797         // server crashes, since there is no coordination between replication of different wal files
4798         // belonging to different region servers. We have to safeguard against this case by using
4799         // region open event's seqid. Since this is the first event that the region puts (after
4800         // possibly flushing recovered.edits), after seeing this event, we can ignore every edit
4801         // smaller than this seqId
4802         if (bulkLoadEvent.getBulkloadSeqNum() >= 0
4803             && this.lastReplayedOpenRegionSeqId >= bulkLoadEvent.getBulkloadSeqNum()) {
4804           LOG.warn(getRegionInfo().getEncodedName() + " : "
4805               + "Skipping replaying bulkload event :"
4806               + TextFormat.shortDebugString(bulkLoadEvent)
4807               + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
4808               + " =" + lastReplayedOpenRegionSeqId);
4809
4810           return;
4811         }
4812
4813         for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) {
4814           // stores of primary may be different now
4815           family = storeDescriptor.getFamilyName().toByteArray();
4816           Store store = getStore(family);
4817           if (store == null) {
4818             LOG.warn(getRegionInfo().getEncodedName() + " : "
4819                     + "Received a bulk load marker from primary, but the family is not found. "
4820                     + "Ignoring. StoreDescriptor:" + storeDescriptor);
4821             continue;
4822           }
4823
4824           List<String> storeFiles = storeDescriptor.getStoreFileList();
4825           for (String storeFile : storeFiles) {
4826             StoreFileInfo storeFileInfo = null;
4827             try {
4828               storeFileInfo = fs.getStoreFileInfo(Bytes.toString(family), storeFile);
4829               store.bulkLoadHFile(storeFileInfo);
4830             } catch(FileNotFoundException ex) {
4831               LOG.warn(getRegionInfo().getEncodedName() + " : "
4832                       + ((storeFileInfo != null) ? storeFileInfo.toString() :
4833                             (new Path(Bytes.toString(family), storeFile)).toString())
4834                       + " doesn't exist any more. Skip loading the file");
4835             }
4836           }
4837         }
4838       }
4839       if (bulkLoadEvent.getBulkloadSeqNum() > 0) {
4840         mvcc.advanceTo(bulkLoadEvent.getBulkloadSeqNum());
4841       }
4842     } finally {
4843       closeBulkRegionOperation();
4844     }
4845   }
4846
4847   /**
4848    * If all stores ended up dropping their snapshots, we can safely drop the prepareFlushResult
4849    */
4850   private void dropPrepareFlushIfPossible() {
4851     if (writestate.flushing) {
4852       boolean canDrop = true;
4853       if (prepareFlushResult.storeFlushCtxs != null) {
4854         for (Entry<byte[], StoreFlushContext> entry
4855             : prepareFlushResult.storeFlushCtxs.entrySet()) {
4856           Store store = getStore(entry.getKey());
4857           if (store == null) {
4858             continue;
4859           }
4860           if (store.getSnapshotSize() > 0) {
4861             canDrop = false;
4862             break;
4863           }
4864         }
4865       }
4866
4867       // this means that all the stores in the region have finished flushing, but the WAL marker
4868       // may not have been written or we did not receive it yet.
4869       if (canDrop) {
4870         writestate.flushing = false;
4871         this.prepareFlushResult = null;
4872       }
4873     }
4874   }
4875
4876   @Override
4877   public boolean refreshStoreFiles() throws IOException {
4878     return refreshStoreFiles(false);
4879   }
4880
4881   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NN_NAKED_NOTIFY",
4882       justification="Notify is about post replay. Intentional")
4883   protected boolean refreshStoreFiles(boolean force) throws IOException {
4884     if (!force && ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4885       return false; // if primary nothing to do
4886     }
4887
4888     if (LOG.isDebugEnabled()) {
4889       LOG.debug(getRegionInfo().getEncodedName() + " : "
4890           + "Refreshing store files to see whether we can free up memstore");
4891     }
4892
4893     long totalFreedSize = 0;
4894
4895     long smallestSeqIdInStores = Long.MAX_VALUE;
4896
4897     startRegionOperation(); // obtain region close lock
4898     try {
4899       synchronized (writestate) {
4900         for (Store store : getStores()) {
4901           // TODO: some stores might see new data from flush, while others do not which
4902           // MIGHT break atomic edits across column families.
4903           long maxSeqIdBefore = store.getMaxSequenceId();
4904
4905           // refresh the store files. This is similar to observing a region open wal marker.
4906           store.refreshStoreFiles();
4907
4908           long storeSeqId = store.getMaxSequenceId();
4909           if (storeSeqId < smallestSeqIdInStores) {
4910             smallestSeqIdInStores = storeSeqId;
4911           }
4912
4913           // see whether we can drop the memstore or the snapshot
4914           if (storeSeqId > maxSeqIdBefore) {
4915
4916             if (writestate.flushing) {
4917               // only drop memstore snapshots if they are smaller than last flush for the store
4918               if (this.prepareFlushResult.flushOpSeqId <= storeSeqId) {
4919                 StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null ?
4920                     null : this.prepareFlushResult.storeFlushCtxs.get(store.getFamily().getName());
4921                 if (ctx != null) {
4922                   long snapshotSize = store.getFlushableSize();
4923                   ctx.abort();
4924                   this.addAndGetGlobalMemstoreSize(-snapshotSize);
4925                   this.prepareFlushResult.storeFlushCtxs.remove(store.getFamily().getName());
4926                   totalFreedSize += snapshotSize;
4927                 }
4928               }
4929             }
4930
4931             // Drop the memstore contents if they are now smaller than the latest seen flushed file
4932             totalFreedSize += dropMemstoreContentsForSeqId(storeSeqId, store);
4933           }
4934         }
4935
4936         // if all stores ended up dropping their snapshots, we can safely drop the
4937         // prepareFlushResult
4938         dropPrepareFlushIfPossible();
4939
4940         // advance the mvcc read point so that the new flushed files are visible.
4941         // either greater than flush seq number or they were already picked up via flush.
4942         for (Store s : getStores()) {
4943           mvcc.advanceTo(s.getMaxMemstoreTS());
4944         }
4945
4946
4947         // smallestSeqIdInStores is the seqId that we have a corresponding hfile for. We can safely
4948         // skip all edits that are to be replayed in the future that have a smaller seqId
4949         // than this. We are updating lastReplayedOpenRegionSeqId so that we can skip all edits
4950         // that we have picked the flush files for
4951         if (this.lastReplayedOpenRegionSeqId < smallestSeqIdInStores) {
4952           this.lastReplayedOpenRegionSeqId = smallestSeqIdInStores;
4953         }
4954       }
4955       // C. Finally notify anyone waiting on memstore to clear:
4956       // e.g. checkResources().
4957       synchronized (this) {
4958         notifyAll(); // FindBugs NN_NAKED_NOTIFY
4959       }
4960       return totalFreedSize > 0;
4961     } finally {
4962       closeRegionOperation();
4963     }
4964   }
4965
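  // Illustrative sketch (hypothetical helper, not HBase API): how refreshStoreFiles above raises
  // the replay floor. The smallest per-store maximum sequence id is the highest seqid for which
  // every store is already covered by an hfile, so the floor used to skip future replayed edits
  // can be raised to it (but never lowered).
  private static long raisedReplayFloor(long currentFloor, Collection<Long> perStoreMaxSeqIds) {
    if (perStoreMaxSeqIds.isEmpty()) {
      return currentFloor;
    }
    long smallest = Long.MAX_VALUE;
    for (long maxSeqId : perStoreMaxSeqIds) {
      if (maxSeqId < smallest) {
        smallest = maxSeqId;
      }
    }
    return Math.max(currentFloor, smallest);
  }
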
4966   private void logRegionFiles() {
4967     if (LOG.isTraceEnabled()) {
4968       LOG.trace(getRegionInfo().getEncodedName() + " : Store files for region: ");
4969       for (Store s : stores.values()) {
4970         Collection<StoreFile> storeFiles = s.getStorefiles();
4971         if (storeFiles == null) continue;
4972         for (StoreFile sf : storeFiles) {
4973           LOG.trace(getRegionInfo().getEncodedName() + " : " + sf);
4974         }
4975       }
4976     }
4977   }
4978
4979   /** Checks whether the given regionName is either equal to our region's name, or is
4980    * the name of the primary region for the range that this secondary replica covers.
4981    */
4982   private void checkTargetRegion(byte[] encodedRegionName, String exceptionMsg, Object payload)
4983       throws WrongRegionException {
4984     if (Bytes.equals(this.getRegionInfo().getEncodedNameAsBytes(), encodedRegionName)) {
4985       return;
4986     }
4987
4988     if (!RegionReplicaUtil.isDefaultReplica(this.getRegionInfo()) &&
4989         Bytes.equals(encodedRegionName,
4990           this.fs.getRegionInfoForFS().getEncodedNameAsBytes())) {
4991       return;
4992     }
4993
4994     throw new WrongRegionException(exceptionMsg + payload
4995       + " targetted for region " + Bytes.toStringBinary(encodedRegionName)
4996       + " does not match this region: " + this.getRegionInfo());
4997   }
4998
4999   /**
5000    * Used by tests
5001    * @param s Store to add edit to.
5002    * @param cell Cell to add.
5003    * @return True if we should flush.
5004    */
5005   protected boolean restoreEdit(final Store s, final Cell cell) {
5006     long kvSize = s.add(cell);
5007     if (this.rsAccounting != null) {
5008       rsAccounting.addAndGetRegionReplayEditsSize(getRegionInfo().getRegionName(), kvSize);
5009     }
5010     return isFlushSize(this.addAndGetGlobalMemstoreSize(kvSize));
5011   }
5012
5013   /*
5014    * @param fs
5015    * @param p File to check.
5016    * @return True if file was zero-length (and if so, we'll delete it in here).
5017    * @throws IOException
5018    */
5019   private static boolean isZeroLengthThenDelete(final FileSystem fs, final Path p)
5020       throws IOException {
5021     FileStatus stat = fs.getFileStatus(p);
5022     if (stat.getLen() > 0) return false;
5023     LOG.warn("File " + p + " is zero-length, deleting.");
5024     fs.delete(p, false);
5025     return true;
5026   }
5027
5028   protected HStore instantiateHStore(final HColumnDescriptor family) throws IOException {
5029     if (family.isMobEnabled()) {
5030       if (HFile.getFormatVersion(this.conf) < HFile.MIN_FORMAT_VERSION_WITH_TAGS) {
5031         throw new IOException("A minimum HFile version of "
5032             + HFile.MIN_FORMAT_VERSION_WITH_TAGS
5033             + " is required for MOB feature. Consider setting " + HFile.FORMAT_VERSION_KEY
5034             + " accordingly.");
5035       }
5036       return new HMobStore(this, family, this.conf);
5037     }
5038     return new HStore(this, family, this.conf);
5039   }
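
  // Illustrative sketch only, not part of the original class: MOB-enabled column families require
  // an HFile format version of at least HFile.MIN_FORMAT_VERSION_WITH_TAGS (see the check in
  // instantiateHStore above), so a deployment would typically raise the version in its
  // Configuration roughly as below before enabling MOB.
  static void exampleEnableMobCompatibleHFileVersion(final Configuration conf) {
    // FORMAT_VERSION_KEY is the same property that HFile.getFormatVersion(conf) reads above.
    conf.setInt(HFile.FORMAT_VERSION_KEY, HFile.MIN_FORMAT_VERSION_WITH_TAGS);
  }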
5040
5041   @Override
5042   public Store getStore(final byte[] column) {
5043     return this.stores.get(column);
5044   }
5045
5046   /**
5047    * Return the HStore instance for the given cell's family. Does not do any copy: as the number
5048    *  of stores is limited, we iterate over them.
5049    */
5050   private Store getStore(Cell cell) {
5051     for (Map.Entry<byte[], Store> famStore : stores.entrySet()) {
5052       if (Bytes.equals(
5053           cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength(),
5054           famStore.getKey(), 0, famStore.getKey().length)) {
5055         return famStore.getValue();
5056       }
5057     }
5058
5059     return null;
5060   }
5061
5062   @Override
5063   public List<Store> getStores() {
5064     List<Store> list = new ArrayList<Store>(stores.size());
5065     list.addAll(stores.values());
5066     return list;
5067   }
5068
5069   @Override
5070   public List<String> getStoreFileList(final byte [][] columns)
5071     throws IllegalArgumentException {
5072     List<String> storeFileNames = new ArrayList<String>();
5073     synchronized(closeLock) {
5074       for(byte[] column : columns) {
5075         Store store = this.stores.get(column);
5076         if (store == null) {
5077           throw new IllegalArgumentException("No column family : " +
5078               Bytes.toString(column) + " available");
5079         }
5080         Collection<StoreFile> storeFiles = store.getStorefiles();
5081         if (storeFiles == null) continue;
5082         for (StoreFile storeFile: storeFiles) {
5083           storeFileNames.add(storeFile.getPath().toString());
5084         }
5085
5086         logRegionFiles();
5087       }
5088     }
5089     return storeFileNames;
5090   }
5091
5092   //////////////////////////////////////////////////////////////////////////////
5093   // Support code
5094   //////////////////////////////////////////////////////////////////////////////
5095
5096   /** Make sure this is a valid row for the HRegion */
5097   void checkRow(final byte [] row, String op) throws IOException {
5098     if (!rowIsInRange(getRegionInfo(), row)) {
5099       throw new WrongRegionException("Requested row out of range for " +
5100           op + " on HRegion " + this + ", startKey='" +
5101           Bytes.toStringBinary(getRegionInfo().getStartKey()) + "', endKey='" +
5102           Bytes.toStringBinary(getRegionInfo().getEndKey()) + "', row='" +
5103           Bytes.toStringBinary(row) + "'");
5104     }
5105   }
5106
5107
5108   /**
5109    * Get an exclusive (write) lock on a given row.
5110    * @param row Which row to lock.
5111    * @return A locked RowLock. The lock is exclusive and already acquired.
5112    * @throws IOException
5113    */
5114   public RowLock getRowLock(byte[] row) throws IOException {
5115     return getRowLock(row, false);
5116   }
5117
5118   /**
5119    *
5120    * Get a row lock for the specified row. All locks are reentrant.
5121    *
5122    * Before calling this function make sure that a region operation has already been
5123    * started (the calling thread has already acquired the region-close-guard lock).
5124    * @param row The row actions will be performed against
5125    * @param readLock whether a read (shared) lock is requested. True indicates that a non-exclusive
5126    *                 read lock is requested; false requests an exclusive write lock
5127    */
5128   @Override
5129   public RowLock getRowLock(byte[] row, boolean readLock) throws IOException {
5130     checkRow(row, "row lock");
5131     return getRowLockInternal(row, readLock);
5132   }
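
  // Illustrative sketch only, not part of the original class: taking and releasing an exclusive
  // row lock through the API above. The caller is assumed to have already started a region
  // operation, as the javadoc for getRowLock(byte[], boolean) requires.
  void exampleExclusiveRowLockUsage(final byte[] row) throws IOException {
    RowLock lock = getRowLock(row, false); // readLock=false => exclusive (write) lock
    try {
      // ... mutate the row while holding the lock ...
    } finally {
      lock.release();
    }
  }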
5133
5134   protected RowLock getRowLockInternal(byte[] row, boolean readLock) throws IOException {
5135     // create an object to use as a key in the row lock map
5136     HashedBytes rowKey = new HashedBytes(row);
5137
5138     RowLockContext rowLockContext = null;
5139     RowLockImpl result = null;
5140     TraceScope traceScope = null;
5141
5142     // If we're tracing, start a span to show how long this took.
5143     if (Trace.isTracing()) {
5144       traceScope = Trace.startSpan("HRegion.getRowLock");
5145       traceScope.getSpan().addTimelineAnnotation("Getting a " + (readLock?"readLock":"writeLock"));
5146     }
5147
5148     try {
5149       // Keep trying until we have a lock or error out.
5150       // TODO: do we need to add a time component here?
5151       while (result == null) {
5152
5153         // Try adding a RowLockContext to the lockedRows.
5154         // If we can add it then there are no other transactions currently running.
5155         rowLockContext = new RowLockContext(rowKey);
5156         RowLockContext existingContext = lockedRows.putIfAbsent(rowKey, rowLockContext);
5157
5158         // if there was a running transaction then there's already a context.
5159         if (existingContext != null) {
5160           rowLockContext = existingContext;
5161         }
5162
5163         // Now try and get the lock.
5164         //
5165         // This can fail if the context was cleaned up concurrently; then null is returned and we loop.
5166         if (readLock) {
5167           result = rowLockContext.newReadLock();
5168         } else {
5169           result = rowLockContext.newWriteLock();
5170         }
5171       }
5172       if (!result.getLock().tryLock(this.rowLockWaitDuration, TimeUnit.MILLISECONDS)) {
5173         if (traceScope != null) {
5174           traceScope.getSpan().addTimelineAnnotation("Failed to get row lock");
5175         }
5176         result = null;
5177         // Clean up the counts just in case this was the thing keeping the context alive.
5178         rowLockContext.cleanUp();
5179         throw new IOException("Timed out waiting for lock for row: " + rowKey);
5180       }
5181       rowLockContext.setThreadName(Thread.currentThread().getName());
5182       return result;
5183     } catch (InterruptedException ie) {
5184       LOG.warn("Thread interrupted waiting for lock on row: " + rowKey);
5185       InterruptedIOException iie = new InterruptedIOException();
5186       iie.initCause(ie);
5187       if (traceScope != null) {
5188         traceScope.getSpan().addTimelineAnnotation("Interrupted exception getting row lock");
5189       }
5190       Thread.currentThread().interrupt();
5191       throw iie;
5192     } finally {
5193       if (traceScope != null) {
5194         traceScope.close();
5195       }
5196     }
5197   }
5198
5199   @Override
5200   public void releaseRowLocks(List<RowLock> rowLocks) {
5201     if (rowLocks != null) {
5202       for (RowLock rowLock : rowLocks) {
5203         rowLock.release();
5204       }
5205       rowLocks.clear();
5206     }
5207   }
5208
5209   public ConcurrentHashMap<HashedBytes, RowLockContext> getLockedRows() {
5210     return lockedRows;
5211   }
5212
5213   @VisibleForTesting
5214   class RowLockContext {
5215     private final HashedBytes row;
5216     final ReadWriteLock readWriteLock = new ReentrantReadWriteLock(true);
5217     final AtomicBoolean usable = new AtomicBoolean(true);
5218     final AtomicInteger count = new AtomicInteger(0);
5219     final Object lock = new Object();
5220     private String threadName;
5221
5222     RowLockContext(HashedBytes row) {
5223       this.row = row;
5224     }
5225
5226     RowLockImpl newWriteLock() {
5227       Lock l = readWriteLock.writeLock();
5228       return getRowLock(l);
5229     }
5230     RowLockImpl newReadLock() {
5231       Lock l = readWriteLock.readLock();
5232       return getRowLock(l);
5233     }
5234
5235     private RowLockImpl getRowLock(Lock l) {
5236       count.incrementAndGet();
5237       synchronized (lock) {
5238         if (usable.get()) {
5239           return new RowLockImpl(this, l);
5240         } else {
5241           return null;
5242         }
5243       }
5244     }
5245
5246     void cleanUp() {
5247       long c = count.decrementAndGet();
5248       if (c <= 0) {
5249         synchronized (lock) {
5250           if (count.get() <= 0){
5251             usable.set(false);
5252             RowLockContext removed = lockedRows.remove(row);
5253             assert removed == this: "we should never remove a different context";
5254           }
5255         }
5256       }
5257     }
5258
5259     public void setThreadName(String threadName) {
5260       this.threadName = threadName;
5261     }
5262
5263     @Override
5264     public String toString() {
5265       return "RowLockContext{" +
5266           "row=" + row +
5267           ", readWriteLock=" + readWriteLock +
5268           ", count=" + count +
5269           ", threadName=" + threadName +
5270           '}';
5271     }
5272   }
5273
5274   /**
5275    * Class used to represent a lock on a row.
5276    */
5277   public static class RowLockImpl implements RowLock {
5278     private final RowLockContext context;
5279     private final Lock lock;
5280
5281     public RowLockImpl(RowLockContext context, Lock lock) {
5282       this.context = context;
5283       this.lock = lock;
5284     }
5285
5286     public Lock getLock() {
5287       return lock;
5288     }
5289
5290     @VisibleForTesting
5291     public RowLockContext getContext() {
5292       return context;
5293     }
5294
5295     @Override
5296     public void release() {
5297       lock.unlock();
5298       context.cleanUp();
5299     }
5300
5301     @Override
5302     public String toString() {
5303       return "RowLockImpl{" +
5304           "context=" + context +
5305           ", lock=" + lock +
5306           '}';
5307     }
5308   }
5309
5310   /**
5311    * Determines whether multiple column families are present.
5312    * Precondition: familyPaths is not null
5313    *
5314    * @param familyPaths List of (column family, hfilePath)
5315    */
5316   private static boolean hasMultipleColumnFamilies(Collection<Pair<byte[], String>> familyPaths) {
5317     boolean multipleFamilies = false;
5318     byte[] family = null;
5319     for (Pair<byte[], String> pair : familyPaths) {
5320       byte[] fam = pair.getFirst();
5321       if (family == null) {
5322         family = fam;
5323       } else if (!Bytes.equals(family, fam)) {
5324         multipleFamilies = true;
5325         break;
5326       }
5327     }
5328     return multipleFamilies;
5329   }
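
  // Illustrative sketch only, not part of the original class: hasMultipleColumnFamilies returns
  // true as soon as two distinct families appear among the pairs. The family names and paths
  // below are made-up example values.
  static boolean exampleHasMultipleColumnFamilies() {
    List<Pair<byte[], String>> familyPaths = new ArrayList<Pair<byte[], String>>();
    familyPaths.add(new Pair<byte[], String>(Bytes.toBytes("cf1"), "/bulkload/cf1/hfile-a"));
    familyPaths.add(new Pair<byte[], String>(Bytes.toBytes("cf2"), "/bulkload/cf2/hfile-b"));
    return hasMultipleColumnFamilies(familyPaths); // true: cf1 and cf2 differ
  }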
5330
5331   @Override
5332   public boolean bulkLoadHFiles(Collection<Pair<byte[], String>> familyPaths, boolean assignSeqId,
5333       BulkLoadListener bulkLoadListener) throws IOException {
5334     long seqId = -1;
5335     Map<byte[], List<Path>> storeFiles = new TreeMap<byte[], List<Path>>(Bytes.BYTES_COMPARATOR);
5336     Map<String, Long> storeFilesSizes = new HashMap<String, Long>();
5337     Preconditions.checkNotNull(familyPaths);
5338     // we need writeLock for multi-family bulk load
5339     startBulkRegionOperation(hasMultipleColumnFamilies(familyPaths));
5340     boolean isSuccessful = false;
5341     try {
5342       this.writeRequestsCount.increment();
5343
5344       // There possibly was a split that happened between when the split keys
5345       // were gathered and when the HRegion's write lock was taken.  We need to validate
5346       // that each HFile fits within this region before attempting to bulk load any of them.
5347       List<IOException> ioes = new ArrayList<IOException>();
5348       List<Pair<byte[], String>> failures = new ArrayList<Pair<byte[], String>>();
5349       for (Pair<byte[], String> p : familyPaths) {
5350         byte[] familyName = p.getFirst();
5351         String path = p.getSecond();
5352
5353         Store store = getStore(familyName);
5354         if (store == null) {
5355           IOException ioe = new org.apache.hadoop.hbase.DoNotRetryIOException(
5356               "No such column family " + Bytes.toStringBinary(familyName));
5357           ioes.add(ioe);
5358         } else {
5359           try {
5360             store.assertBulkLoadHFileOk(new Path(path));
5361           } catch (WrongRegionException wre) {
5362             // recoverable (file doesn't fit in region)
5363             failures.add(p);
5364           } catch (IOException ioe) {
5365             // unrecoverable (hdfs problem)
5366             ioes.add(ioe);
5367           }
5368         }
5369       }
5370
5371       // validation failed because of some sort of IO problem.
5372       if (ioes.size() != 0) {
5373         IOException e = MultipleIOException.createIOException(ioes);
5374         LOG.error("There were one or more IO errors when checking if the bulk load is ok.", e);
5375         throw e;
5376       }
5377
5378       // validation failed, bail out before doing anything permanent.
5379       if (failures.size() != 0) {
5380         StringBuilder list = new StringBuilder();
5381         for (Pair<byte[], String> p : failures) {
5382           list.append("\n").append(Bytes.toString(p.getFirst())).append(" : ")
5383               .append(p.getSecond());
5384         }
5385         // problem when validating
5386         LOG.warn("There was a recoverable bulk load failure likely due to a" +
5387             " split.  These (family, HFile) pairs were not loaded: " + list);
5388         return isSuccessful;
5389       }
5390
5391       // We need to assign a sequential ID that's in between two memstores in order to preserve
5392       // the guarantee that all the edits lower than the highest sequential ID from all the
5393       // HFiles are flushed on disk. See HBASE-10958.  The sequence id returned when we flush is
5394       // guaranteed to be one beyond the file made when we flushed (or if nothing to flush, it is
5395       // a sequence id that we can be sure is beyond the last hfile written).
5396       if (assignSeqId) {
5397         FlushResult fs = flushcache(true, false);
5398         if (fs.isFlushSucceeded()) {
5399           seqId = ((FlushResultImpl)fs).flushSequenceId;
5400         } else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) {
5401           seqId = ((FlushResultImpl)fs).flushSequenceId;
5402         } else {
5403           throw new IOException("Could not bulk load with an assigned sequential ID because the "+
5404             "flush didn't run. Reason for not flushing: " + ((FlushResultImpl)fs).failureReason);
5405         }
5406       }
5407
5408       for (Pair<byte[], String> p : familyPaths) {
5409         byte[] familyName = p.getFirst();
5410         String path = p.getSecond();
5411         Store store = getStore(familyName);
5412         try {
5413           String finalPath = path;
5414           if (bulkLoadListener != null) {
5415             finalPath = bulkLoadListener.prepareBulkLoad(familyName, path);
5416           }
5417           Path committedStoreFile = store.bulkLoadHFile(finalPath, seqId);
5418
5419           // Note the size of the store file
5420           try {
5421             FileSystem fs = committedStoreFile.getFileSystem(baseConf);
5422             storeFilesSizes.put(committedStoreFile.getName(), fs.getFileStatus(committedStoreFile)
5423                 .getLen());
5424           } catch (IOException e) {
5425             LOG.warn("Failed to find the size of hfile " + committedStoreFile);
5426             storeFilesSizes.put(committedStoreFile.getName(), 0L);
5427           }
5428
5429           if (storeFiles.containsKey(familyName)) {
5430             storeFiles.get(familyName).add(committedStoreFile);
5431           } else {
5432             List<Path> storeFileNames = new ArrayList<Path>();
5433             storeFileNames.add(committedStoreFile);
5434             storeFiles.put(familyName, storeFileNames);
5435           }
5436           if (bulkLoadListener != null) {
5437             bulkLoadListener.doneBulkLoad(familyName, path);
5438           }
5439         } catch (IOException ioe) {
5440           // A failure here can cause an atomicity violation that we currently
5441           // cannot recover from since it is likely a failed HDFS operation.
5442
5443           // TODO Need a better story for reverting partial failures due to HDFS.
5444           LOG.error("There was a partial failure due to IO when attempting to" +
5445               " load " + Bytes.toString(p.getFirst()) + " : " + p.getSecond(), ioe);
5446           if (bulkLoadListener != null) {
5447             try {
5448               bulkLoadListener.failedBulkLoad(familyName, path);
5449             } catch (Exception ex) {
5450               LOG.error("Error while calling failedBulkLoad for family " +
5451                   Bytes.toString(familyName) + " with path " + path, ex);
5452             }
5453           }
5454           throw ioe;
5455         }
5456       }
5457
5458       isSuccessful = true;
5459     } finally {
5460       if (wal != null && !storeFiles.isEmpty()) {
5461         // Write a bulk load event for hfiles that are loaded
5462         try {
5463           WALProtos.BulkLoadDescriptor loadDescriptor =
5464               ProtobufUtil.toBulkLoadDescriptor(this.getRegionInfo().getTable(),
5465                 ByteStringer.wrap(this.getRegionInfo().getEncodedNameAsBytes()), storeFiles,
5466                 storeFilesSizes, seqId);
5467           WALUtil.writeBulkLoadMarkerAndSync(this.wal, this.getReplicationScope(), getRegionInfo(),
5468               loadDescriptor, mvcc);
5469         } catch (IOException ioe) {
5470           if (this.rsServices != null) {
5471             // Have to abort region server because some hfiles have been loaded but we can't write
5472             // the event into WAL
5473             isSuccessful = false;
5474             this.rsServices.abort("Failed to write bulk load event into WAL.", ioe);
5475           }
5476         }
5477       }
5478
5479       closeBulkRegionOperation();
5480     }
5481     return isSuccessful;
5482   }
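
  // Illustrative sketch only, not part of the original class: a minimal direct caller of
  // bulkLoadHFiles for a single HFile. The family name is a made-up example and no
  // BulkLoadListener is supplied; real callers normally go through the bulk load tooling.
  boolean exampleBulkLoadSingleHFile(final String hfilePath) throws IOException {
    List<Pair<byte[], String>> familyPaths = new ArrayList<Pair<byte[], String>>();
    familyPaths.add(new Pair<byte[], String>(Bytes.toBytes("cf1"), hfilePath));
    // assignSeqId=true flushes first so the loaded file gets a sequence id beyond existing edits.
    return bulkLoadHFiles(familyPaths, true, null);
  }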
5483
5484   @Override
5485   public boolean equals(Object o) {
5486     return o instanceof HRegion && Bytes.equals(getRegionInfo().getRegionName(),
5487                                                 ((HRegion) o).getRegionInfo().getRegionName());
5488   }
5489
5490   @Override
5491   public int hashCode() {
5492     return Bytes.hashCode(getRegionInfo().getRegionName());
5493   }
5494
5495   @Override
5496   public String toString() {
5497     return getRegionInfo().getRegionNameAsString();
5498   }
5499
5500   /**
5501    * RegionScannerImpl is used to combine scanners from multiple Stores (aka column families).
5502    */
5503   class RegionScannerImpl implements RegionScanner, org.apache.hadoop.hbase.ipc.RpcCallback {
5504     // Package local for testability
5505     KeyValueHeap storeHeap = null;
5506     /** Heap of key-values that are not essential for the provided filters and are thus read
5507      * on demand, if on-demand column family loading is enabled.*/
5508     KeyValueHeap joinedHeap = null;
5509     /**
5510      * If the joined heap data gathering is interrupted due to scan limits, this will
5511      * contain the row for which we are populating the values.*/
5512     protected Cell joinedContinuationRow = null;
5513     private boolean filterClosed = false;
5514
5515     protected final int isScan;
5516     protected final byte[] stopRow;
5517     protected final HRegion region;
5518     protected final CellComparator comparator;
5519
5520     private final long readPt;
5521     private final long maxResultSize;
5522     private final ScannerContext defaultScannerContext;
5523     private final FilterWrapper filter;
5524
5525     @Override
5526     public HRegionInfo getRegionInfo() {
5527       return region.getRegionInfo();
5528     }
5529
5530     RegionScannerImpl(Scan scan, List<KeyValueScanner> additionalScanners, HRegion region)
5531         throws IOException {
5532       this.region = region;
5533       this.maxResultSize = scan.getMaxResultSize();
5534       if (scan.hasFilter()) {
5535         this.filter = new FilterWrapper(scan.getFilter());
5536       } else {
5537         this.filter = null;
5538       }
5539       this.comparator = region.getCellCompartor();
5540       /**
5541        * By default, calls to next/nextRaw must enforce the batch limit. Thus, construct a default
5542        * scanner context that can be used to enforce the batch limit in the event that a
5543        * ScannerContext is not specified during an invocation of next/nextRaw
5544        */
5545       defaultScannerContext = ScannerContext.newBuilder()
5546           .setBatchLimit(scan.getBatch()).build();
5547
5548       if (Bytes.equals(scan.getStopRow(), HConstants.EMPTY_END_ROW) && !scan.isGetScan()) {
5549         this.stopRow = null;
5550       } else {
5551         this.stopRow = scan.getStopRow();
5552       }
5553       // If we are doing a get, we want to be [startRow,endRow]. Normally
5554       // it is [startRow,endRow) and if startRow=endRow we get nothing.
5555       this.isScan = scan.isGetScan() ? 1 : 0;
5556
5557       // synchronize on scannerReadPoints so that nobody calculates
5558       // getSmallestReadPoint, before scannerReadPoints is updated.
5559       IsolationLevel isolationLevel = scan.getIsolationLevel();
5560       synchronized(scannerReadPoints) {
5561         this.readPt = getReadPoint(isolationLevel);
5562         scannerReadPoints.put(this, this.readPt);
5563       }
5564
5565       // Here we separate all scanners into two lists - scanner that provide data required
5566       // by the filter to operate (scanners list) and all others (joinedScanners list).
5567       List<KeyValueScanner> scanners = new ArrayList<KeyValueScanner>(scan.getFamilyMap().size());
5568       List<KeyValueScanner> joinedScanners
5569         = new ArrayList<KeyValueScanner>(scan.getFamilyMap().size());
5570       if (additionalScanners != null) {
5571         scanners.addAll(additionalScanners);
5572       }
5573
5574       for (Map.Entry<byte[], NavigableSet<byte[]>> entry : scan.getFamilyMap().entrySet()) {
5575         Store store = stores.get(entry.getKey());
5576         KeyValueScanner scanner;
5577         try {
5578           scanner = store.getScanner(scan, entry.getValue(), this.readPt);
5579         } catch (FileNotFoundException e) {
5580           throw handleFileNotFound(e);
5581         }
5582         if (this.filter == null || !scan.doLoadColumnFamiliesOnDemand()
5583           || this.filter.isFamilyEssential(entry.getKey())) {
5584           scanners.add(scanner);
5585         } else {
5586           joinedScanners.add(scanner);
5587         }
5588       }
5589       initializeKVHeap(scanners, joinedScanners, region);
5590     }
5591
5592     protected void initializeKVHeap(List<KeyValueScanner> scanners,
5593         List<KeyValueScanner> joinedScanners, HRegion region)
5594         throws IOException {
5595       this.storeHeap = new KeyValueHeap(scanners, comparator);
5596       if (!joinedScanners.isEmpty()) {
5597         this.joinedHeap = new KeyValueHeap(joinedScanners, comparator);
5598       }
5599     }
5600
5601     @Override
5602     public long getMaxResultSize() {
5603       return maxResultSize;
5604     }
5605
5606     @Override
5607     public long getMvccReadPoint() {
5608       return this.readPt;
5609     }
5610
5611     @Override
5612     public int getBatch() {
5613       return this.defaultScannerContext.getBatchLimit();
5614     }
5615
5616     /**
5617      * Reset the filter, if any.
5618      *
5619      * @throws IOException in case a filter raises an I/O exception.
5620      */
5621     protected void resetFilters() throws IOException {
5622       if (filter != null) {
5623         filter.reset();
5624       }
5625     }
5626
5627     @Override
5628     public boolean next(List<Cell> outResults)
5629         throws IOException {
5630       // apply the batching limit by default
5631       return next(outResults, defaultScannerContext);
5632     }
5633
5634     @Override
5635     public synchronized boolean next(List<Cell> outResults, ScannerContext scannerContext)
5636     throws IOException {
5637       if (this.filterClosed) {
5638         throw new UnknownScannerException("Scanner was closed (timed out?) " +
5639             "after we renewed it. Could be caused by a very slow scanner " +
5640             "or a lengthy garbage collection");
5641       }
5642       startRegionOperation(Operation.SCAN);
5643       readRequestsCount.increment();
5644       try {
5645         return nextRaw(outResults, scannerContext);
5646       } finally {
5647         closeRegionOperation(Operation.SCAN);
5648       }
5649     }
5650
5651     @Override
5652     public boolean nextRaw(List<Cell> outResults) throws IOException {
5653       // Use the RegionScanner's context by default
5654       return nextRaw(outResults, defaultScannerContext);
5655     }
5656
5657     @Override
5658     public boolean nextRaw(List<Cell> outResults, ScannerContext scannerContext)
5659         throws IOException {
5660       if (storeHeap == null) {
5661         // scanner is closed
5662         throw new UnknownScannerException("Scanner was closed");
5663       }
5664       boolean moreValues = false;
5665       if (outResults.isEmpty()) {
5666         // Usually outResults is empty. This is true when next is called
5667         // to handle scan or get operation.
5668         moreValues = nextInternal(outResults, scannerContext);
5669       } else {
5670         List<Cell> tmpList = new ArrayList<Cell>();
5671         moreValues = nextInternal(tmpList, scannerContext);
5672         outResults.addAll(tmpList);
5673       }
5674
5675       // If the size limit was reached it means a partial Result is being returned. Returning
5676       // a partial Result means that we should not reset the filters; filters should only be
5677       // reset in between rows.
5680       if (!scannerContext.midRowResultFormed())
5681         resetFilters();
5682
5683       if (isFilterDoneInternal()) {
5684         moreValues = false;
5685       }
5686       return moreValues;
5687     }
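
    // Illustrative sketch only, not part of the original class: calling nextRaw with an explicit
    // ScannerContext that caps the batch size instead of relying on defaultScannerContext. The
    // caller is assumed to hold the region operation lock, as next() above does before delegating.
    boolean exampleNextRawWithBatchLimit(final List<Cell> out, final int batchLimit)
        throws IOException {
      ScannerContext context = ScannerContext.newBuilder().setBatchLimit(batchLimit).build();
      return nextRaw(out, context);
    }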
5688
5689     /**
5690      * @return true if more cells exist after this batch, false if scanner is done
5691      */
5692     private boolean populateFromJoinedHeap(List<Cell> results, ScannerContext scannerContext)
5693             throws IOException {
5694       assert joinedContinuationRow != null;
5695       boolean moreValues = populateResult(results, this.joinedHeap, scannerContext,
5696           joinedContinuationRow);
5697
5698       if (!scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
5699         // We are done with this row, reset the continuation.
5700         joinedContinuationRow = null;
5701       }
5702       // As the data is obtained from two independent heaps, we need to
5703       // ensure that result list is sorted, because Result relies on that.
5704       sort(results, comparator);
5705       return moreValues;
5706     }
5707
5708     /**
5709      * Fetches records for currentRow into the results list, until the next row, batchLimit (if
5710      * not -1), or remainingResultSize (if not -1) is reached.
5711      * @param heap KeyValueHeap to fetch data from. It must be positioned on the correct row before the call.
5712      * @param scannerContext context tracking the batch, size, and time limits for this scan
5713      * @param currentRowCell a cell from the row currently being populated
5714      * @return state of last call to {@link KeyValueHeap#next()}
5715      */
5716     private boolean populateResult(List<Cell> results, KeyValueHeap heap,
5717         ScannerContext scannerContext, Cell currentRowCell) throws IOException {
5718       Cell nextKv;
5719       boolean moreCellsInRow = false;
5720       boolean tmpKeepProgress = scannerContext.getKeepProgress();
5721       // Scanning between column families and thus the scope is between cells
5722       LimitScope limitScope = LimitScope.BETWEEN_CELLS;
5723       try {
5724         do {
5725           // We want to maintain any progress that is made towards the limits while scanning across
5726           // different column families. To do this, we toggle the keep progress flag on during calls
5727           // to the StoreScanner to ensure that any progress made thus far is not wiped away.
5728           scannerContext.setKeepProgress(true);
5729           heap.next(results, scannerContext);
5730           scannerContext.setKeepProgress(tmpKeepProgress);
5731
5732           nextKv = heap.peek();
5733           moreCellsInRow = moreCellsInRow(nextKv, currentRowCell);
5734           if (!moreCellsInRow) incrementCountOfRowsScannedMetric(scannerContext);
5735           if (moreCellsInRow && scannerContext.checkBatchLimit(limitScope)) {
5736             return scannerContext.setScannerState(NextState.BATCH_LIMIT_REACHED).hasMoreValues();
5737           } else if (scannerContext.checkSizeLimit(limitScope)) {
5738             ScannerContext.NextState state =
5739               moreCellsInRow? NextState.SIZE_LIMIT_REACHED_MID_ROW: NextState.SIZE_LIMIT_REACHED;
5740             return scannerContext.setScannerState(state).hasMoreValues();
5741           } else if (scannerContext.checkTimeLimit(limitScope)) {
5742             ScannerContext.NextState state =
5743               moreCellsInRow? NextState.TIME_LIMIT_REACHED_MID_ROW: NextState.TIME_LIMIT_REACHED;
5744             return scannerContext.setScannerState(state).hasMoreValues();
5745           }
5746         } while (moreCellsInRow);
5747       } catch (FileNotFoundException e) {
5748         throw handleFileNotFound(e);
5749       }
5750       return nextKv != null;
5751     }
5752
5753     /**
5754      * Based on the nextKv in the heap, and the current row, decide whether or not there are more
5755      * cells to be read in the heap. If the row of the nextKv in the heap matches the current row
5756      * then there are more cells to be read in the row.
5757      * @param nextKv
5758      * @param currentRowCell
5759      * @return true When there are more cells in the row to be read
5760      */
5761     private boolean moreCellsInRow(final Cell nextKv, Cell currentRowCell) {
5762       return nextKv != null && CellUtil.matchingRow(nextKv, currentRowCell);
5763     }
5764
5765     /*
5766      * @return True if a filter rules that the scanner is done.
5767      */
5768     @Override
5769     public synchronized boolean isFilterDone() throws IOException {
5770       return isFilterDoneInternal();
5771     }
5772
5773     private boolean isFilterDoneInternal() throws IOException {
5774       return this.filter != null && this.filter.filterAllRemaining();
5775     }
5776
5777     private boolean nextInternal(List<Cell> results, ScannerContext scannerContext)
5778         throws IOException {
5779       if (!results.isEmpty()) {
5780         throw new IllegalArgumentException("First parameter should be an empty list");
5781       }
5782       if (scannerContext == null) {
5783         throw new IllegalArgumentException("Scanner context cannot be null");
5784       }
5785       RpcCallContext rpcCall = RpcServer.getCurrentCall();
5786
5787       // Save the initial progress from the Scanner context in these local variables. The progress
5788       // may need to be reset a few times if rows are being filtered out so we save the initial
5789       // progress.
5790       int initialBatchProgress = scannerContext.getBatchProgress();
5791       long initialSizeProgress = scannerContext.getSizeProgress();
5792       long initialTimeProgress = scannerContext.getTimeProgress();
5793
5794       // The loop here is used only when at some point during the next we determine
5795       // that due to effects of filters or otherwise, we have an empty row in the result.
5796       // Then we loop and try again. Otherwise, we must get out on the first iteration via return,
5797       // "true" if there's more data to read, "false" if there isn't (storeHeap is at a stop row,
5798       // and joinedHeap has no more data to read for the last row (if set, joinedContinuationRow).
5799       while (true) {
5800         // Starting to scan a new row. Reset the scanner progress according to whether or not
5801         // progress should be kept.
5802         if (scannerContext.getKeepProgress()) {
5803           // Progress should be kept. Reset to initial values seen at start of method invocation.
5804           scannerContext.setProgress(initialBatchProgress, initialSizeProgress,
5805             initialTimeProgress);
5806         } else {
5807           scannerContext.clearProgress();
5808         }
5809
5810         if (rpcCall != null) {
5811           // If a user specifies a too-restrictive or too-slow scanner, the
5812           // client might time out and disconnect while the server side
5813           // is still processing the request. We should abort aggressively
5814           // in that case.
5815           long afterTime = rpcCall.disconnectSince();
5816           if (afterTime >= 0) {
5817             throw new CallerDisconnectedException(
5818                 "Aborting on region " + getRegionInfo().getRegionNameAsString() + ", call " +
5819                     this + " after " + afterTime + " ms, since " +
5820                     "caller disconnected");
5821           }
5822         }
5823
5824         // Let's see what we have in the storeHeap.
5825         Cell current = this.storeHeap.peek();
5826
5827         boolean stopRow = isStopRow(current);
5828         // When hasFilterRow is true it means that all the cells for a particular row must be
5829         // read before a filtering decision can be made. This means that filters where hasFilterRow
5830         // is true run the risk of encountering out of memory errors in the case that they are
5831         // applied to a table that has very large rows.
5832         boolean hasFilterRow = this.filter != null && this.filter.hasFilterRow();
5833
5834         // If filter#hasFilterRow is true, partial results are not allowed since allowing them
5835         // would prevent the filters from being evaluated. Thus, if it is true, change the
5836         // scope of any limits that could potentially create partial results to
5837         // LimitScope.BETWEEN_ROWS so that those limits are not reached mid-row
5838         if (hasFilterRow) {
5839           if (LOG.isTraceEnabled()) {
5840             LOG.trace("filter#hasFilterRow is true which prevents partial results from being "
5841                 + "formed. Changing scope of limits that may create partials");
5842           }
5843           scannerContext.setSizeLimitScope(LimitScope.BETWEEN_ROWS);
5844           scannerContext.setTimeLimitScope(LimitScope.BETWEEN_ROWS);
5845         }
5846
5847         // Check if we were getting data from the joinedHeap and hit the limit.
5848         // If not, then it's main path - getting results from storeHeap.
5849         if (joinedContinuationRow == null) {
5850           // First, check if we are at a stop row. If so, there are no more results.
5851           if (stopRow) {
5852             if (hasFilterRow) {
5853               filter.filterRowCells(results);
5854             }
5855             return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
5856           }
5857
5858           // Check if rowkey filter wants to exclude this row. If so, loop to next.
5859           // Technically, if we hit limits before on this row, we don't need this call.
5860           if (filterRowKey(current)) {
5861             incrementCountOfRowsFilteredMetric(scannerContext);
5862             // Typically the count of rows scanned is incremented inside #populateResult. However,
5863             // here we are filtering a row based purely on its row key, preventing us from calling
5864             // #populateResult. Thus, perform the necessary increment here to rows scanned metric
5865             incrementCountOfRowsScannedMetric(scannerContext);
5866             boolean moreRows = nextRow(scannerContext, current);
5867             if (!moreRows) {
5868               return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
5869             }
5870             results.clear();
5871             continue;
5872           }
5873
5874           // Ok, we are good, let's try to get some results from the main heap.
5875           populateResult(results, this.storeHeap, scannerContext, current);
5876
5877           if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
5878             if (hasFilterRow) {
5879               throw new IncompatibleFilterException(
5880                   "Filter whose hasFilterRow() returns true is incompatible with scans that must "
5881                       + "stop mid-row because of a limit. ScannerContext: " + scannerContext);
5882             }
5883             return true;
5884           }
5885
5886           Cell nextKv = this.storeHeap.peek();
5887           stopRow = nextKv == null || isStopRow(nextKv);
5888           // save that the row was empty before filters were applied to it.
5889           final boolean isEmptyRow = results.isEmpty();
5890
5891           // We have the part of the row necessary for filtering (all of it, usually).
5892           // First filter with the filterRow(List).
5893           FilterWrapper.FilterRowRetCode ret = FilterWrapper.FilterRowRetCode.NOT_CALLED;
5894           if (hasFilterRow) {
5895             ret = filter.filterRowCellsWithRet(results);
5896
5897             // We don't know how the results have changed after being filtered. Must set progress
5898             // according to contents of results now. However, a change in the results should not
5899             // affect the time progress. Thus preserve whatever time progress has been made
5900             long timeProgress = scannerContext.getTimeProgress();
5901             if (scannerContext.getKeepProgress()) {
5902               scannerContext.setProgress(initialBatchProgress, initialSizeProgress,
5903                 initialTimeProgress);
5904             } else {
5905               scannerContext.clearProgress();
5906             }
5907             scannerContext.setTimeProgress(timeProgress);
5908             scannerContext.incrementBatchProgress(results.size());
5909             for (Cell cell : results) {
5910               scannerContext.incrementSizeProgress(CellUtil.estimatedHeapSizeOf(cell));
5911             }
5912           }
5913
5914           if (isEmptyRow || ret == FilterWrapper.FilterRowRetCode.EXCLUDE || filterRow()) {
5915             incrementCountOfRowsFilteredMetric(scannerContext);
5916             results.clear();
5917             boolean moreRows = nextRow(scannerContext, current);
5918             if (!moreRows) {
5919               return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
5920             }
5921
5922             // This row was totally filtered out, if this is NOT the last row,
5923             // we should continue on. Otherwise, nothing else to do.
5924             if (!stopRow) continue;
5925             return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
5926           }
5927
5928           // Ok, we are done with storeHeap for this row.
5929           // Now we may need to fetch additional, non-essential data into row.
5930           // These values are not needed for filter to work, so we postpone their
5931           // fetch to (possibly) reduce amount of data loads from disk.
5932           if (this.joinedHeap != null) {
5933             boolean mayHaveData = joinedHeapMayHaveData(current);
5934             if (mayHaveData) {
5935               joinedContinuationRow = current;
5936               populateFromJoinedHeap(results, scannerContext);
5937
5938               if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
5939                 return true;
5940               }
5941             }
5942           }
5943         } else {
5944           // Populating from the joined heap was stopped by limits, populate some more.
5945           populateFromJoinedHeap(results, scannerContext);
5946           if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
5947             return true;
5948           }
5949         }
5950         // We may have just called populateFromJoinedMap and hit the limits. If that is
5951         // the case, we need to call it again on the next next() invocation.
5952         if (joinedContinuationRow != null) {
5953           return scannerContext.setScannerState(NextState.MORE_VALUES).hasMoreValues();
5954         }
5955
5956         // Finally, we are done with both joinedHeap and storeHeap.
5957         // Double check to prevent empty rows from appearing in result. It could be
5958         // the case when SingleColumnValueExcludeFilter is used.
5959         if (results.isEmpty()) {
5960           incrementCountOfRowsFilteredMetric(scannerContext);
5961           boolean moreRows = nextRow(scannerContext, current);
5962           if (!moreRows) {
5963             return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
5964           }
5965           if (!stopRow) continue;
5966         }
5967
5968         if (stopRow) {
5969           return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
5970         } else {
5971           return scannerContext.setScannerState(NextState.MORE_VALUES).hasMoreValues();
5972         }
5973       }
5974     }
5975
5976     protected void incrementCountOfRowsFilteredMetric(ScannerContext scannerContext) {
5977       filteredReadRequestsCount.increment();
5978
5979       if (scannerContext == null || !scannerContext.isTrackingMetrics()) return;
5980
5981       scannerContext.getMetrics().countOfRowsFiltered.incrementAndGet();
5982     }
5983 
5984     protected void incrementCountOfRowsScannedMetric(ScannerContext scannerContext) {
5985       if (scannerContext == null || !scannerContext.isTrackingMetrics()) return;
5986
5987       scannerContext.getMetrics().countOfRowsScanned.incrementAndGet();
5988     }
5989
5990     /**
5991      * @param currentRowCell
5992      * @return true when the joined heap may have data for the current row
5993      * @throws IOException
5994      */
5995     private boolean joinedHeapMayHaveData(Cell currentRowCell)
5996         throws IOException {
5997       Cell nextJoinedKv = joinedHeap.peek();
5998       boolean matchCurrentRow =
5999           nextJoinedKv != null && CellUtil.matchingRow(nextJoinedKv, currentRowCell);
6000       boolean matchAfterSeek = false;
6001
6002       // If the next value in the joined heap does not match the current row, try to seek to the
6003       // correct row
6004       if (!matchCurrentRow) {
6005         Cell firstOnCurrentRow = CellUtil.createFirstOnRow(currentRowCell);
6006         boolean seekSuccessful = this.joinedHeap.requestSeek(firstOnCurrentRow, true, true);
6007         matchAfterSeek =
6008             seekSuccessful && joinedHeap.peek() != null
6009                 && CellUtil.matchingRow(joinedHeap.peek(), currentRowCell);
6010       }
6011
6012       return matchCurrentRow || matchAfterSeek;
6013     }
6014
6015     /**
6016      * This function is to maintain backward compatibility for 0.94 filters. HBASE-6429 combines
6017      * both filterRow & filterRow({@code List<KeyValue> kvs}) functions. With 0.94 code or older,
6018      * a filter may not implement hasFilterRow as HBASE-6429 expects, because in 0.94 hasFilterRow()
6019      * only returns true when filterRow({@code List<KeyValue> kvs}) is overridden, not filterRow().
6020      * Therefore, filterRow() would otherwise be skipped; this method calls it in that case.
6021      */
6022     private boolean filterRow() throws IOException {
6023       // when hasFilterRow returns true, filter.filterRow() will be called automatically inside
6024       // filterRowCells(List<Cell> kvs) so we skip that scenario here.
6025       return filter != null && (!filter.hasFilterRow())
6026           && filter.filterRow();
6027     }
6028
6029     private boolean filterRowKey(Cell current) throws IOException {
6030       return filter != null && filter.filterRowKey(current);
6031     }
6032
6033     protected boolean nextRow(ScannerContext scannerContext, Cell curRowCell) throws IOException {
6034       assert this.joinedContinuationRow == null: "Trying to go to next row during joinedHeap read.";
6035       Cell next;
6036       while ((next = this.storeHeap.peek()) != null &&
6037              CellUtil.matchingRow(next, curRowCell)) {
6038         this.storeHeap.next(MOCKED_LIST);
6039       }
6040       resetFilters();
6041
6042       // Calling the hook in CP which allows it to do a fast forward
6043       return this.region.getCoprocessorHost() == null
6044           || this.region.getCoprocessorHost()
6045               .postScannerFilterRow(this, curRowCell);
6046     }
6047
6048     protected boolean isStopRow(Cell currentRowCell) {
6049       return currentRowCell == null
6050           || (stopRow != null && comparator.compareRows(currentRowCell, stopRow, 0, stopRow
6051           .length) >= isScan);
6052     }
6053
6054     @Override
6055     public synchronized void close() {
6056       if (storeHeap != null) {
6057         storeHeap.close();
6058         storeHeap = null;
6059       }
6060       if (joinedHeap != null) {
6061         joinedHeap.close();
6062         joinedHeap = null;
6063       }
6064       // no need to synchronize here.
6065       scannerReadPoints.remove(this);
6066       this.filterClosed = true;
6067     }
6068
6069     KeyValueHeap getStoreHeapForTesting() {
6070       return storeHeap;
6071     }
6072
6073     @Override
6074     public synchronized boolean reseek(byte[] row) throws IOException {
6075       if (row == null) {
6076         throw new IllegalArgumentException("Row cannot be null.");
6077       }
6078       boolean result = false;
6079       startRegionOperation();
6080       KeyValue kv = KeyValueUtil.createFirstOnRow(row);
6081       try {
6082         // use request seek to make use of the lazy seek option. See HBASE-5520
6083         result = this.storeHeap.requestSeek(kv, true, true);
6084         if (this.joinedHeap != null) {
6085           result = this.joinedHeap.requestSeek(kv, true, true) || result;
6086         }
6087       } catch (FileNotFoundException e) {
6088         throw handleFileNotFound(e);
6089       } finally {
6090         closeRegionOperation();
6091       }
6092       return result;
6093     }
6094
6095     private IOException handleFileNotFound(FileNotFoundException fnfe) throws IOException {
6096       // tries to refresh the store files, otherwise shutdown the RS.
6097       // TODO: add support for abort() of a single region and trigger reassignment.
6098       try {
6099         region.refreshStoreFiles(true);
6100         return new IOException("unable to read store file");
6101       } catch (IOException e) {
6102         String msg = "a store file got lost: " + fnfe.getMessage();
6103         LOG.error("unable to refresh store files", e);
6104         abortRegionServer(msg);
6105         return new NotServingRegionException(
6106           getRegionInfo().getRegionNameAsString() + " is closing");
6107       }
6108     }
6109
6110     private void abortRegionServer(String msg) throws IOException {
6111       if (rsServices instanceof HRegionServer) {
6112         ((HRegionServer)rsServices).abort(msg);
6113       }
6114       throw new UnsupportedOperationException("not able to abort RS after: " + msg);
6115     }
6116
6117     @Override
6118     public void shipped() throws IOException {
6119       if (storeHeap != null) {
6120         storeHeap.shipped();
6121       }
6122       if (joinedHeap != null) {
6123         joinedHeap.shipped();
6124       }
6125     }
6126
6127     @Override
6128     public void run() throws IOException {
6129       // This is the RPC callback method executed. We do the close of the scanner in this
6130       // callback.
6131       this.close();
6132     }
6133   }
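
  // Illustrative sketch only, not part of the original class: draining a RegionScanner row by row.
  // getScanner(Scan) is assumed to be the Region API that returns a RegionScannerImpl as defined
  // above; the empty Scan is just an example that reads every row.
  int exampleCountCellsWithRegionScanner() throws IOException {
    RegionScanner scanner = getScanner(new Scan());
    int cellCount = 0;
    try {
      List<Cell> rowCells = new ArrayList<Cell>();
      boolean moreRows;
      do {
        moreRows = scanner.next(rowCells); // one row (up to the default batch) per call
        cellCount += rowCells.size();
        rowCells.clear();
      } while (moreRows);
    } finally {
      scanner.close();
    }
    return cellCount;
  }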
6134
6135   // Utility methods
6136   /**
6137    * A utility method to create new instances of HRegion based on the
6138    * {@link HConstants#REGION_IMPL} configuration property.
6139    * @param tableDir qualified path of directory where region should be located,
6140    * usually the table directory.
6141    * @param wal The WAL is the outbound log for any updates to the HRegion.
6142    * The wal file is a logfile from the previous execution that's
6143    * custom-computed for this HRegion. The HRegionServer computes and sorts the
6144    * appropriate wal info for this HRegion. If there is a previous file
6145    * (implying that the HRegion has been written-to before), then read it from
6146    * the supplied path.
6147    * @param fs is the filesystem.
6148    * @param conf is global configuration settings.
6149    * @param regionInfo HRegionInfo that describes the region
6151    * @param htd the table descriptor
6152    * @return the new instance
6153    */
6154   static HRegion newHRegion(Path tableDir, WAL wal, FileSystem fs,
6155       Configuration conf, HRegionInfo regionInfo, final HTableDescriptor htd,
6156       RegionServerServices rsServices) {
6157     try {
6158       @SuppressWarnings("unchecked")
6159       Class<? extends HRegion> regionClass =
6160           (Class<? extends HRegion>) conf.getClass(HConstants.REGION_IMPL, HRegion.class);
6161
6162       Constructor<? extends HRegion> c =
6163           regionClass.getConstructor(Path.class, WAL.class, FileSystem.class,
6164               Configuration.class, HRegionInfo.class, HTableDescriptor.class,
6165               RegionServerServices.class);
6166
6167       return c.newInstance(tableDir, wal, fs, conf, regionInfo, htd, rsServices);
6168     } catch (Throwable e) {
6169       // todo: what should I throw here?
6170       throw new IllegalStateException("Could not instantiate a region instance.", e);
6171     }
6172   }
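
  // Illustrative sketch only, not part of the original class: plugging a custom region
  // implementation in through the HConstants.REGION_IMPL property that newHRegion reads via
  // conf.getClass above. The subclass mentioned in the comment is hypothetical and would need the
  // seven-argument constructor that newHRegion looks up.
  static void exampleConfigureRegionImpl(final Configuration conf) {
    conf.setClass(HConstants.REGION_IMPL, HRegion.class, HRegion.class);
    // For a hypothetical subclass:
    // conf.setClass(HConstants.REGION_IMPL, MyCustomRegion.class, HRegion.class);
  }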
6173
6174   /**
6175    * Convenience method creating new HRegions. Used by createTable.
6176    *
6177    * @param info Info for region to create.
6178    * @param rootDir Root directory for HBase instance
6179    * @param wal shared WAL
6180    * @param initialize - true to initialize the region
6181    * @return new HRegion
6182    * @throws IOException
6183    */
6184   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
6185         final Configuration conf, final HTableDescriptor hTableDescriptor,
6186         final WAL wal, final boolean initialize)
6187   throws IOException {
6188     LOG.info("creating HRegion " + info.getTable().getNameAsString()
6189         + " HTD == " + hTableDescriptor + " RootDir = " + rootDir +
6190         " Table name == " + info.getTable().getNameAsString());
6191     FileSystem fs = FileSystem.get(conf);
6192     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
6193     HRegionFileSystem.createRegionOnFileSystem(conf, fs, tableDir, info);
6194     HRegion region = HRegion.newHRegion(tableDir, wal, fs, conf, info, hTableDescriptor, null);
6195     if (initialize) region.initialize(null);
6196     return region;
6197   }
6198
6199   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
6200                                       final Configuration conf,
6201                                       final HTableDescriptor hTableDescriptor,
6202                                       final WAL wal)
6203     throws IOException {
6204     return createHRegion(info, rootDir, conf, hTableDescriptor, wal, true);
6205   }
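
  // Illustrative sketch only, not part of the original class: creating and initializing a brand
  // new region on the filesystem, e.g. from a test, using the convenience overload above. The
  // HRegionInfo, HTableDescriptor, WAL and root directory are assumed to be prepared by the caller.
  static HRegion exampleCreateInitializedRegion(final HRegionInfo info, final Path rootDir,
      final Configuration conf, final HTableDescriptor htd, final WAL wal) throws IOException {
    return createHRegion(info, rootDir, conf, htd, wal, true /* initialize */);
  }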
6206
6207
6208   /**
6209    * Open a Region.
6210    * @param info Info for region to be opened.
6211    * @param wal WAL for region to use. This method will call
6212    * WAL#setSequenceNumber(long) passing the result of the call to
6213    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6214    * up.  HRegionStore does this every time it opens a new region.
6215    * @return new HRegion
6216    *
6217    * @throws IOException
6218    */
6219   public static HRegion openHRegion(final HRegionInfo info,
6220       final HTableDescriptor htd, final WAL wal,
6221       final Configuration conf)
6222   throws IOException {
6223     return openHRegion(info, htd, wal, conf, null, null);
6224   }
6225
6226   /**
6227    * Open a Region.
6228    * @param info Info for region to be opened
6229    * @param htd the table descriptor
6230    * @param wal WAL for region to use. This method will call
6231    * WAL#setSequenceNumber(long) passing the result of the call to
6232    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6233    * up.  HRegionStore does this every time it opens a new region.
6234    * @param conf The Configuration object to use.
6235    * @param rsServices An interface we can request flushes against.
6236    * @param reporter An interface we can report progress against.
6237    * @return new HRegion
6238    *
6239    * @throws IOException
6240    */
6241   public static HRegion openHRegion(final HRegionInfo info,
6242     final HTableDescriptor htd, final WAL wal, final Configuration conf,
6243     final RegionServerServices rsServices,
6244     final CancelableProgressable reporter)
6245   throws IOException {
6246     return openHRegion(FSUtils.getRootDir(conf), info, htd, wal, conf, rsServices, reporter);
6247   }
6248
6249   /**
6250    * Open a Region.
6251    * @param rootDir Root directory for HBase instance
6252    * @param info Info for region to be opened.
6253    * @param htd the table descriptor
6254    * @param wal WAL for region to use. This method will call
6255    * WAL#setSequenceNumber(long) passing the result of the call to
6256    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6257    * up.  HRegionStore does this every time it opens a new region.
6258    * @param conf The Configuration object to use.
6259    * @return new HRegion
6260    * @throws IOException
6261    */
6262   public static HRegion openHRegion(Path rootDir, final HRegionInfo info,
6263       final HTableDescriptor htd, final WAL wal, final Configuration conf)
6264   throws IOException {
6265     return openHRegion(rootDir, info, htd, wal, conf, null, null);
6266   }
6267
6268   /**
6269    * Open a Region.
6270    * @param rootDir Root directory for HBase instance
6271    * @param info Info for region to be opened.
6272    * @param htd the table descriptor
6273    * @param wal WAL for region to use. This method will call
6274    * WAL#setSequenceNumber(long) passing the result of the call to
6275    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6276    * up.  HRegionStore does this every time it opens a new region.
6277    * @param conf The Configuration object to use.
6278    * @param rsServices An interface we can request flushes against.
6279    * @param reporter An interface we can report progress against.
6280    * @return new HRegion
6281    * @throws IOException
6282    */
6283   public static HRegion openHRegion(final Path rootDir, final HRegionInfo info,
6284       final HTableDescriptor htd, final WAL wal, final Configuration conf,
6285       final RegionServerServices rsServices,
6286       final CancelableProgressable reporter)
6287   throws IOException {
6288     FileSystem fs = null;
6289     if (rsServices != null) {
6290       fs = rsServices.getFileSystem();
6291     }
6292     if (fs == null) {
6293       fs = FileSystem.get(conf);
6294     }
6295     return openHRegion(conf, fs, rootDir, info, htd, wal, rsServices, reporter);
6296   }
6297
6298   /**
6299    * Open a Region.
6300    * @param conf The Configuration object to use.
6301    * @param fs Filesystem to use
6302    * @param rootDir Root directory for HBase instance
6303    * @param info Info for region to be opened.
6304    * @param htd the table descriptor
6305    * @param wal WAL for region to use. This method will call
6306    * WAL#setSequenceNumber(long) passing the result of the call to
6307    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6308    * up.  HRegionStore does this every time it opens a new region.
6309    * @return new HRegion
6310    * @throws IOException
6311    */
6312   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
6313       final Path rootDir, final HRegionInfo info, final HTableDescriptor htd, final WAL wal)
6314       throws IOException {
6315     return openHRegion(conf, fs, rootDir, info, htd, wal, null, null);
6316   }
6317
6318   /**
6319    * Open a Region.
6320    * @param conf The Configuration object to use.
6321    * @param fs Filesystem to use
6322    * @param rootDir Root directory for HBase instance
6323    * @param info Info for region to be opened.
6324    * @param htd the table descriptor
6325    * @param wal WAL for region to use. This method will call
6326    * WAL#setSequenceNumber(long) passing the result of the call to
6327    * HRegion#getMinSequenceId() to ensure the WAL sequence id is properly kept
6328    * up.  The region server does this every time it opens a new region.
6329    * @param rsServices An interface we can request flushes against.
6330    * @param reporter An interface we can report progress against.
6331    * @return new HRegion
6332    * @throws IOException
6333    */
6334   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
6335       final Path rootDir, final HRegionInfo info, final HTableDescriptor htd, final WAL wal,
6336       final RegionServerServices rsServices, final CancelableProgressable reporter)
6337       throws IOException {
6338     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
6339     return openHRegion(conf, fs, rootDir, tableDir, info, htd, wal, rsServices, reporter);
6340   }
6341
6342   /**
6343    * Open a Region.
6344    * @param conf The Configuration object to use.
6345    * @param fs Filesystem to use
6346    * @param rootDir Root directory for HBase instance
        * @param tableDir Table directory under the root in which the region's data lives
6347    * @param info Info for region to be opened.
6348    * @param htd the table descriptor
6349    * @param wal WAL for region to use. This method will call
6350    * WAL#setSequenceNumber(long) passing the result of the call to
6351    * HRegion#getMinSequenceId() to ensure the WAL sequence id is properly kept
6352    * up.  The region server does this every time it opens a new region.
6353    * @param rsServices An interface we can request flushes against.
6354    * @param reporter An interface we can report progress against.
6355    * @return new HRegion
6356    * @throws IOException
6357    */
6358   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
6359       final Path rootDir, final Path tableDir, final HRegionInfo info, final HTableDescriptor htd,
6360       final WAL wal, final RegionServerServices rsServices,
6361       final CancelableProgressable reporter)
6362       throws IOException {
6363     if (info == null) throw new NullPointerException("Passed region info is null");
6364     if (LOG.isDebugEnabled()) {
6365       LOG.debug("Opening region: " + info);
6366     }
6367     HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, rsServices);
6368     return r.openHRegion(reporter);
6369   }
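       // A rough usage sketch of the open/close lifecycle this family of factory methods
       // supports (conf, hri, htd and wal are placeholders assumed to be set up by the
       // caller, e.g. in a test):
       //
       //   HRegion region = HRegion.openHRegion(FSUtils.getRootDir(conf), hri, htd, wal, conf);
       //   try {
       //     // read from / write to the region
       //   } finally {
       //     region.close();
       //   }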
6370
6371   @VisibleForTesting
6372   public NavigableMap<byte[], Integer> getReplicationScope() {
6373     return this.replicationScope;
6374   }
6375
6376   /**
6377    * Useful when reopening a closed region (normally for unit tests)
6378    * @param other original object
6379    * @param reporter An interface we can report progress against.
6380    * @return new HRegion
6381    * @throws IOException
6382    */
6383   public static HRegion openHRegion(final HRegion other, final CancelableProgressable reporter)
6384       throws IOException {
6385     HRegionFileSystem regionFs = other.getRegionFileSystem();
6386     HRegion r = newHRegion(regionFs.getTableDir(), other.getWAL(), regionFs.getFileSystem(),
6387         other.baseConf, other.getRegionInfo(), other.getTableDesc(), null);
6388     return r.openHRegion(reporter);
6389   }
6390
6391   public static Region openHRegion(final Region other, final CancelableProgressable reporter)
6392         throws IOException {
6393     return openHRegion((HRegion)other, reporter);
6394   }
6395
6396   /**
6397    * Open HRegion.
6398    * Calls initialize and sets sequenceId.
6399    * @return Returns <code>this</code>
6400    * @throws IOException
6401    */
6402   protected HRegion openHRegion(final CancelableProgressable reporter)
6403   throws IOException {
6404     // Refuse to open the region if we are missing local compression support
6405     checkCompressionCodecs();
6406     // Refuse to open the region if encryption configuration is incorrect or
6407     // codec support is missing
6408     checkEncryption();
6409     // Refuse to open the region if a required class cannot be loaded
6410     checkClassLoading();
6411     this.openSeqNum = initialize(reporter);
6412     this.mvcc.advanceTo(openSeqNum);
6413     if (wal != null && getRegionServerServices() != null && !writestate.readOnly
6414         && !recovering) {
6415       // Only write the region open event marker to WAL if (1) we are not read-only
6416       // (2) dist log replay is off or we are not recovering. In case region is
6417       // recovering, the open event will be written at setRecovering(false)
6418       writeRegionOpenMarker(wal, openSeqNum);
6419     }
6420     return this;
6421   }
6422
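       /**
        * Instantiate the given region against its on-disk data and run its warmup
        * initialization (see initializeWarmup) rather than a full open; unlike
        * openHRegion, no region open marker is written to the WAL here.
        * @param info Info for region to warm up.
        * @param htd the table descriptor
        * @param wal WAL for region to use
        * @param conf The Configuration object to use.
        * @param rsServices An interface we can request flushes against.
        * @param reporter An interface we can report progress against.
        * @throws IOException
        */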
6423   public static void warmupHRegion(final HRegionInfo info,
6424       final HTableDescriptor htd, final WAL wal, final Configuration conf,
6425       final RegionServerServices rsServices,
6426       final CancelableProgressable reporter)
6427       throws IOException {
6428
6429     if (info == null) throw new NullPointerException("Passed region info is null");
6430
6431     if (LOG.isDebugEnabled()) {
6432       LOG.debug("HRegion.Warming up region: " + info);
6433     }
6434
6435     Path rootDir = FSUtils.getRootDir(conf);
6436     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
6437
6438     FileSystem fs = null;
6439     if (rsServices != null) {
6440       fs = rsServices.getFileSystem();
6441     }
6442     if (fs == null) {
6443       fs = FileSystem.get(conf);
6444     }
6445
6446     HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, null);
6447     r.initializeWarmup(reporter);
6448   }
6449
6450
6451   private void checkCompressionCodecs() throws IOException {
6452     for (HColumnDescriptor fam: this.htableDescriptor.getColumnFamilies()) {
6453       CompressionTest.testCompression(fam.getCompressionType());
6454       CompressionTest.testCompression(fam.getCompactionCompressionType());
6455     }
6456   }
6457
6458   private void checkEncryption() throws IOException {
6459     for (HColumnDescriptor fam: this.htableDescriptor.getColumnFamilies()) {
6460       EncryptionTest.testEncryption(conf, fam.getEncryptionType(), fam.getEncryptionKey());
6461     }
6462   }
6463
6464   private void checkClassLoading() throws IOException {
6465     RegionSplitPolicy.getSplitPolicyClass(this.htableDescriptor, conf);
6466     RegionCoprocessorHost.testTableCoprocessorAttrs(conf, this.htableDescriptor);
6467   }
6468
6469   /**
6470    * Create a daughter region given a temp directory with the region data.
6471    * @param hri Spec. for daughter region to open.
6472    * @throws IOException
6473    */
6474   HRegion createDaughterRegionFromSplits(final HRegionInfo hri) throws IOException {
6475     // Move the files from the temporary .splits to the final /table/region directory
6476     fs.commitDaughterRegion(hri);
6477
6478     // Create the daughter HRegion instance
6479     HRegion r = HRegion.newHRegion(this.fs.getTableDir(), this.getWAL(), fs.getFileSystem(),
6480         this.getBaseConf(), hri, this.getTableDesc(), rsServices);
6481     r.readRequestsCount.set(this.getReadRequestsCount() / 2);
6482     r.filteredReadRequestsCount.set(this.getFilteredReadRequestsCount() / 2);
6483     r.writeRequestsCount.set(this.getWriteRequestsCount() / 2);
6484     return r;
6485   }
6486
6487   /**
6488    * Create a merged region given a temp directory with the region data.
        * @param mergedRegionInfo region info for the new, merged region
6489    * @param region_b another merging region
6490    * @return merged HRegion
6491    * @throws IOException
6492    */
6493   HRegion createMergedRegionFromMerges(final HRegionInfo mergedRegionInfo,
6494       final HRegion region_b) throws IOException {
6495     HRegion r = HRegion.newHRegion(this.fs.getTableDir(), this.getWAL(),
6496         fs.getFileSystem(), this.getBaseConf(), mergedRegionInfo,
6497         this.getTableDesc(), this.rsServices);
6498     r.readRequestsCount.set(this.getReadRequestsCount()
6499         + region_b.getReadRequestsCount());
6500     r.filteredReadRequestsCount.set(this.getFilteredReadRequestsCount()
6501       + region_b.getFilteredReadRequestsCount());
6502     r.writeRequestsCount.set(this.getWriteRequestsCount()
6504         + region_b.getWriteRequestsCount());
6505     this.fs.commitMergedRegion(mergedRegionInfo);
6506     return r;
6507   }
6508
6509   /**
6510    * Inserts a new region's meta information into the passed
6511    * <code>meta</code> region. Used by the HMaster bootstrap code adding
6512    * new table to hbase:meta table.
6513    *
6514    * @param meta hbase:meta HRegion to be updated
6515    * @param r HRegion to add to <code>meta</code>
6516    *
6517    * @throws IOException
6518    */
6519   // TODO remove since only test and merge use this
6520   public static void addRegionToMETA(final HRegion meta, final HRegion r) throws IOException {
6521     meta.checkResources();
6522     // The row key is the region name
6523     byte[] row = r.getRegionInfo().getRegionName();
6524     final long now = EnvironmentEdgeManager.currentTime();
6525     final List<Cell> cells = new ArrayList<Cell>(2);
6526     cells.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
6527       HConstants.REGIONINFO_QUALIFIER, now,
6528       r.getRegionInfo().toByteArray()));
6529     // Record the version of the meta table in the catalog family as well.
6530     cells.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
6531       HConstants.META_VERSION_QUALIFIER, now,
6532       Bytes.toBytes(HConstants.META_VERSION)));
6533     meta.put(row, HConstants.CATALOG_FAMILY, cells);
6534   }
6535
6536   /**
6537    * Computes the Path of the HRegion
6538    *
6539    * @param tabledir qualified path for table
6540    * @param name ENCODED region name
6541    * @return Path of HRegion directory
6542    * @deprecated For tests only; to be removed.
6543    */
6544   @Deprecated
6545   public static Path getRegionDir(final Path tabledir, final String name) {
6546     return new Path(tabledir, name);
6547   }
6548
6549   /**
6550    * Computes the Path of the HRegion
6551    *
6552    * @param rootdir qualified path of HBase root directory
6553    * @param info HRegionInfo for the region
6554    * @return qualified path of region directory
6555    * @deprecated For tests only; to be removed.
6556    */
6557   @Deprecated
6558   @VisibleForTesting
6559   public static Path getRegionDir(final Path rootdir, final HRegionInfo info) {
6560     return new Path(
6561       FSUtils.getTableDir(rootdir, info.getTable()), info.getEncodedName());
6562   }
6563
6564   /**
6565    * Determines if the specified row is within the row range specified by the
6566    * specified HRegionInfo
6567    *
6568    * @param info HRegionInfo that specifies the row range
6569    * @param row row to be checked
6570    * @return true if the row is within the range specified by the HRegionInfo
6571    */
6572   public static boolean rowIsInRange(HRegionInfo info, final byte [] row) {
6573     return ((info.getStartKey().length == 0) ||
6574         (Bytes.compareTo(info.getStartKey(), row) <= 0)) &&
6575         ((info.getEndKey().length == 0) ||
6576             (Bytes.compareTo(info.getEndKey(), row) > 0));
6577   }
6578
6579   public static boolean rowIsInRange(HRegionInfo info, final byte [] row, final int offset,
6580       final short length) {
6581     return ((info.getStartKey().length == 0) ||
6582         (Bytes.compareTo(info.getStartKey(), 0, info.getStartKey().length,
6583           row, offset, length) <= 0)) &&
6584         ((info.getEndKey().length == 0) ||
6585           (Bytes.compareTo(info.getEndKey(), 0, info.getEndKey().length, row, offset, length) > 0));
6586   }
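       // A small worked example of the range check (table and key values are illustrative):
       //
       //   HRegionInfo hri = new HRegionInfo(TableName.valueOf("t"),
       //       Bytes.toBytes("b"), Bytes.toBytes("d"));
       //   HRegion.rowIsInRange(hri, Bytes.toBytes("b"));  // true: start key is inclusive
       //   HRegion.rowIsInRange(hri, Bytes.toBytes("c"));  // true
       //   HRegion.rowIsInRange(hri, Bytes.toBytes("d"));  // false: end key is exclusive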
6587
6588   /**
6589    * Merge two HRegions.  The regions must be adjacent and must not overlap.
6590    *
6591    * @return new merged HRegion
6592    * @throws IOException
6593    */
6594   public static HRegion mergeAdjacent(final HRegion srcA, final HRegion srcB)
6595   throws IOException {
6596     HRegion a = srcA;
6597     HRegion b = srcB;
6598
6599     // Make sure that srcA comes first; important for key-ordering during
6600     // write of the merged file.
6601     if (srcA.getRegionInfo().getStartKey() == null) {
6602       if (srcB.getRegionInfo().getStartKey() == null) {
6603         throw new IOException("Cannot merge two regions with null start key");
6604       }
6605       // A's start key is null but B's isn't. Assume A comes before B
6606     } else if ((srcB.getRegionInfo().getStartKey() == null) ||
6607       (Bytes.compareTo(srcA.getRegionInfo().getStartKey(),
6608         srcB.getRegionInfo().getStartKey()) > 0)) {
6609       a = srcB;
6610       b = srcA;
6611     }
6612
6613     if (!(Bytes.compareTo(a.getRegionInfo().getEndKey(),
6614         b.getRegionInfo().getStartKey()) == 0)) {
6615       throw new IOException("Cannot merge non-adjacent regions");
6616     }
6617     return merge(a, b);
6618   }
6619
6620   /**
6621    * Merge two regions whether they are adjacent or not.
6622    *
6623    * @param a region a
6624    * @param b region b
6625    * @return new merged region
6626    * @throws IOException
6627    */
6628   public static HRegion merge(final HRegion a, final HRegion b) throws IOException {
6629     if (!a.getRegionInfo().getTable().equals(b.getRegionInfo().getTable())) {
6630       throw new IOException("Regions do not belong to the same table");
6631     }
6632
6633     FileSystem fs = a.getRegionFileSystem().getFileSystem();
6634     // Make sure each region's cache is empty
6635     a.flush(true);
6636     b.flush(true);
6637
6638     // Compact each region so we only have one store file per family
6639     a.compact(true);
6640     if (LOG.isDebugEnabled()) {
6641       LOG.debug("Files for region: " + a);
6642       a.getRegionFileSystem().logFileSystemState(LOG);
6643     }
6644     b.compact(true);
6645     if (LOG.isDebugEnabled()) {
6646       LOG.debug("Files for region: " + b);
6647       b.getRegionFileSystem().logFileSystemState(LOG);
6648     }
6649
6650     RegionMergeTransactionImpl rmt = new RegionMergeTransactionImpl(a, b, true);
6651     if (!rmt.prepare(null)) {
6652       throw new IOException("Unable to merge regions " + a + " and " + b);
6653     }
6654     HRegionInfo mergedRegionInfo = rmt.getMergedRegionInfo();
6655     LOG.info("starting merge of regions: " + a + " and " + b
6656         + " into new region " + mergedRegionInfo.getRegionNameAsString()
6657         + " with start key <"
6658         + Bytes.toStringBinary(mergedRegionInfo.getStartKey())
6659         + "> and end key <"
6660         + Bytes.toStringBinary(mergedRegionInfo.getEndKey()) + ">");
6661     HRegion dstRegion;
6662     try {
6663       dstRegion = (HRegion)rmt.execute(null, null);
6664     } catch (IOException ioe) {
6665       rmt.rollback(null, null);
6666       throw new IOException("Failed merging region " + a + " and " + b
6667           + ", and successfully rolled back");
6668     }
6669     dstRegion.compact(true);
6670
6671     if (LOG.isDebugEnabled()) {
6672       LOG.debug("Files for new region");
6673       dstRegion.getRegionFileSystem().logFileSystemState(LOG);
6674     }
6675
6676     // clear the compacted files if any
6677     for (Store s : dstRegion.getStores()) {
6678       s.closeAndArchiveCompactedFiles();
6679     }
6680     if (dstRegion.getRegionFileSystem().hasReferences(dstRegion.getTableDesc())) {
6681       throw new IOException("Merged region " + dstRegion
6682           + " still has references after the compaction, is compaction canceled?");
6683     }
6684
6685     // Archiving the 'A' region
6686     HFileArchiver.archiveRegion(a.getBaseConf(), fs, a.getRegionInfo());
6687     // Archiving the 'B' region
6688     HFileArchiver.archiveRegion(b.getBaseConf(), fs, b.getRegionInfo());
6689
6690     LOG.info("merge completed. New region is " + dstRegion);
6691     return dstRegion;
6692   }
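       // A minimal sketch of how these merge utilities are meant to be driven (regionA and
       // regionB are placeholders for two already-open HRegions of the same table):
       //
       //   HRegion merged = HRegion.mergeAdjacent(regionA, regionB);  // regions must be adjacent
       //   // or, when adjacency should not be enforced:
       //   HRegion mergedAnyway = HRegion.merge(regionA, regionB);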
6693
6694   @Override
6695   public Result get(final Get get) throws IOException {
6696     prepareGet(get);
6697     List<Cell> results = get(get, true);
6698     boolean stale = this.getRegionInfo().getReplicaId() != 0;
6699     return Result.create(results, get.isCheckExistenceOnly() ? !results.isEmpty() : null, stale);
6700   }
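       // A rough example of a point read against this region (family and qualifier names
       // are placeholders):
       //
       //   Get get = new Get(Bytes.toBytes("row1"));
       //   get.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"));
       //   Result result = region.get(get);
       //   byte[] value = result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("q"));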
6701
6702   void prepareGet(final Get get) throws IOException, NoSuchColumnFamilyException {
6703     checkRow(get.getRow(), "Get");
6704     // Verify families are all valid
6705     if (get.hasFamilies()) {
6706       for (byte [] family: get.familySet()) {
6707         checkFamily(family);
6708       }
6709     } else { // Adding all families to scanner
6710       for (byte[] family: this.htableDescriptor.getFamiliesKeys()) {
6711         get.addFamily(family);
6712       }
6713     }
6714   }
6715
6716   @Override
6717   public List<Cell> get(Get get, boolean withCoprocessor) throws IOException {
6718
6719     List<Cell> results = new ArrayList<Cell>();
6720
6721     // pre-get CP hook
6722     if (withCoprocessor && (coprocessorHost != null)) {
6723       if (coprocessorHost.preGet(get, results)) {
6724         return results;
6725       }
6726     }
6727     long before = EnvironmentEdgeManager.currentTime();
6728     Scan scan = new Scan(get);
6729
6730     RegionScanner scanner = null;
6731     try {
6732       scanner = getScanner(scan);
6733       scanner.next(results);
6734     } finally {
6735       if (scanner != null)
6736         scanner.close();
6737     }
6738
6739     // post-get CP hook
6740     if (withCoprocessor && (coprocessorHost != null)) {
6741       coprocessorHost.postGet(get, results);
6742     }
6743
6744     metricsUpdateForGet(results, before);
6745
6746     return results;
6747   }
6748
6749   void metricsUpdateForGet(List<Cell> results, long before) {
6750     if (this.metricsRegion != null) {
6751       long totalSize = 0L;
6752       for (Cell cell : results) {
6753         // This should give an estimate of the size of the cell in the result. Why do we
6754         // need to know how the codec serializes it??
6755         totalSize += CellUtil.estimatedSerializedSizeOf(cell);
6756       }
6757       this.metricsRegion.updateGetSize(totalSize);
6758       this.metricsRegion.updateGet(EnvironmentEdgeManager.currentTime() - before);
6759     }
6760   }
6761
6762   @Override
6763   public void mutateRow(RowMutations rm) throws IOException {
6764     // Don't need nonces here - RowMutations only supports puts and deletes
6765     mutateRowsWithLocks(rm.getMutations(), Collections.singleton(rm.getRow()));
6766   }
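       // A rough sketch of an atomic multi-operation mutation on a single row (the row/fam/qual
       // byte arrays are placeholders):
       //
       //   RowMutations rm = new RowMutations(row);
       //   Put put = new Put(row);
       //   put.addColumn(fam, qualA, value);
       //   rm.add(put);
       //   Delete delete = new Delete(row);
       //   delete.addColumns(fam, qualB);
       //   rm.add(delete);
       //   region.mutateRow(rm);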
6767
6768   /**
6769    * Perform atomic mutations within the region w/o nonces.
6770    * See {@link #mutateRowsWithLocks(Collection, Collection, long, long)}
6771    */
6772   public void mutateRowsWithLocks(Collection<Mutation> mutations,
6773       Collection<byte[]> rowsToLock) throws IOException {
6774     mutateRowsWithLocks(mutations, rowsToLock, HConstants.NO_NONCE, HConstants.NO_NONCE);
6775   }
6776
6777   /**
6778    * Perform atomic mutations within the region.
6779    * @param mutations The list of mutations to perform.
6780    * <code>mutations</code> can contain operations for multiple rows.
6781    * Caller has to ensure that all rows are contained in this region.
6782    * @param rowsToLock Rows to lock. If multiple rows are locked, care should be
6783    * taken that <code>rowsToLock</code> is sorted in order to avoid deadlocks.
6784    * @param nonceGroup Optional nonce group of the operation (client Id)
6785    * @param nonce Optional nonce of the operation (unique random id to ensure
6786    * "more idempotence")
6787    * @throws IOException
6788    */
6789   @Override
6790   public void mutateRowsWithLocks(Collection<Mutation> mutations,
6791       Collection<byte[]> rowsToLock, long nonceGroup, long nonce) throws IOException {
6792     MultiRowMutationProcessor proc = new MultiRowMutationProcessor(mutations, rowsToLock);
6793     processRowsWithLocks(proc, -1, nonceGroup, nonce);
6794   }
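       // A rough sketch of an atomic cross-row mutation within this region; all rows must
       // belong to the region and, per the javadoc above, rowsToLock should be sorted:
       //
       //   List<Mutation> mutations = Arrays.<Mutation>asList(put1, put2);
       //   SortedSet<byte[]> rowsToLock = new TreeSet<byte[]>(Bytes.BYTES_COMPARATOR);
       //   rowsToLock.add(put1.getRow());
       //   rowsToLock.add(put2.getRow());
       //   region.mutateRowsWithLocks(mutations, rowsToLock);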
6795
6796   /**
6797    * @return statistics about the current load of the region
6798    */
6799   public ClientProtos.RegionLoadStats getLoadStatistics() {
6800     if (!regionStatsEnabled) {
6801       return null;
6802     }
6803     ClientProtos.RegionLoadStats.Builder stats = ClientProtos.RegionLoadStats.newBuilder();
6804     stats.setMemstoreLoad((int) (Math.min(100, (this.memstoreSize.get() * 100) / this
6805         .memstoreFlushSize)));
6806     stats.setHeapOccupancy(
6807         (int) (rsServices.getHeapMemoryManager().getHeapOccupancyPercent() * 100));
6808     stats.setCompactionPressure((int) Math.min(100, rsServices.getCompactionPressure() * 100));
6809     return stats.build();
6810   }
6811
6812   @Override
6813   public void processRowsWithLocks(RowProcessor<?,?> processor) throws IOException {
6814     processRowsWithLocks(processor, rowProcessorTimeout, HConstants.NO_NONCE,
6815       HConstants.NO_NONCE);
6816   }
6817
6818   @Override
6819   public void processRowsWithLocks(RowProcessor<?,?> processor, long nonceGroup, long nonce)
6820       throws IOException {
6821     processRowsWithLocks(processor, rowProcessorTimeout, nonceGroup, nonce);
6822   }
6823
6824   @Override
6825   public void processRowsWithLocks(RowProcessor<?,?> processor, long timeout,
6826       long nonceGroup, long nonce) throws IOException {
6827     for (byte[] row : processor.getRowsToLock()) {
6828       checkRow(row, "processRowsWithLocks");
6829     }
6830     if (!processor.readOnly()) {
6831       checkReadOnly();
6832     }
6833     checkResources();
6834     startRegionOperation();
6835     WALEdit walEdit = new WALEdit();
6836
6837     // STEP 1. Run pre-process hook
6838     preProcess(processor, walEdit);
6839     // Short circuit the read only case
6840     if (processor.readOnly()) {
6841       try {
6842         long now = EnvironmentEdgeManager.currentTime();
6843         doProcessRowWithTimeout(processor, now, this, null, null, timeout);
6844         processor.postProcess(this, walEdit, true);
6845       } finally {
6846         closeRegionOperation();
6847       }
6848       return;
6849     }
6850
6851     boolean locked;
6852     List<RowLock> acquiredRowLocks;
6853     long addedSize = 0;
6854     List<Mutation> mutations = new ArrayList<Mutation>();
6855     Collection<byte[]> rowsToLock = processor.getRowsToLock();
6856     // This is assigned by mvcc either explicitly below or in the guts of the WAL append
6857     // when it assigns the edit a sequence id (a.k.a. the mvcc write number).
6858     WriteEntry writeEntry = null;
6859     try {
6860       // STEP 2. Acquire the row lock(s)
6861       acquiredRowLocks = new ArrayList<RowLock>(rowsToLock.size());
6862       for (byte[] row : rowsToLock) {
6863         // Attempt to lock all involved rows, throw if any lock times out
6864         // use a writer lock for mixed reads and writes
6865         acquiredRowLocks.add(getRowLockInternal(row, false));
6866       }
6867       // STEP 3. Region lock
6868       lock(this.updatesLock.readLock(), acquiredRowLocks.size() == 0 ? 1 : acquiredRowLocks.size());
6869       locked = true;
6870       boolean success = false;
6871       long now = EnvironmentEdgeManager.currentTime();
6872       try {
6873         // STEP 4. Let the processor scan the rows, generate mutations and add waledits
6874         doProcessRowWithTimeout(processor, now, this, mutations, walEdit, timeout);
6875         if (!mutations.isEmpty()) {
6876           // STEP 5. Call the preBatchMutate hook
6877           processor.preBatchMutate(this, walEdit);
6878
6879           // STEP 6. Append and sync if walEdit has data to write out.
6880           if (!walEdit.isEmpty()) {
6881             writeEntry = doWALAppend(walEdit, getEffectiveDurability(processor.useDurability()),
6882                 processor.getClusterIds(), now, nonceGroup, nonce);
6883           } else {
6884             // We are here if WAL is being skipped.
6885             writeEntry = this.mvcc.begin();
6886           }
6887
6888           // STEP 7. Apply to memstore
6889           long sequenceId = writeEntry.getWriteNumber();
6890           for (Mutation m : mutations) {
6891             // Handle any tag based cell features.
6892             // TODO: Do we need to call rewriteCellTags down in applyToMemstore()? Why not before
6893             // so tags go into WAL?
6894             rewriteCellTags(m.getFamilyCellMap(), m);
6895             for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) {
6896               Cell cell = cellScanner.current();
6897               if (walEdit.isEmpty()) {
6898                 // If walEdit is empty, we put nothing in WAL. WAL stamps Cells with sequence id.
6899                 // If no WAL, need to stamp it here.
6900                 CellUtil.setSequenceId(cell, sequenceId);
6901               }
6902               Store store = getStore(cell);
6903               addedSize += applyToMemstore(store, cell, sequenceId);
6904             }
6905           }
6906           // STEP 8. Complete mvcc.
6907           mvcc.completeAndWait(writeEntry);
6908           writeEntry = null;
6909
6910           // STEP 9. Release region lock
6911           if (locked) {
6912             this.updatesLock.readLock().unlock();
6913             locked = false;
6914           }
6915
6916           // STEP 10. Release row lock(s)
6917           releaseRowLocks(acquiredRowLocks);
6918
6919           // STEP 11. call postBatchMutate hook
6920           processor.postBatchMutate(this);
6921         }
6922         success = true;
6923       } finally {
6924         // Call complete rather than completeAndWait because we probably had an error if writeEntry is still non-null
6925         if (writeEntry != null) mvcc.complete(writeEntry);
6926         if (locked) {
6927           this.updatesLock.readLock().unlock();
6928         }
6929         // release locks if some were acquired but another timed out
6930         releaseRowLocks(acquiredRowLocks);
6931       }
6932
6933       // 12. Run post-process hook
6934       processor.postProcess(this, walEdit, success);
6935     } finally {
6936       closeRegionOperation();
6937       if (!mutations.isEmpty()) {
6938         long newSize = this.addAndGetGlobalMemstoreSize(addedSize);
6939         requestFlushIfNeeded(newSize);
6940       }
6941     }
6942   }
6943
6944   private void preProcess(final RowProcessor<?,?> processor, final WALEdit walEdit)
6945   throws IOException {
6946     try {
6947       processor.preProcess(this, walEdit);
6948     } catch (IOException e) {
6949       closeRegionOperation();
6950       throw e;
6951     }
6952   }
6953
6954   private void doProcessRowWithTimeout(final RowProcessor<?,?> processor,
6955                                        final long now,
6956                                        final HRegion region,
6957                                        final List<Mutation> mutations,
6958                                        final WALEdit walEdit,
6959                                        final long timeout) throws IOException {
6960     // Short circuit the no time bound case.
6961     if (timeout < 0) {
6962       try {
6963         processor.process(now, region, mutations, walEdit);
6964       } catch (IOException e) {
6965         LOG.warn("RowProcessor:" + processor.getClass().getName() +
6966             " throws Exception on row(s):" +
6967             Bytes.toStringBinary(
6968               processor.getRowsToLock().iterator().next()) + "...", e);
6969         throw e;
6970       }
6971       return;
6972     }
6973
6974     // Case with time bound
6975     FutureTask<Void> task =
6976       new FutureTask<Void>(new Callable<Void>() {
6977         @Override
6978         public Void call() throws IOException {
6979           try {
6980             processor.process(now, region, mutations, walEdit);
6981             return null;
6982           } catch (IOException e) {
6983             LOG.warn("RowProcessor:" + processor.getClass().getName() +
6984                 " throws Exception on row(s):" +
6985                 Bytes.toStringBinary(
6986                     processor.getRowsToLock().iterator().next()) + "...", e);
6987             throw e;
6988           }
6989         }
6990       });
6991     rowProcessorExecutor.execute(task);
6992     try {
6993       task.get(timeout, TimeUnit.MILLISECONDS);
6994     } catch (TimeoutException te) {
6995       LOG.error("RowProcessor timeout:" + timeout + " ms on row(s):" +
6996           Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) +
6997           "...");
6998       throw new IOException(te);
6999     } catch (Exception e) {
7000       throw new IOException(e);
7001     }
7002   }
7003
7004   public Result append(Append append) throws IOException {
7005     return append(append, HConstants.NO_NONCE, HConstants.NO_NONCE);
7006   }
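       // A rough example (the row/fam/qual byte arrays are placeholders):
       //
       //   Append append = new Append(row);
       //   append.add(fam, qual, Bytes.toBytes("-suffix"));
       //   Result result = region.append(append);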
7007
7008   @Override
7009   public Result append(Append mutation, long nonceGroup, long nonce) throws IOException {
7010     return doDelta(Operation.APPEND, mutation, nonceGroup, nonce, mutation.isReturnResults());
7011   }
7012
7013   public Result increment(Increment increment) throws IOException {
7014     return increment(increment, HConstants.NO_NONCE, HConstants.NO_NONCE);
7015   }
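       // A rough example; the existing stored value, if any, must be an 8-byte long
       // (see getLongValue below):
       //
       //   Increment increment = new Increment(row);
       //   increment.addColumn(fam, qual, 1L);
       //   Result result = region.increment(increment);
       //   long updated = Bytes.toLong(result.getValue(fam, qual));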
7016
7017   @Override
7018   public Result increment(Increment mutation, long nonceGroup, long nonce)
7019   throws IOException {
7020     return doDelta(Operation.INCREMENT, mutation, nonceGroup, nonce, mutation.isReturnResults());
7021   }
7022
7023   /**
7024    * Add "deltas" to Cells. Deltas are increments or appends. Switch on <code>op</code>.
7025    *
7026    * <p>If increment, add deltas to current values or if an append, then
7027    * append the deltas to the current Cell values.
7028    *
7029    * <p>Append and Increment code paths are mostly the same. They differ in just a few places.
7030    * This method does the code path for increment and append and then in key spots, switches
7031    * on the passed in <code>op</code> to do increment or append specific paths.
7032    */
7033   private Result doDelta(Operation op, Mutation mutation, long nonceGroup, long nonce,
7034       boolean returnResults)
7035   throws IOException {
7036     checkReadOnly();
7037     checkResources();
7038     checkRow(mutation.getRow(), op.toString());
7039     checkFamilies(mutation.getFamilyCellMap().keySet());
7040     this.writeRequestsCount.increment();
7041     WriteEntry writeEntry = null;
7042     startRegionOperation(op);
7043     long accumulatedResultSize = 0;
7044     List<Cell> results = returnResults? new ArrayList<Cell>(mutation.size()): null;
7045     RowLock rowLock = getRowLockInternal(mutation.getRow(), false);
7046     try {
7047       lock(this.updatesLock.readLock());
7048       try {
7049         Result cpResult = doCoprocessorPreCall(op, mutation);
7050         if (cpResult != null) {
7051           return returnResults? cpResult: null;
7052         }
7053         Durability effectiveDurability = getEffectiveDurability(mutation.getDurability());
7054         Map<Store, List<Cell>> forMemStore =
7055             new HashMap<Store, List<Cell>>(mutation.getFamilyCellMap().size());
7056         // Reckon Cells to apply to WAL --  in returned walEdit -- and what to add to memstore and
7057         // what to return back to the client (in 'forMemStore' and 'results' respectively).
7058         WALEdit walEdit = reckonDeltas(op, mutation, effectiveDurability, forMemStore, results);
7059         // Actually write to WAL now if a walEdit to apply.
7060         if (walEdit != null && !walEdit.isEmpty()) {
7061           writeEntry = doWALAppend(walEdit, durability, nonceGroup, nonce);
7062         } else {
7063           // If walEdits is empty, it means we skipped the WAL; update counters and start an mvcc
7064           // transaction.
7065           recordMutationWithoutWal(mutation.getFamilyCellMap());
7066           writeEntry = mvcc.begin();
7067         }
7068         // Now write to MemStore. Do it a column family at a time.
7069         long sequenceId = writeEntry.getWriteNumber();
7070         for (Map.Entry<Store, List<Cell>> e: forMemStore.entrySet()) {
7071           accumulatedResultSize +=
7072               applyToMemstore(e.getKey(), e.getValue(), true, false, sequenceId);
7073         }
7074         mvcc.completeAndWait(writeEntry);
7075         writeEntry = null;
7076       } finally {
7077         this.updatesLock.readLock().unlock();
7078       }
7079       // If results is null, then client asked that we not return the calculated results.
7080       return results != null && returnResults? Result.create(results): null;
7081     } finally {
7082       // Call complete always, even on success. doDelta is doing a Get READ_UNCOMMITTED when it
7083       // goes to get the current value under an exclusive lock, so there is no need to wait before
7084       // returning to the client. That means the only way to read-your-own-increment or append is
7085       // to come in with a 0 increment.
7086       if (writeEntry != null) mvcc.complete(writeEntry);
7087       rowLock.release();
7088       // Request a cache flush if over the limit.  Do it outside update lock.
7089       if (isFlushSize(this.addAndGetGlobalMemstoreSize(accumulatedResultSize))) requestFlush();
7090       closeRegionOperation(op);
7091       if (this.metricsRegion != null) {
7092         switch (op) {
7093           case INCREMENT:
7094             this.metricsRegion.updateIncrement();
7095             break;
7096           case APPEND:
7097             this.metricsRegion.updateAppend();
7098             break;
7099           default:
7100             break;
7101         }
7102       }
7103     }
7104   }
7105
7106   private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, long nonceGroup,
7107       long nonce)
7108   throws IOException {
7109     return doWALAppend(walEdit, durability, WALKey.EMPTY_UUIDS, System.currentTimeMillis(),
7110       nonceGroup, nonce);
7111   }
7112
7113   /**
7114    * @return writeEntry associated with this append
7115    */
7116   private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, List<UUID> clusterIds,
7117       long now, long nonceGroup, long nonce)
7118   throws IOException {
7119     WriteEntry writeEntry = null;
7120     // Using default cluster id, as this can only happen in the originating cluster.
7121     // A slave cluster receives the final value (not the delta) as a Put. We use HLogKey
7122     // here instead of WALKey directly to support legacy coprocessors.
7123     WALKey walKey = new WALKey(this.getRegionInfo().getEncodedNameAsBytes(),
7124       this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now, clusterIds,
7125       nonceGroup, nonce, mvcc, this.getReplicationScope());
7126     try {
7127       long txid =
7128         this.wal.append(this.getRegionInfo(), walKey, walEdit, true);
7129       // Call sync on our edit.
7130       if (txid != 0) sync(txid, durability);
7131       writeEntry = walKey.getWriteEntry();
7132     } catch (IOException ioe) {
7133       if (walKey != null) mvcc.complete(walKey.getWriteEntry());
7134       throw ioe;
7135     }
7136     return writeEntry;
7137   }
7138
7139   /**
7140    * Do coprocessor pre-increment or pre-append call.
7141    * @return the Result returned by the coprocessor, in which case all further processing is
7142    *  bypassed and the proffered Result is returned instead; or null, which means proceed.
7143    */
7144   private Result doCoprocessorPreCall(final Operation op, final Mutation mutation)
7145   throws IOException {
7146     Result result = null;
7147     if (this.coprocessorHost != null) {
7148       switch(op) {
7149         case INCREMENT:
7150           result = this.coprocessorHost.preIncrementAfterRowLock((Increment)mutation);
7151           break;
7152         case APPEND:
7153           result = this.coprocessorHost.preAppendAfterRowLock((Append)mutation);
7154           break;
7155         default: throw new UnsupportedOperationException(op.toString());
7156       }
7157     }
7158     return result;
7159   }
7160
7161   /**
7162    * Reckon the Cells to apply to WAL, memstore, and to return to the Client; these sets are not
7163    * always the same, depending on whether we write to the WAL and whether the amount to increment
7164    * is zero (in that case we write back nothing and just return the latest Cell value to the client).
7165    *
7166    * @param results Fill in here what goes back to the Client if it is non-null (if null, client
7167    *  doesn't want results).
7168    * @param forMemStore Fill in here what to apply to the MemStore (by Store).
7169    * @return A WALEdit to apply to WAL or null if we are to skip the WAL.
7170    */
7171   private WALEdit reckonDeltas(final Operation op, final Mutation mutation,
7172       final Durability effectiveDurability, final Map<Store, List<Cell>> forMemStore,
7173       final List<Cell> results)
7174   throws IOException {
7175     WALEdit walEdit = null;
7176     long now = EnvironmentEdgeManager.currentTime();
7177     final boolean writeToWAL = effectiveDurability != Durability.SKIP_WAL;
7178     // Process a Store/family at a time.
7179     for (Map.Entry<byte [], List<Cell>> entry: mutation.getFamilyCellMap().entrySet()) {
7180       final byte [] columnFamilyName = entry.getKey();
7181       List<Cell> deltas = entry.getValue();
7182       Store store = this.stores.get(columnFamilyName);
7183       // Reckon for the Store what to apply to WAL and MemStore.
7184       List<Cell> toApply =
7185         reckonDeltasByStore(store, op, mutation, effectiveDurability, now, deltas, results);
7186       if (!toApply.isEmpty()) {
7187         forMemStore.put(store, toApply);
7188         if (writeToWAL) {
7189           if (walEdit == null) {
7190             walEdit = new WALEdit();
7191           }
7192           walEdit.getCells().addAll(toApply);
7193         }
7194       }
7195     }
7196     return walEdit;
7197   }
7198
7199   /**
7200    * Reckon the Cells to apply to WAL, memstore, and to return to the Client in passed
7201    * column family/Store.
7202    *
7203    * Does Get of current value and then adds passed in deltas for this Store returning the result.
7204    *
7205    * @param op Whether Increment or Append
7206    * @param mutation The encompassing Mutation object
7207    * @param deltas Changes to apply to this Store; either increment amount or data to append
7208    * @param results In here we accumulate all the Cells we are to return to the client; this List
7209    *  can be larger than the list of Cells we apply, e.g. when a delta is zero we don't write
7210    *  out a new value, just return the current value. If null, client doesn't want results returned.
7211    * @return Resulting Cells after <code>deltas</code> have been applied to current
7212    *  values. Side effect is our filling out of the <code>results</code> List.
7213    */
7214   private List<Cell> reckonDeltasByStore(final Store store, final Operation op,
7215       final Mutation mutation, final Durability effectiveDurability, final long now,
7216       final List<Cell> deltas, final List<Cell> results)
7217   throws IOException {
7218     byte [] columnFamily = store.getFamily().getName();
7219     List<Cell> toApply = new ArrayList<Cell>(deltas.size());
7220     // Get previous values for all columns in this family.
7221     List<Cell> currentValues = get(mutation, store, deltas,
7222         null/*Default IsolationLevel*/,
7223         op == Operation.INCREMENT? ((Increment)mutation).getTimeRange(): null);
7224     // Iterate the input columns and update existing values if they were found, otherwise
7225     // add new column initialized to the delta amount
7226     int currentValuesIndex = 0;
7227     for (int i = 0; i < deltas.size(); i++) {
7228       Cell delta = deltas.get(i);
7229       Cell currentValue = null;
7230       if (currentValuesIndex < currentValues.size() &&
7231           CellUtil.matchingQualifier(currentValues.get(currentValuesIndex), delta)) {
7232         currentValue = currentValues.get(currentValuesIndex);
7233         if (i < (deltas.size() - 1) && !CellUtil.matchingQualifier(delta, deltas.get(i + 1))) {
7234           currentValuesIndex++;
7235         }
7236       }
7237       // Switch on whether this an increment or an append building the new Cell to apply.
7238       Cell newCell = null;
7239       MutationType mutationType = null;
7240       boolean apply = true;
7241       switch (op) {
7242         case INCREMENT:
7243           mutationType = MutationType.INCREMENT;
7244           // If delta amount to apply is 0, don't write WAL or MemStore.
7245           long deltaAmount = getLongValue(delta);
7246           apply = deltaAmount != 0;
7247           newCell = reckonIncrement(delta, deltaAmount, currentValue, columnFamily, now,
7248             (Increment)mutation);
7249           break;
7250         case APPEND:
7251           mutationType = MutationType.APPEND;
7252           // Always apply Append. TODO: Does empty delta value mean reset Cell? It seems to.
7253           newCell = reckonAppend(delta, currentValue, now, (Append)mutation);
7254           break;
7255         default: throw new UnsupportedOperationException(op.toString());
7256       }
7257
7258       // Give coprocessors a chance to update the new cell
7259       if (coprocessorHost != null) {
7260         newCell =
7261             coprocessorHost.postMutationBeforeWAL(mutationType, mutation, currentValue, newCell);
7262       }
7263       // If apply, we need to update memstore/WAL with new value; add it toApply.
7264       if (apply) {
7265         toApply.add(newCell);
7266       }
7267       // Add to results to get returned to the Client. If null, client does not want results.
7268       if (results != null) {
7269         results.add(newCell);
7270       }
7271     }
7272     return toApply;
7273   }
7274
7275   /**
7276    * Calculate new Increment Cell.
7277    * @return New Increment Cell with delta applied to currentValue if currentValue is not null;
7278    *  otherwise, a new Cell with the delta set as its value.
7279    */
7280   private Cell reckonIncrement(final Cell delta, final long deltaAmount, final Cell currentValue,
7281       byte [] columnFamily, final long now, Mutation mutation)
7282   throws IOException {
7283     // Forward any tags found on the delta.
7284     List<Tag> tags = TagUtil.carryForwardTags(delta);
7285     long newValue = deltaAmount;
7286     long ts = now;
7287     if (currentValue != null) {
7288       tags = TagUtil.carryForwardTags(tags, currentValue);
7289       ts = Math.max(now, currentValue.getTimestamp());
7290       newValue += getLongValue(currentValue);
7291     }
7292     // Now make up the new Cell. TODO: FIX. This is intimate knowledge of how KeyValues are made...
7293     // doesn't work well with offheaping or if we are doing a different Cell type.
7294     byte [] incrementAmountInBytes = Bytes.toBytes(newValue);
7295     tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL());
7296     byte [] row = mutation.getRow();
7297     return new KeyValue(row, 0, row.length,
7298       columnFamily, 0, columnFamily.length,
7299       delta.getQualifierArray(), delta.getQualifierOffset(), delta.getQualifierLength(),
7300       ts, KeyValue.Type.Put,
7301       incrementAmountInBytes, 0, incrementAmountInBytes.length,
7302       tags);
7303   }
7304
7305   private Cell reckonAppend(final Cell delta, final Cell currentValue, final long now,
7306       Append mutation)
7307   throws IOException {
7308     // Forward any tags found on the delta.
7309     List<Tag> tags = TagUtil.carryForwardTags(delta);
7310     long ts = now;
7311     Cell newCell = null;
7312     byte [] row = mutation.getRow();
7313     if (currentValue != null) {
7314       tags = TagUtil.carryForwardTags(tags, currentValue);
7315       ts = Math.max(now, currentValue.getTimestamp());
7316       tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL());
7317       byte[] tagBytes = TagUtil.fromList(tags);
7318       // Allocate an empty cell and copy in all parts.
7319       // TODO: This is intimate knowledge of how a KeyValue is made. Undo!!! Prevents our doing
7320       // other Cell types. Copying on-heap too if an off-heap Cell.
7321       newCell = new KeyValue(row.length, delta.getFamilyLength(),
7322         delta.getQualifierLength(), ts, KeyValue.Type.Put,
7323         delta.getValueLength() + currentValue.getValueLength(),
7324         tagBytes == null? 0: tagBytes.length);
7325       // Copy in row, family, and qualifier
7326       System.arraycopy(row, 0, newCell.getRowArray(), newCell.getRowOffset(), row.length);
7327       System.arraycopy(delta.getFamilyArray(), delta.getFamilyOffset(),
7328           newCell.getFamilyArray(), newCell.getFamilyOffset(), delta.getFamilyLength());
7329       System.arraycopy(delta.getQualifierArray(), delta.getQualifierOffset(),
7330           newCell.getQualifierArray(), newCell.getQualifierOffset(), delta.getQualifierLength());
7331       // Copy in the value
7332       CellUtil.copyValueTo(currentValue, newCell.getValueArray(), newCell.getValueOffset());
7333       System.arraycopy(delta.getValueArray(), delta.getValueOffset(),
7334           newCell.getValueArray(), newCell.getValueOffset() + currentValue.getValueLength(),
7335           delta.getValueLength());
7336       // Copy in tag data
7337       if (tagBytes != null) {
7338         System.arraycopy(tagBytes, 0,
7339             newCell.getTagsArray(), newCell.getTagsOffset(), tagBytes.length);
7340       }
7341     } else {
7342       // Append's KeyValue.Type==Put and ts==HConstants.LATEST_TIMESTAMP
7343       CellUtil.updateLatestStamp(delta, now);
7344       newCell = delta;
7345       tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL());
7346       if (tags != null) {
7347         newCell = new TagRewriteCell(delta, TagUtil.fromList(tags));
7348       }
7349     }
7350     return newCell;
7351   }
7352
7353   /**
7354    * @return Get the long out of the passed in Cell
7355    */
7356   private static long getLongValue(final Cell cell) throws DoNotRetryIOException {
7357     int len = cell.getValueLength();
7358     if (len != Bytes.SIZEOF_LONG) {
7359       // throw DoNotRetryIOException instead of IllegalArgumentException
7360       throw new DoNotRetryIOException("Field is not a long, it's " + len + " bytes wide");
7361     }
7362     return Bytes.toLong(cell.getValueArray(), cell.getValueOffset(), len);
7363   }
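       // In practice this means any column that is going to be incremented must have been
       // written as an 8-byte encoded long, e.g. (names are placeholders):
       //
       //   put.addColumn(fam, qual, Bytes.toBytes(1L));   // 8 bytes: can be incremented
       //   put.addColumn(fam, qual, Bytes.toBytes("1"));  // 1 byte: getLongValue would throw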
7364
7365   /**
7366    * Do a specific Get on passed <code>columnFamily</code> and column qualifiers.
7367    * @param mutation Mutation we are doing this Get for.
7368    * @param store Which column family on row (TODO: Go all Gets in one go)
7369    * @param coordinates Cells from <code>mutation</code> used as coordinates applied to Get.
7370    * @return Return list of Cells found.
7371    */
7372   private List<Cell> get(final Mutation mutation, final Store store,
7373           final List<Cell> coordinates, final IsolationLevel isolation, final TimeRange tr)
7374   throws IOException {
7375     // Sort the cells so that they match the order that they appear in the Get results. Otherwise,
7376     // we won't be able to find the existing values if the cells are not specified in order by the
7377     // client since cells are in an array list.
7378     // TODO: I don't get why we are sorting. St.Ack 20150107
7379     sort(coordinates, store.getComparator());
7380     Get get = new Get(mutation.getRow());
7381     if (isolation != null) {
7382       get.setIsolationLevel(isolation);
7383     }
7384     for (Cell cell: coordinates) {
7385       get.addColumn(store.getFamily().getName(), CellUtil.cloneQualifier(cell));
7386     }
7387     // Increments carry time range. If an Increment instance, put it on the Get.
7388     if (tr != null) {
7389       get.setTimeRange(tr.getMin(), tr.getMax());
7390     }
7391     return get(get, false);
7392   }
7393
7394   /**
7395    * @return Sorted list of <code>cells</code> using <code>comparator</code>
7396    */
7397   private static List<Cell> sort(List<Cell> cells, final Comparator<Cell> comparator) {
7398     Collections.sort(cells, comparator);
7399     return cells;
7400   }
7401
7402   //
7403   // New HBASE-880 Helpers
7404   //
7405
7406   void checkFamily(final byte [] family)
7407   throws NoSuchColumnFamilyException {
7408     if (!this.htableDescriptor.hasFamily(family)) {
7409       throw new NoSuchColumnFamilyException("Column family " +
7410           Bytes.toString(family) + " does not exist in region " + this
7411           + " in table " + this.htableDescriptor);
7412     }
7413   }
7414
7415   public static final long FIXED_OVERHEAD = ClassSize.align(
7416       ClassSize.OBJECT +
7417       ClassSize.ARRAY +
7418       47 * ClassSize.REFERENCE + 2 * Bytes.SIZEOF_INT +
7419       (14 * Bytes.SIZEOF_LONG) +
7420       5 * Bytes.SIZEOF_BOOLEAN);
7421
7422   // woefully out of date - currently missing:
7423   // 1 x HashMap - coprocessorServiceHandlers
7424   // 6 x Counter - numMutationsWithoutWAL, dataInMemoryWithoutWAL,
7425   //   checkAndMutateChecksPassed, checkAndMutateChecksFailed, readRequestsCount,
7426   //   writeRequestsCount
7427   // 1 x HRegion$WriteState - writestate
7428   // 1 x RegionCoprocessorHost - coprocessorHost
7429   // 1 x RegionSplitPolicy - splitPolicy
7430   // 1 x MetricsRegion - metricsRegion
7431   // 1 x MetricsRegionWrapperImpl - metricsRegionWrapper
7432   public static final long DEEP_OVERHEAD = FIXED_OVERHEAD +
7433       ClassSize.OBJECT + // closeLock
7434       (2 * ClassSize.ATOMIC_BOOLEAN) + // closed, closing
7435       (3 * ClassSize.ATOMIC_LONG) + // memStoreSize, numPutsWithoutWAL, dataInMemoryWithoutWAL
7436       (2 * ClassSize.CONCURRENT_HASHMAP) +  // lockedRows, scannerReadPoints
7437       WriteState.HEAP_SIZE + // writestate
7438       ClassSize.CONCURRENT_SKIPLISTMAP + ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + // stores
7439       (2 * ClassSize.REENTRANT_LOCK) + // lock, updatesLock
7440       MultiVersionConcurrencyControl.FIXED_SIZE // mvcc
7441       + 2 * ClassSize.TREEMAP // maxSeqIdInStores, replicationScopes
7442       + 2 * ClassSize.ATOMIC_INTEGER // majorInProgress, minorInProgress
7443       + ClassSize.STORE_SERVICES // store services
7444       ;
7445
7446   @Override
7447   public long heapSize() {
7448     long heapSize = DEEP_OVERHEAD;
7449     for (Store store : this.stores.values()) {
7450       heapSize += store.heapSize();
7451     }
7452     // this does not take into account row locks, recent flushes, mvcc entries, and more
7453     return heapSize;
7454   }
7455
7456   @Override
7457   public boolean registerService(Service instance) {
7458     /*
7459      * No stacking of instances is allowed for a single service name
7460      */
7461     Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType();
7462     String serviceName = CoprocessorRpcUtils.getServiceName(serviceDesc);
7463     if (coprocessorServiceHandlers.containsKey(serviceName)) {
7464       LOG.error("Coprocessor service " + serviceName +
7465               " already registered, rejecting request from " + instance
7466       );
7467       return false;
7468     }
7469
7470     coprocessorServiceHandlers.put(serviceName, instance);
7471     if (LOG.isDebugEnabled()) {
7472       LOG.debug("Registered coprocessor service: region=" +
7473           Bytes.toStringBinary(getRegionInfo().getRegionName()) +
7474           " service=" + serviceName);
7475     }
7476     return true;
7477   }
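       // Roughly how an endpoint gets wired up: the coprocessor host takes each loaded
       // coprocessor that implements CoprocessorService and registers its Service here
       // (a sketch, not the exact host code; cp is a placeholder):
       //
       //   Service endpoint = cp.getService();
       //   region.registerService(endpoint);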
7478
7479   @Override
7480   public Message execService(RpcController controller, CoprocessorServiceCall call)
7481       throws IOException {
7482     String serviceName = call.getServiceName();
7483     String methodName = call.getMethodName();
7484     if (!coprocessorServiceHandlers.containsKey(serviceName)) {
7485       throw new UnknownProtocolException(null,
7486           "No registered coprocessor service found for name "+serviceName+
7487           " in region "+Bytes.toStringBinary(getRegionInfo().getRegionName()));
7488     }
7489
7490     Service service = coprocessorServiceHandlers.get(serviceName);
7491     Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType();
7492     Descriptors.MethodDescriptor methodDesc = serviceDesc.findMethodByName(methodName);
7493     if (methodDesc == null) {
7494       throw new UnknownProtocolException(service.getClass(),
7495           "Unknown method "+methodName+" called on service "+serviceName+
7496               " in region "+Bytes.toStringBinary(getRegionInfo().getRegionName()));
7497     }
7498
7499     Message.Builder builder = service.getRequestPrototype(methodDesc).newBuilderForType();
7500     ProtobufUtil.mergeFrom(builder, call.getRequest());
7501     Message request = builder.build();
7502
7503     if (coprocessorHost != null) {
7504       request = coprocessorHost.preEndpointInvocation(service, methodName, request);
7505     }
7506
7507     final Message.Builder responseBuilder =
7508         service.getResponsePrototype(methodDesc).newBuilderForType();
7509     service.callMethod(methodDesc, controller, request, new RpcCallback<Message>() {
7510       @Override
7511       public void run(Message message) {
7512         if (message != null) {
7513           responseBuilder.mergeFrom(message);
7514         }
7515       }
7516     });
7517
7518     if (coprocessorHost != null) {
7519       coprocessorHost.postEndpointInvocation(service, methodName, request, responseBuilder);
7520     }
7521
7522     IOException exception = ResponseConverter.getControllerException(controller);
7523     if (exception != null) {
7524       throw exception;
7525     }
7526
7527     return responseBuilder.build();
7528   }
7529
7530   boolean shouldForceSplit() {
7531     return this.splitRequest;
7532   }
7533
7534   byte[] getExplicitSplitPoint() {
7535     return this.explicitSplitPoint;
7536   }
7537
7538   void forceSplit(byte[] sp) {
7539     // This HRegion will go away after the forced split is successful
7540     // But if a forced split fails, we need to clear forced split.
7541     this.splitRequest = true;
7542     if (sp != null) {
7543       this.explicitSplitPoint = sp;
7544     }
7545   }
7546
7547   void clearSplit() {
7548     this.splitRequest = false;
7549     this.explicitSplitPoint = null;
7550   }
7551
7552   /**
7553    * Give the region a chance to prepare before it is split.
7554    */
7555   protected void prepareToSplit() {
7556     // nothing
7557   }
7558
7559   /**
7560    * Return the split point. A null return value indicates the region isn't splittable.
7561    * If the split point isn't explicitly specified, it will go over the stores
7562    * to find the best split point. Currently the criterion for the best split point
7563    * is based on the size of the store.
7564    */
7565   public byte[] checkSplit() {
7566     // Can't split META
7567     if (this.getRegionInfo().isMetaTable() ||
7568         TableName.NAMESPACE_TABLE_NAME.equals(this.getRegionInfo().getTable())) {
7569       if (shouldForceSplit()) {
7570         LOG.warn("Cannot split meta region in HBase 0.20 and above");
7571       }
7572       return null;
7573     }
7574
7575     // Can't split region which is in recovering state
7576     if (this.isRecovering()) {
7577       LOG.info("Cannot split region " + this.getRegionInfo().getEncodedName() + " in recovery.");
7578       return null;
7579     }
7580
7581     if (!splitPolicy.shouldSplit()) {
7582       return null;
7583     }
7584
7585     byte[] ret = splitPolicy.getSplitPoint();
7586
7587     if (ret != null) {
7588       try {
7589         checkRow(ret, "calculated split");
7590       } catch (IOException e) {
7591         LOG.error("Ignoring invalid split", e);
7592         return null;
7593       }
7594     }
7595     return ret;
7596   }
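       // A rough sketch of how a caller might use the computed split point (the actual
       // split is requested elsewhere, e.g. via the region server's compact/split machinery):
       //
       //   byte[] splitPoint = region.checkSplit();
       //   if (splitPoint != null) {
       //     // region is splittable at splitPoint
       //   }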
7597
7598   /**
7599    * @return The priority that this region should have in the compaction queue
7600    */
7601   public int getCompactPriority() {
7602     int count = Integer.MAX_VALUE;
7603     for (Store store : stores.values()) {
7604       count = Math.min(count, store.getCompactPriority());
7605     }
7606     return count;
7607   }
7608
7609
7610   /** @return the coprocessor host */
7611   @Override
7612   public RegionCoprocessorHost getCoprocessorHost() {
7613     return coprocessorHost;
7614   }
7615
7616   /** @param coprocessorHost the new coprocessor host */
7617   public void setCoprocessorHost(final RegionCoprocessorHost coprocessorHost) {
7618     this.coprocessorHost = coprocessorHost;
7619   }
7620
7621   @Override
7622   public void startRegionOperation() throws IOException {
7623     startRegionOperation(Operation.ANY);
7624   }
7625
7626   @Override
7627   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="SF_SWITCH_FALLTHROUGH",
7628     justification="Intentional")
7629   public void startRegionOperation(Operation op) throws IOException {
7630     switch (op) {
7631       case GET:  // read operations
7632       case SCAN:
7633         checkReadsEnabled();
7634       case INCREMENT: // write operations
7635       case APPEND:
7636       case SPLIT_REGION:
7637       case MERGE_REGION:
7638       case PUT:
7639       case DELETE:
7640       case BATCH_MUTATE:
7641       case COMPACT_REGION:
7642         // when a region is in recovering state, no read, split or merge is allowed
7643         if (isRecovering() && (this.disallowWritesInRecovering ||
7644               (op != Operation.PUT && op != Operation.DELETE && op != Operation.BATCH_MUTATE))) {
7645           throw new RegionInRecoveryException(getRegionInfo().getRegionNameAsString() +
7646             " is recovering; cannot take reads");
7647         }
7648         break;
7649       default:
7650         break;
7651     }
7652     if (op == Operation.MERGE_REGION || op == Operation.SPLIT_REGION
7653         || op == Operation.COMPACT_REGION) {
7654       // split, merge or compact region doesn't need to check the closing/closed state or lock the
7655       // region
7656       return;
7657     }
7658     if (this.closing.get()) {
7659       throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closing");
7660     }
7661     lock(lock.readLock());
7662     if (this.closed.get()) {
7663       lock.readLock().unlock();
7664       throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closed");
7665     }
7666     try {
7667       if (coprocessorHost != null) {
7668         coprocessorHost.postStartRegionOperation(op);
7669       }
7670     } catch (Exception e) {
7671       lock.readLock().unlock();
7672       throw new IOException(e);
7673     }
7674   }
7675
7676   @Override
7677   public void closeRegionOperation() throws IOException {
7678     closeRegionOperation(Operation.ANY);
7679   }
7680
7681   /**
7682    * Closes the region operation lock. This needs to be called in the finally block
7683    * corresponding to the try block of {@link #startRegionOperation(Operation)}.
7684    * @throws IOException
7685    */
7686   public void closeRegionOperation(Operation operation) throws IOException {
7687     lock.readLock().unlock();
7688     if (coprocessorHost != null) {
7689       coprocessorHost.postCloseRegionOperation(operation);
7690     }
7691   }
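       // The intended calling pattern, per the javadoc above (a sketch):
       //
       //   region.startRegionOperation(Operation.GET);
       //   try {
       //     // ... read from the region ...
       //   } finally {
       //     region.closeRegionOperation(Operation.GET);
       //   }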
7692
7693   /**
7694    * This method needs to be called before any public call that reads or
7695    * modifies stores in bulk. It has to be called just before a try.
7696    * #closeBulkRegionOperation needs to be called in the try's finally block