001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.regionserver; 019 020import static org.apache.hadoop.hbase.HConstants.REPLICATION_SCOPE_LOCAL; 021import static org.apache.hadoop.hbase.regionserver.HStoreFile.MAJOR_COMPACTION_KEY; 022import static org.apache.hadoop.hbase.trace.HBaseSemanticAttributes.REGION_NAMES_KEY; 023import static org.apache.hadoop.hbase.trace.HBaseSemanticAttributes.ROW_LOCK_READ_LOCK_KEY; 024import static org.apache.hadoop.hbase.util.ConcurrentMapUtils.computeIfAbsent; 025 026import com.google.errorprone.annotations.RestrictedApi; 027import edu.umd.cs.findbugs.annotations.Nullable; 028import io.opentelemetry.api.trace.Span; 029import java.io.EOFException; 030import java.io.FileNotFoundException; 031import java.io.IOException; 032import java.io.InterruptedIOException; 033import java.lang.reflect.Constructor; 034import java.nio.ByteBuffer; 035import java.nio.charset.StandardCharsets; 036import java.text.ParseException; 037import java.util.ArrayList; 038import java.util.Arrays; 039import java.util.Collection; 040import java.util.Collections; 041import java.util.HashMap; 042import java.util.HashSet; 043import java.util.Iterator; 044import java.util.List; 045import java.util.Map; 046import java.util.Map.Entry; 047import java.util.NavigableMap; 048import java.util.NavigableSet; 049import java.util.Objects; 050import java.util.Optional; 051import java.util.RandomAccess; 052import java.util.Set; 053import java.util.TreeMap; 054import java.util.UUID; 055import java.util.concurrent.Callable; 056import java.util.concurrent.CompletionService; 057import java.util.concurrent.ConcurrentHashMap; 058import java.util.concurrent.ConcurrentMap; 059import java.util.concurrent.ConcurrentSkipListMap; 060import java.util.concurrent.ExecutionException; 061import java.util.concurrent.ExecutorCompletionService; 062import java.util.concurrent.Future; 063import java.util.concurrent.ThreadFactory; 064import java.util.concurrent.ThreadPoolExecutor; 065import java.util.concurrent.TimeUnit; 066import java.util.concurrent.atomic.AtomicBoolean; 067import java.util.concurrent.atomic.AtomicInteger; 068import java.util.concurrent.atomic.LongAdder; 069import java.util.concurrent.locks.Lock; 070import java.util.concurrent.locks.ReadWriteLock; 071import java.util.concurrent.locks.ReentrantLock; 072import java.util.concurrent.locks.ReentrantReadWriteLock; 073import java.util.function.Function; 074import java.util.stream.Collectors; 075import java.util.stream.Stream; 076import org.apache.hadoop.conf.Configuration; 077import org.apache.hadoop.fs.FileStatus; 078import org.apache.hadoop.fs.FileSystem; 079import org.apache.hadoop.fs.LocatedFileStatus; 080import org.apache.hadoop.fs.Path; 081import org.apache.hadoop.hbase.Cell; 082import org.apache.hadoop.hbase.CellBuilderType; 083import org.apache.hadoop.hbase.CellComparator; 084import org.apache.hadoop.hbase.CellComparatorImpl; 085import org.apache.hadoop.hbase.CellScanner; 086import org.apache.hadoop.hbase.CellUtil; 087import org.apache.hadoop.hbase.CompareOperator; 088import org.apache.hadoop.hbase.CompoundConfiguration; 089import org.apache.hadoop.hbase.DoNotRetryIOException; 090import org.apache.hadoop.hbase.DroppedSnapshotException; 091import org.apache.hadoop.hbase.ExtendedCellBuilderFactory; 092import org.apache.hadoop.hbase.HConstants; 093import org.apache.hadoop.hbase.HConstants.OperationStatusCode; 094import org.apache.hadoop.hbase.HDFSBlocksDistribution; 095import org.apache.hadoop.hbase.KeyValue; 096import org.apache.hadoop.hbase.MetaCellComparator; 097import org.apache.hadoop.hbase.NamespaceDescriptor; 098import org.apache.hadoop.hbase.NotServingRegionException; 099import org.apache.hadoop.hbase.PrivateCellUtil; 100import org.apache.hadoop.hbase.RegionTooBusyException; 101import org.apache.hadoop.hbase.Tag; 102import org.apache.hadoop.hbase.TagUtil; 103import org.apache.hadoop.hbase.client.Append; 104import org.apache.hadoop.hbase.client.CheckAndMutate; 105import org.apache.hadoop.hbase.client.CheckAndMutateResult; 106import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor; 107import org.apache.hadoop.hbase.client.CompactionState; 108import org.apache.hadoop.hbase.client.Delete; 109import org.apache.hadoop.hbase.client.Durability; 110import org.apache.hadoop.hbase.client.Get; 111import org.apache.hadoop.hbase.client.Increment; 112import org.apache.hadoop.hbase.client.IsolationLevel; 113import org.apache.hadoop.hbase.client.Mutation; 114import org.apache.hadoop.hbase.client.Put; 115import org.apache.hadoop.hbase.client.RegionInfo; 116import org.apache.hadoop.hbase.client.RegionReplicaUtil; 117import org.apache.hadoop.hbase.client.Result; 118import org.apache.hadoop.hbase.client.Row; 119import org.apache.hadoop.hbase.client.RowMutations; 120import org.apache.hadoop.hbase.client.Scan; 121import org.apache.hadoop.hbase.client.TableDescriptor; 122import org.apache.hadoop.hbase.client.TableDescriptorBuilder; 123import org.apache.hadoop.hbase.conf.ConfigurationManager; 124import org.apache.hadoop.hbase.conf.PropagatingConfigurationObserver; 125import org.apache.hadoop.hbase.coprocessor.CoprocessorHost; 126import org.apache.hadoop.hbase.coprocessor.ReadOnlyConfiguration; 127import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare; 128import org.apache.hadoop.hbase.exceptions.FailedSanityCheckException; 129import org.apache.hadoop.hbase.exceptions.TimeoutIOException; 130import org.apache.hadoop.hbase.exceptions.UnknownProtocolException; 131import org.apache.hadoop.hbase.filter.BinaryComparator; 132import org.apache.hadoop.hbase.filter.ByteArrayComparable; 133import org.apache.hadoop.hbase.filter.Filter; 134import org.apache.hadoop.hbase.io.HFileLink; 135import org.apache.hadoop.hbase.io.HeapSize; 136import org.apache.hadoop.hbase.io.TimeRange; 137import org.apache.hadoop.hbase.io.hfile.BlockCache; 138import org.apache.hadoop.hbase.io.hfile.CombinedBlockCache; 139import org.apache.hadoop.hbase.io.hfile.HFile; 140import org.apache.hadoop.hbase.io.hfile.bucket.BucketCache; 141import org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils; 142import org.apache.hadoop.hbase.ipc.RpcCall; 143import org.apache.hadoop.hbase.ipc.RpcServer; 144import org.apache.hadoop.hbase.ipc.ServerCall; 145import org.apache.hadoop.hbase.mob.MobFileCache; 146import org.apache.hadoop.hbase.monitoring.MonitoredTask; 147import org.apache.hadoop.hbase.monitoring.TaskMonitor; 148import org.apache.hadoop.hbase.quotas.RegionServerSpaceQuotaManager; 149import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl.WriteEntry; 150import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext; 151import org.apache.hadoop.hbase.regionserver.compactions.CompactionLifeCycleTracker; 152import org.apache.hadoop.hbase.regionserver.compactions.ForbidMajorCompactionChecker; 153import org.apache.hadoop.hbase.regionserver.metrics.MetricsTableRequests; 154import org.apache.hadoop.hbase.regionserver.regionreplication.RegionReplicationSink; 155import org.apache.hadoop.hbase.regionserver.throttle.CompactionThroughputControllerFactory; 156import org.apache.hadoop.hbase.regionserver.throttle.NoLimitThroughputController; 157import org.apache.hadoop.hbase.regionserver.throttle.StoreHotnessProtector; 158import org.apache.hadoop.hbase.regionserver.throttle.ThroughputController; 159import org.apache.hadoop.hbase.regionserver.wal.WALSyncTimeoutIOException; 160import org.apache.hadoop.hbase.regionserver.wal.WALUtil; 161import org.apache.hadoop.hbase.replication.ReplicationUtils; 162import org.apache.hadoop.hbase.replication.regionserver.ReplicationObserver; 163import org.apache.hadoop.hbase.security.User; 164import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils; 165import org.apache.hadoop.hbase.snapshot.SnapshotManifest; 166import org.apache.hadoop.hbase.trace.TraceUtil; 167import org.apache.hadoop.hbase.util.Bytes; 168import org.apache.hadoop.hbase.util.CancelableProgressable; 169import org.apache.hadoop.hbase.util.ClassSize; 170import org.apache.hadoop.hbase.util.CommonFSUtils; 171import org.apache.hadoop.hbase.util.CoprocessorConfigurationUtil; 172import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; 173import org.apache.hadoop.hbase.util.FSUtils; 174import org.apache.hadoop.hbase.util.HashedBytes; 175import org.apache.hadoop.hbase.util.NonceKey; 176import org.apache.hadoop.hbase.util.Pair; 177import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil; 178import org.apache.hadoop.hbase.util.TableDescriptorChecker; 179import org.apache.hadoop.hbase.util.Threads; 180import org.apache.hadoop.hbase.wal.WAL; 181import org.apache.hadoop.hbase.wal.WALEdit; 182import org.apache.hadoop.hbase.wal.WALFactory; 183import org.apache.hadoop.hbase.wal.WALKey; 184import org.apache.hadoop.hbase.wal.WALKeyImpl; 185import org.apache.hadoop.hbase.wal.WALSplitUtil; 186import org.apache.hadoop.hbase.wal.WALSplitUtil.MutationReplay; 187import org.apache.hadoop.hbase.wal.WALStreamReader; 188import org.apache.hadoop.util.StringUtils; 189import org.apache.yetus.audience.InterfaceAudience; 190import org.slf4j.Logger; 191import org.slf4j.LoggerFactory; 192 193import org.apache.hbase.thirdparty.com.google.common.base.Preconditions; 194import org.apache.hbase.thirdparty.com.google.common.collect.Iterables; 195import org.apache.hbase.thirdparty.com.google.common.collect.Lists; 196import org.apache.hbase.thirdparty.com.google.common.collect.Maps; 197import org.apache.hbase.thirdparty.com.google.common.io.Closeables; 198import org.apache.hbase.thirdparty.com.google.protobuf.Descriptors.MethodDescriptor; 199import org.apache.hbase.thirdparty.com.google.protobuf.Descriptors.ServiceDescriptor; 200import org.apache.hbase.thirdparty.com.google.protobuf.Message; 201import org.apache.hbase.thirdparty.com.google.protobuf.RpcCallback; 202import org.apache.hbase.thirdparty.com.google.protobuf.RpcController; 203import org.apache.hbase.thirdparty.com.google.protobuf.Service; 204import org.apache.hbase.thirdparty.com.google.protobuf.TextFormat; 205import org.apache.hbase.thirdparty.com.google.protobuf.UnsafeByteOperations; 206import org.apache.hbase.thirdparty.org.apache.commons.collections4.CollectionUtils; 207 208import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; 209import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.WALEntry; 210import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos; 211import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.CoprocessorServiceCall; 212import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.RegionLoad; 213import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.StoreSequenceId; 214import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription; 215import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos; 216import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.CompactionDescriptor; 217import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor; 218import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor.FlushAction; 219import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor.StoreFlushDescriptor; 220import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.RegionEventDescriptor; 221import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.RegionEventDescriptor.EventType; 222import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.StoreDescriptor; 223 224/** 225 * Regions store data for a certain region of a table. It stores all columns for each row. A given 226 * table consists of one or more Regions. 227 * <p> 228 * An Region is defined by its table and its key extent. 229 * <p> 230 * Locking at the Region level serves only one purpose: preventing the region from being closed (and 231 * consequently split) while other operations are ongoing. Each row level operation obtains both a 232 * row lock and a region read lock for the duration of the operation. While a scanner is being 233 * constructed, getScanner holds a read lock. If the scanner is successfully constructed, it holds a 234 * read lock until it is closed. A close takes out a write lock and consequently will block for 235 * ongoing operations and will block new operations from starting while the close is in progress. 236 */ 237@SuppressWarnings("deprecation") 238@InterfaceAudience.Private 239public class HRegion implements HeapSize, PropagatingConfigurationObserver, Region { 240 private static final Logger LOG = LoggerFactory.getLogger(HRegion.class); 241 242 public static final String LOAD_CFS_ON_DEMAND_CONFIG_KEY = 243 "hbase.hregion.scan.loadColumnFamiliesOnDemand"; 244 245 public static final String HBASE_MAX_CELL_SIZE_KEY = "hbase.server.keyvalue.maxsize"; 246 public static final int DEFAULT_MAX_CELL_SIZE = 10485760; 247 248 public static final String HBASE_REGIONSERVER_MINIBATCH_SIZE = 249 "hbase.regionserver.minibatch.size"; 250 public static final int DEFAULT_HBASE_REGIONSERVER_MINIBATCH_SIZE = 20000; 251 252 public static final String WAL_HSYNC_CONF_KEY = "hbase.wal.hsync"; 253 public static final boolean DEFAULT_WAL_HSYNC = false; 254 255 /** Parameter name for compaction after bulkload */ 256 public static final String COMPACTION_AFTER_BULKLOAD_ENABLE = 257 "hbase.compaction.after.bulkload.enable"; 258 259 /** Config for allow split when file count greater than the configured blocking file count */ 260 public static final String SPLIT_IGNORE_BLOCKING_ENABLED_KEY = 261 "hbase.hregion.split.ignore.blocking.enabled"; 262 263 public static final String REGION_STORAGE_POLICY_KEY = "hbase.hregion.block.storage.policy"; 264 public static final String DEFAULT_REGION_STORAGE_POLICY = "NONE"; 265 266 /** 267 * This is for for using HRegion as a local storage, where we may put the recovered edits in a 268 * special place. Once this is set, we will only replay the recovered edits under this directory 269 * and ignore the original replay directory configs. 270 */ 271 public static final String SPECIAL_RECOVERED_EDITS_DIR = 272 "hbase.hregion.special.recovered.edits.dir"; 273 274 /** 275 * Mainly used for master local region, where we will replay the WAL file directly without 276 * splitting, so it is possible to have WAL files which are not closed cleanly, in this way, 277 * hitting EOF is expected so should not consider it as a critical problem. 278 */ 279 public static final String RECOVERED_EDITS_IGNORE_EOF = 280 "hbase.hregion.recovered.edits.ignore.eof"; 281 282 /** 283 * Whether to use {@link MetaCellComparator} even if we are not meta region. Used when creating 284 * master local region. 285 */ 286 public static final String USE_META_CELL_COMPARATOR = "hbase.region.use.meta.cell.comparator"; 287 288 public static final boolean DEFAULT_USE_META_CELL_COMPARATOR = false; 289 290 final AtomicBoolean closed = new AtomicBoolean(false); 291 292 /* 293 * Closing can take some time; use the closing flag if there is stuff we don't want to do while in 294 * closing state; e.g. like offer this region up to the master as a region to close if the 295 * carrying regionserver is overloaded. Once set, it is never cleared. 296 */ 297 final AtomicBoolean closing = new AtomicBoolean(false); 298 299 /** 300 * The max sequence id of flushed data on this region. There is no edit in memory that is less 301 * that this sequence id. 302 */ 303 private volatile long maxFlushedSeqId = HConstants.NO_SEQNUM; 304 305 /** 306 * Record the sequence id of last flush operation. Can be in advance of {@link #maxFlushedSeqId} 307 * when flushing a single column family. In this case, {@link #maxFlushedSeqId} will be older than 308 * the oldest edit in memory. 309 */ 310 private volatile long lastFlushOpSeqId = HConstants.NO_SEQNUM; 311 312 /** 313 * The sequence id of the last replayed open region event from the primary region. This is used to 314 * skip entries before this due to the possibility of replay edits coming out of order from 315 * replication. 316 */ 317 protected volatile long lastReplayedOpenRegionSeqId = -1L; 318 protected volatile long lastReplayedCompactionSeqId = -1L; 319 320 ////////////////////////////////////////////////////////////////////////////// 321 // Members 322 ////////////////////////////////////////////////////////////////////////////// 323 324 // map from a locked row to the context for that lock including: 325 // - CountDownLatch for threads waiting on that row 326 // - the thread that owns the lock (allow reentrancy) 327 // - reference count of (reentrant) locks held by the thread 328 // - the row itself 329 private final ConcurrentHashMap<HashedBytes, RowLockContext> lockedRows = 330 new ConcurrentHashMap<>(); 331 332 protected final Map<byte[], HStore> stores = 333 new ConcurrentSkipListMap<>(Bytes.BYTES_RAWCOMPARATOR); 334 335 // TODO: account for each registered handler in HeapSize computation 336 private Map<String, Service> coprocessorServiceHandlers = Maps.newHashMap(); 337 338 // Track data size in all memstores 339 private final MemStoreSizing memStoreSizing = new ThreadSafeMemStoreSizing(); 340 RegionServicesForStores regionServicesForStores; 341 342 // Debug possible data loss due to WAL off 343 final LongAdder numMutationsWithoutWAL = new LongAdder(); 344 final LongAdder dataInMemoryWithoutWAL = new LongAdder(); 345 346 // Debug why CAS operations are taking a while. 347 final LongAdder checkAndMutateChecksPassed = new LongAdder(); 348 final LongAdder checkAndMutateChecksFailed = new LongAdder(); 349 350 // Number of requests 351 // Count rows for scan 352 final LongAdder readRequestsCount = new LongAdder(); 353 final LongAdder cpRequestsCount = new LongAdder(); 354 final LongAdder filteredReadRequestsCount = new LongAdder(); 355 // Count rows for multi row mutations 356 final LongAdder writeRequestsCount = new LongAdder(); 357 358 // Number of requests blocked by memstore size. 359 private final LongAdder blockedRequestsCount = new LongAdder(); 360 361 // Compaction LongAdders 362 final LongAdder compactionsFinished = new LongAdder(); 363 final LongAdder compactionsFailed = new LongAdder(); 364 final LongAdder compactionNumFilesCompacted = new LongAdder(); 365 final LongAdder compactionNumBytesCompacted = new LongAdder(); 366 final LongAdder compactionsQueued = new LongAdder(); 367 final LongAdder flushesQueued = new LongAdder(); 368 369 private BlockCache blockCache; 370 private MobFileCache mobFileCache; 371 private final WAL wal; 372 private final HRegionFileSystem fs; 373 protected final Configuration conf; 374 private final Configuration baseConf; 375 private final int rowLockWaitDuration; 376 static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000; 377 378 private Path regionWalDir; 379 private FileSystem walFS; 380 381 // set to true if the region is restored from snapshot for reading by ClientSideRegionScanner 382 private boolean isRestoredRegion = false; 383 384 public void setRestoredRegion(boolean restoredRegion) { 385 isRestoredRegion = restoredRegion; 386 } 387 388 public MetricsTableRequests getMetricsTableRequests() { 389 return metricsTableRequests; 390 } 391 392 // Handle table latency metrics 393 private MetricsTableRequests metricsTableRequests; 394 395 // The internal wait duration to acquire a lock before read/update 396 // from the region. It is not per row. The purpose of this wait time 397 // is to avoid waiting a long time while the region is busy, so that 398 // we can release the IPC handler soon enough to improve the 399 // availability of the region server. It can be adjusted by 400 // tuning configuration "hbase.busy.wait.duration". 401 final long busyWaitDuration; 402 static final long DEFAULT_BUSY_WAIT_DURATION = HConstants.DEFAULT_HBASE_RPC_TIMEOUT; 403 404 // If updating multiple rows in one call, wait longer, 405 // i.e. waiting for busyWaitDuration * # of rows. However, 406 // we can limit the max multiplier. 407 final int maxBusyWaitMultiplier; 408 409 // Max busy wait duration. There is no point to wait longer than the RPC 410 // purge timeout, when a RPC call will be terminated by the RPC engine. 411 final long maxBusyWaitDuration; 412 413 // Max cell size. If nonzero, the maximum allowed size for any given cell 414 // in bytes 415 final long maxCellSize; 416 417 // Number of mutations for minibatch processing. 418 private final int miniBatchSize; 419 420 final ConcurrentHashMap<RegionScanner, Long> scannerReadPoints; 421 final ReadPointCalculationLock smallestReadPointCalcLock; 422 423 /** 424 * The sequence ID that was enLongAddered when this region was opened. 425 */ 426 private long openSeqNum = HConstants.NO_SEQNUM; 427 428 /** 429 * The default setting for whether to enable on-demand CF loading for scan requests to this 430 * region. Requests can override it. 431 */ 432 private boolean isLoadingCfsOnDemandDefault = false; 433 434 private final AtomicInteger majorInProgress = new AtomicInteger(0); 435 private final AtomicInteger minorInProgress = new AtomicInteger(0); 436 437 // 438 // Context: During replay we want to ensure that we do not lose any data. So, we 439 // have to be conservative in how we replay wals. For each store, we calculate 440 // the maxSeqId up to which the store was flushed. And, skip the edits which 441 // are equal to or lower than maxSeqId for each store. 442 // The following map is populated when opening the region 443 Map<byte[], Long> maxSeqIdInStores = new TreeMap<>(Bytes.BYTES_COMPARATOR); 444 445 // lock used to protect the replay operation for secondary replicas, so the below two fields does 446 // not need to be volatile. 447 private Lock replayLock; 448 449 /** Saved state from replaying prepare flush cache */ 450 private PrepareFlushResult prepareFlushResult = null; 451 452 private long lastReplayedSequenceId = HConstants.NO_SEQNUM; 453 454 private volatile ConfigurationManager configurationManager; 455 456 // Used for testing. 457 private volatile Long timeoutForWriteLock = null; 458 459 private final CellComparator cellComparator; 460 461 private final int minBlockSizeBytes; 462 463 /** 464 * @return The smallest mvcc readPoint across all the scanners in this region. Writes older than 465 * this readPoint, are included in every read operation. 466 */ 467 public long getSmallestReadPoint() { 468 // We need to ensure that while we are calculating the smallestReadPoint 469 // no new RegionScanners can grab a readPoint that we are unaware of. 470 smallestReadPointCalcLock.lock(ReadPointCalculationLock.LockType.CALCULATION_LOCK); 471 try { 472 long minimumReadPoint = mvcc.getReadPoint(); 473 for (Long readPoint : this.scannerReadPoints.values()) { 474 minimumReadPoint = Math.min(minimumReadPoint, readPoint); 475 } 476 return minimumReadPoint; 477 } finally { 478 smallestReadPointCalcLock.unlock(ReadPointCalculationLock.LockType.CALCULATION_LOCK); 479 } 480 } 481 482 /* 483 * Data structure of write state flags used coordinating flushes, compactions and closes. 484 */ 485 static class WriteState { 486 // Set while a memstore flush is happening. 487 volatile boolean flushing = false; 488 // Set when a flush has been requested. 489 volatile boolean flushRequested = false; 490 // Number of compactions running. 491 AtomicInteger compacting = new AtomicInteger(0); 492 // Gets set in close. If set, cannot compact or flush again. 493 volatile boolean writesEnabled = true; 494 // Set if region is read-only 495 volatile boolean readOnly = false; 496 // whether the reads are enabled. This is different than readOnly, because readOnly is 497 // static in the lifetime of the region, while readsEnabled is dynamic 498 volatile boolean readsEnabled = true; 499 500 /** 501 * Set flags that make this region read-only. 502 * @param onOff flip value for region r/o setting 503 */ 504 synchronized void setReadOnly(final boolean onOff) { 505 this.writesEnabled = !onOff; 506 this.readOnly = onOff; 507 } 508 509 boolean isReadOnly() { 510 return this.readOnly; 511 } 512 513 boolean isFlushRequested() { 514 return this.flushRequested; 515 } 516 517 void setReadsEnabled(boolean readsEnabled) { 518 this.readsEnabled = readsEnabled; 519 } 520 521 static final long HEAP_SIZE = ClassSize.align(ClassSize.OBJECT + 5 * Bytes.SIZEOF_BOOLEAN); 522 } 523 524 /** 525 * Objects from this class are created when flushing to describe all the different states that 526 * that method ends up in. The Result enum describes those states. The sequence id should only be 527 * specified if the flush was successful, and the failure message should only be specified if it 528 * didn't flush. 529 */ 530 public static class FlushResultImpl implements FlushResult { 531 final Result result; 532 final String failureReason; 533 final long flushSequenceId; 534 final boolean wroteFlushWalMarker; 535 536 /** 537 * Convenience constructor to use when the flush is successful, the failure message is set to 538 * null. 539 * @param result Expecting FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED. 540 * @param flushSequenceId Generated sequence id that comes right after the edits in the 541 * memstores. 542 */ 543 FlushResultImpl(Result result, long flushSequenceId) { 544 this(result, flushSequenceId, null, false); 545 assert result == Result.FLUSHED_NO_COMPACTION_NEEDED 546 || result == Result.FLUSHED_COMPACTION_NEEDED; 547 } 548 549 /** 550 * Convenience constructor to use when we cannot flush. 551 * @param result Expecting CANNOT_FLUSH_MEMSTORE_EMPTY or CANNOT_FLUSH. 552 * @param failureReason Reason why we couldn't flush. 553 */ 554 FlushResultImpl(Result result, String failureReason, boolean wroteFlushMarker) { 555 this(result, -1, failureReason, wroteFlushMarker); 556 assert result == Result.CANNOT_FLUSH_MEMSTORE_EMPTY || result == Result.CANNOT_FLUSH; 557 } 558 559 /** 560 * Constructor with all the parameters. 561 * @param result Any of the Result. 562 * @param flushSequenceId Generated sequence id if the memstores were flushed else -1. 563 * @param failureReason Reason why we couldn't flush, or null. 564 */ 565 FlushResultImpl(Result result, long flushSequenceId, String failureReason, 566 boolean wroteFlushMarker) { 567 this.result = result; 568 this.flushSequenceId = flushSequenceId; 569 this.failureReason = failureReason; 570 this.wroteFlushWalMarker = wroteFlushMarker; 571 } 572 573 /** 574 * Convenience method, the equivalent of checking if result is FLUSHED_NO_COMPACTION_NEEDED or 575 * FLUSHED_NO_COMPACTION_NEEDED. 576 * @return true if the memstores were flushed, else false. 577 */ 578 @Override 579 public boolean isFlushSucceeded() { 580 return result == Result.FLUSHED_NO_COMPACTION_NEEDED 581 || result == Result.FLUSHED_COMPACTION_NEEDED; 582 } 583 584 /** 585 * Convenience method, the equivalent of checking if result is FLUSHED_COMPACTION_NEEDED. 586 * @return True if the flush requested a compaction, else false (doesn't even mean it flushed). 587 */ 588 @Override 589 public boolean isCompactionNeeded() { 590 return result == Result.FLUSHED_COMPACTION_NEEDED; 591 } 592 593 @Override 594 public String toString() { 595 return new StringBuilder().append("flush result:").append(result).append(", ") 596 .append("failureReason:").append(failureReason).append(",").append("flush seq id") 597 .append(flushSequenceId).toString(); 598 } 599 600 @Override 601 public Result getResult() { 602 return result; 603 } 604 } 605 606 /** A result object from prepare flush cache stage */ 607 protected static class PrepareFlushResult { 608 final FlushResultImpl result; // indicating a failure result from prepare 609 final TreeMap<byte[], StoreFlushContext> storeFlushCtxs; 610 final TreeMap<byte[], List<Path>> committedFiles; 611 final TreeMap<byte[], MemStoreSize> storeFlushableSize; 612 final long startTime; 613 final long flushOpSeqId; 614 final long flushedSeqId; 615 final MemStoreSizing totalFlushableSize; 616 617 /** Constructs an early exit case */ 618 PrepareFlushResult(FlushResultImpl result, long flushSeqId) { 619 this(result, null, null, null, Math.max(0, flushSeqId), 0, 0, MemStoreSizing.DUD); 620 } 621 622 /** Constructs a successful prepare flush result */ 623 PrepareFlushResult(TreeMap<byte[], StoreFlushContext> storeFlushCtxs, 624 TreeMap<byte[], List<Path>> committedFiles, TreeMap<byte[], MemStoreSize> storeFlushableSize, 625 long startTime, long flushSeqId, long flushedSeqId, MemStoreSizing totalFlushableSize) { 626 this(null, storeFlushCtxs, committedFiles, storeFlushableSize, startTime, flushSeqId, 627 flushedSeqId, totalFlushableSize); 628 } 629 630 private PrepareFlushResult(FlushResultImpl result, 631 TreeMap<byte[], StoreFlushContext> storeFlushCtxs, TreeMap<byte[], List<Path>> committedFiles, 632 TreeMap<byte[], MemStoreSize> storeFlushableSize, long startTime, long flushSeqId, 633 long flushedSeqId, MemStoreSizing totalFlushableSize) { 634 this.result = result; 635 this.storeFlushCtxs = storeFlushCtxs; 636 this.committedFiles = committedFiles; 637 this.storeFlushableSize = storeFlushableSize; 638 this.startTime = startTime; 639 this.flushOpSeqId = flushSeqId; 640 this.flushedSeqId = flushedSeqId; 641 this.totalFlushableSize = totalFlushableSize; 642 } 643 644 public FlushResult getResult() { 645 return this.result; 646 } 647 } 648 649 /** 650 * A class that tracks exceptions that have been observed in one batch. Not thread safe. 651 */ 652 static class ObservedExceptionsInBatch { 653 private boolean wrongRegion = false; 654 private boolean failedSanityCheck = false; 655 private boolean wrongFamily = false; 656 657 /** Returns If a {@link WrongRegionException} has been observed. */ 658 boolean hasSeenWrongRegion() { 659 return wrongRegion; 660 } 661 662 /** 663 * Records that a {@link WrongRegionException} has been observed. 664 */ 665 void sawWrongRegion() { 666 wrongRegion = true; 667 } 668 669 /** Returns If a {@link FailedSanityCheckException} has been observed. */ 670 boolean hasSeenFailedSanityCheck() { 671 return failedSanityCheck; 672 } 673 674 /** 675 * Records that a {@link FailedSanityCheckException} has been observed. 676 */ 677 void sawFailedSanityCheck() { 678 failedSanityCheck = true; 679 } 680 681 /** Returns If a {@link NoSuchColumnFamilyException} has been observed. */ 682 boolean hasSeenNoSuchFamily() { 683 return wrongFamily; 684 } 685 686 /** 687 * Records that a {@link NoSuchColumnFamilyException} has been observed. 688 */ 689 void sawNoSuchFamily() { 690 wrongFamily = true; 691 } 692 } 693 694 final WriteState writestate = new WriteState(); 695 696 long memstoreFlushSize; 697 final long timestampSlop; 698 699 // Last flush time for each Store. Useful when we are flushing for each column 700 private final ConcurrentMap<HStore, Long> lastStoreFlushTimeMap = new ConcurrentHashMap<>(); 701 702 protected RegionServerServices rsServices; 703 private RegionServerAccounting rsAccounting; 704 private long flushCheckInterval; 705 // flushPerChanges is to prevent too many changes in memstore 706 private long flushPerChanges; 707 private long blockingMemStoreSize; 708 // Used to guard closes 709 final ReentrantReadWriteLock lock; 710 // Used to track interruptible holders of the region lock. Currently that is only RPC handler 711 // threads. Boolean value in map determines if lock holder can be interrupted, normally true, 712 // but may be false when thread is transiting a critical section. 713 final ConcurrentHashMap<Thread, Boolean> regionLockHolders; 714 715 // Stop updates lock 716 private final ReentrantReadWriteLock updatesLock = new ReentrantReadWriteLock(); 717 718 private final MultiVersionConcurrencyControl mvcc; 719 720 // Coprocessor host 721 private volatile RegionCoprocessorHost coprocessorHost; 722 723 private TableDescriptor htableDescriptor = null; 724 private RegionSplitPolicy splitPolicy; 725 private RegionSplitRestriction splitRestriction; 726 private FlushPolicy flushPolicy; 727 728 private final MetricsRegion metricsRegion; 729 private final MetricsRegionWrapperImpl metricsRegionWrapper; 730 private final Durability regionDurability; 731 private final boolean regionStatsEnabled; 732 // Stores the replication scope of the various column families of the table 733 // that has non-default scope 734 private final NavigableMap<byte[], Integer> replicationScope = 735 new TreeMap<>(Bytes.BYTES_COMPARATOR); 736 737 private final StoreHotnessProtector storeHotnessProtector; 738 739 protected Optional<RegionReplicationSink> regionReplicationSink = Optional.empty(); 740 741 /** 742 * HRegion constructor. This constructor should only be used for testing and extensions. Instances 743 * of HRegion should be instantiated with the {@link HRegion#createHRegion} or 744 * {@link HRegion#openHRegion} method. 745 * @param tableDir qualified path of directory where region should be located, usually the table 746 * directory. 747 * @param wal The WAL is the outbound log for any updates to the HRegion The wal file is a 748 * logfile from the previous execution that's custom-computed for this HRegion. 749 * The HRegionServer computes and sorts the appropriate wal info for this 750 * HRegion. If there is a previous wal file (implying that the HRegion has been 751 * written-to before), then read it from the supplied path. 752 * @param fs is the filesystem. 753 * @param confParam is global configuration settings. 754 * @param regionInfo - RegionInfo that describes the region is new), then read them from the 755 * supplied path. 756 * @param htd the table descriptor 757 * @param rsServices reference to {@link RegionServerServices} or null 758 * @deprecated Use other constructors. 759 */ 760 @Deprecated 761 public HRegion(final Path tableDir, final WAL wal, final FileSystem fs, 762 final Configuration confParam, final RegionInfo regionInfo, final TableDescriptor htd, 763 final RegionServerServices rsServices) { 764 this(new HRegionFileSystem(confParam, fs, tableDir, regionInfo), wal, confParam, htd, 765 rsServices); 766 } 767 768 /** 769 * HRegion constructor. This constructor should only be used for testing and extensions. Instances 770 * of HRegion should be instantiated with the {@link HRegion#createHRegion} or 771 * {@link HRegion#openHRegion} method. 772 * @param fs is the filesystem. 773 * @param wal The WAL is the outbound log for any updates to the HRegion The wal file is a 774 * logfile from the previous execution that's custom-computed for this HRegion. 775 * The HRegionServer computes and sorts the appropriate wal info for this 776 * HRegion. If there is a previous wal file (implying that the HRegion has been 777 * written-to before), then read it from the supplied path. 778 * @param confParam is global configuration settings. 779 * @param htd the table descriptor 780 * @param rsServices reference to {@link RegionServerServices} or null 781 */ 782 public HRegion(final HRegionFileSystem fs, final WAL wal, final Configuration confParam, 783 final TableDescriptor htd, final RegionServerServices rsServices) { 784 if (htd == null) { 785 throw new IllegalArgumentException("Need table descriptor"); 786 } 787 788 if (confParam instanceof CompoundConfiguration) { 789 throw new IllegalArgumentException("Need original base configuration"); 790 } 791 792 this.wal = wal; 793 this.fs = fs; 794 this.mvcc = new MultiVersionConcurrencyControl(getRegionInfo().getShortNameToLog()); 795 796 // 'conf' renamed to 'confParam' b/c we use this.conf in the constructor 797 this.baseConf = confParam; 798 this.conf = new CompoundConfiguration().add(confParam).addBytesMap(htd.getValues()); 799 this.cellComparator = htd.isMetaTable() 800 || conf.getBoolean(USE_META_CELL_COMPARATOR, DEFAULT_USE_META_CELL_COMPARATOR) 801 ? MetaCellComparator.META_COMPARATOR 802 : CellComparatorImpl.COMPARATOR; 803 this.lock = new ReentrantReadWriteLock( 804 conf.getBoolean(FAIR_REENTRANT_CLOSE_LOCK, DEFAULT_FAIR_REENTRANT_CLOSE_LOCK)); 805 this.regionLockHolders = new ConcurrentHashMap<>(); 806 this.flushCheckInterval = 807 conf.getInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL, DEFAULT_CACHE_FLUSH_INTERVAL); 808 this.flushPerChanges = conf.getLong(MEMSTORE_FLUSH_PER_CHANGES, DEFAULT_FLUSH_PER_CHANGES); 809 if (this.flushPerChanges > MAX_FLUSH_PER_CHANGES) { 810 throw new IllegalArgumentException( 811 MEMSTORE_FLUSH_PER_CHANGES + " can not exceed " + MAX_FLUSH_PER_CHANGES); 812 } 813 int tmpRowLockDuration = 814 conf.getInt("hbase.rowlock.wait.duration", DEFAULT_ROWLOCK_WAIT_DURATION); 815 if (tmpRowLockDuration <= 0) { 816 LOG.info("Found hbase.rowlock.wait.duration set to {}. values <= 0 will cause all row " 817 + "locking to fail. Treating it as 1ms to avoid region failure.", tmpRowLockDuration); 818 tmpRowLockDuration = 1; 819 } 820 this.rowLockWaitDuration = tmpRowLockDuration; 821 822 this.smallestReadPointCalcLock = new ReadPointCalculationLock(conf); 823 824 this.isLoadingCfsOnDemandDefault = conf.getBoolean(LOAD_CFS_ON_DEMAND_CONFIG_KEY, true); 825 this.htableDescriptor = htd; 826 Set<byte[]> families = this.htableDescriptor.getColumnFamilyNames(); 827 for (byte[] family : families) { 828 if (!replicationScope.containsKey(family)) { 829 int scope = htd.getColumnFamily(family).getScope(); 830 // Only store those families that has NON-DEFAULT scope 831 if (scope != REPLICATION_SCOPE_LOCAL) { 832 // Do a copy before storing it here. 833 replicationScope.put(Bytes.copy(family), scope); 834 } 835 } 836 } 837 838 this.rsServices = rsServices; 839 if (this.rsServices != null) { 840 this.blockCache = rsServices.getBlockCache().orElse(null); 841 this.mobFileCache = rsServices.getMobFileCache().orElse(null); 842 } 843 this.regionServicesForStores = new RegionServicesForStores(this, rsServices); 844 845 setHTableSpecificConf(); 846 this.scannerReadPoints = new ConcurrentHashMap<>(); 847 848 this.busyWaitDuration = conf.getLong("hbase.busy.wait.duration", DEFAULT_BUSY_WAIT_DURATION); 849 this.maxBusyWaitMultiplier = conf.getInt("hbase.busy.wait.multiplier.max", 2); 850 if (busyWaitDuration * maxBusyWaitMultiplier <= 0L) { 851 throw new IllegalArgumentException("Invalid hbase.busy.wait.duration (" + busyWaitDuration 852 + ") or hbase.busy.wait.multiplier.max (" + maxBusyWaitMultiplier 853 + "). Their product should be positive"); 854 } 855 this.maxBusyWaitDuration = 856 conf.getLong("hbase.ipc.client.call.purge.timeout", 2 * HConstants.DEFAULT_HBASE_RPC_TIMEOUT); 857 858 /* 859 * timestamp.slop provides a server-side constraint on the timestamp. This assumes that you base 860 * your TS around EnvironmentEdgeManager.currentTime(). In this case, throw an error to the user 861 * if the user-specified TS is newer than now + slop. LATEST_TIMESTAMP == don't use this 862 * functionality 863 */ 864 this.timestampSlop = 865 conf.getLong("hbase.hregion.keyvalue.timestamp.slop.millisecs", HConstants.LATEST_TIMESTAMP); 866 867 this.storeHotnessProtector = new StoreHotnessProtector(this, conf); 868 869 boolean forceSync = conf.getBoolean(WAL_HSYNC_CONF_KEY, DEFAULT_WAL_HSYNC); 870 /** 871 * This is the global default value for durability. All tables/mutations not defining a 872 * durability or using USE_DEFAULT will default to this value. 873 */ 874 Durability defaultDurability = forceSync ? Durability.FSYNC_WAL : Durability.SYNC_WAL; 875 this.regionDurability = this.htableDescriptor.getDurability() == Durability.USE_DEFAULT 876 ? defaultDurability 877 : this.htableDescriptor.getDurability(); 878 879 decorateRegionConfiguration(conf); 880 if (rsServices != null) { 881 this.rsAccounting = this.rsServices.getRegionServerAccounting(); 882 // don't initialize coprocessors if not running within a regionserver 883 // TODO: revisit if coprocessors should load in other cases 884 this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf); 885 this.metricsRegionWrapper = new MetricsRegionWrapperImpl(this); 886 this.metricsRegion = new MetricsRegion(this.metricsRegionWrapper, conf); 887 } else { 888 this.metricsRegionWrapper = null; 889 this.metricsRegion = null; 890 } 891 if (LOG.isDebugEnabled()) { 892 // Write out region name, its encoded name and storeHotnessProtector as string. 893 LOG.debug("Instantiated " + this + "; " + storeHotnessProtector.toString()); 894 } 895 896 configurationManager = null; 897 898 // disable stats tracking system tables, but check the config for everything else 899 this.regionStatsEnabled = htd.getTableName().getNamespaceAsString() 900 .equals(NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR) 901 ? false 902 : conf.getBoolean(HConstants.ENABLE_CLIENT_BACKPRESSURE, 903 HConstants.DEFAULT_ENABLE_CLIENT_BACKPRESSURE); 904 905 this.maxCellSize = conf.getLong(HBASE_MAX_CELL_SIZE_KEY, DEFAULT_MAX_CELL_SIZE); 906 this.miniBatchSize = 907 conf.getInt(HBASE_REGIONSERVER_MINIBATCH_SIZE, DEFAULT_HBASE_REGIONSERVER_MINIBATCH_SIZE); 908 909 // recover the metrics of read and write requests count if they were retained 910 if (rsServices != null && rsServices.getRegionServerAccounting() != null) { 911 Pair<Long, Long> retainedRWRequestsCnt = rsServices.getRegionServerAccounting() 912 .getRetainedRegionRWRequestsCnt().get(getRegionInfo().getEncodedName()); 913 if (retainedRWRequestsCnt != null) { 914 this.addReadRequestsCount(retainedRWRequestsCnt.getFirst()); 915 this.addWriteRequestsCount(retainedRWRequestsCnt.getSecond()); 916 // remove them since won't use again 917 rsServices.getRegionServerAccounting().getRetainedRegionRWRequestsCnt() 918 .remove(getRegionInfo().getEncodedName()); 919 } 920 } 921 922 minBlockSizeBytes = Arrays.stream(this.htableDescriptor.getColumnFamilies()) 923 .mapToInt(ColumnFamilyDescriptor::getBlocksize).min().orElse(HConstants.DEFAULT_BLOCKSIZE); 924 } 925 926 private void setHTableSpecificConf() { 927 if (this.htableDescriptor == null) { 928 return; 929 } 930 long flushSize = this.htableDescriptor.getMemStoreFlushSize(); 931 932 if (flushSize <= 0) { 933 flushSize = conf.getLong(HConstants.HREGION_MEMSTORE_FLUSH_SIZE, 934 TableDescriptorBuilder.DEFAULT_MEMSTORE_FLUSH_SIZE); 935 } 936 this.memstoreFlushSize = flushSize; 937 long mult = conf.getLong(HConstants.HREGION_MEMSTORE_BLOCK_MULTIPLIER, 938 HConstants.DEFAULT_HREGION_MEMSTORE_BLOCK_MULTIPLIER); 939 this.blockingMemStoreSize = this.memstoreFlushSize * mult; 940 } 941 942 /** 943 * Initialize this region. Used only by tests and SplitTransaction to reopen the region. You 944 * should use createHRegion() or openHRegion() 945 * @return What the next sequence (edit) id should be. 946 * @throws IOException e 947 * @deprecated use HRegion.createHRegion() or HRegion.openHRegion() 948 */ 949 @Deprecated 950 public long initialize() throws IOException { 951 return initialize(null); 952 } 953 954 /** 955 * Initialize this region. 956 * @param reporter Tickle every so often if initialize is taking a while. 957 * @return What the next sequence (edit) id should be. 958 */ 959 long initialize(final CancelableProgressable reporter) throws IOException { 960 961 // Refuse to open the region if there is no column family in the table 962 if (htableDescriptor.getColumnFamilyCount() == 0) { 963 throw new DoNotRetryIOException("Table " + htableDescriptor.getTableName().getNameAsString() 964 + " should have at least one column family."); 965 } 966 967 MonitoredTask status = 968 TaskMonitor.get().createStatus("Initializing region " + this, false, true); 969 long nextSeqId = -1; 970 try { 971 nextSeqId = initializeRegionInternals(reporter, status); 972 return nextSeqId; 973 } catch (IOException e) { 974 LOG.warn("Failed initialize of region= {}, starting to roll back memstore", 975 getRegionInfo().getRegionNameAsString(), e); 976 // global memstore size will be decreased when dropping memstore 977 try { 978 // drop the memory used by memstore if open region fails 979 dropMemStoreContents(); 980 } catch (IOException ioE) { 981 if (conf.getBoolean(MemStoreLAB.USEMSLAB_KEY, MemStoreLAB.USEMSLAB_DEFAULT)) { 982 LOG.warn( 983 "Failed drop memstore of region= {}, " 984 + "some chunks may not released forever since MSLAB is enabled", 985 getRegionInfo().getRegionNameAsString()); 986 } 987 988 } 989 if (metricsTableRequests != null) { 990 metricsTableRequests.removeRegistry(); 991 } 992 throw e; 993 } finally { 994 // nextSeqid will be -1 if the initialization fails. 995 // At least it will be 0 otherwise. 996 if (nextSeqId == -1) { 997 status.abort("Exception during region " + getRegionInfo().getRegionNameAsString() 998 + " initialization."); 999 } 1000 if (LOG.isDebugEnabled()) { 1001 LOG.debug("Region open journal for {}:\n{}", this.getRegionInfo().getEncodedName(), 1002 status.prettyPrintJournal()); 1003 } 1004 status.cleanup(); 1005 } 1006 } 1007 1008 private long initializeRegionInternals(final CancelableProgressable reporter, 1009 final MonitoredTask status) throws IOException { 1010 if (coprocessorHost != null) { 1011 status.setStatus("Running coprocessor pre-open hook"); 1012 coprocessorHost.preOpen(); 1013 } 1014 1015 String policyName = this.conf.get(REGION_STORAGE_POLICY_KEY, DEFAULT_REGION_STORAGE_POLICY); 1016 this.fs.setStoragePolicy(policyName.trim()); 1017 1018 // Write HRI to a file in case we need to recover hbase:meta 1019 // Only the primary replica should write .regioninfo 1020 if (this.getRegionInfo().getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID) { 1021 status.setStatus("Writing region info on filesystem"); 1022 fs.checkRegionInfoOnFilesystem(); 1023 } 1024 1025 // Initialize all the HStores 1026 status.setStatus("Initializing all the Stores"); 1027 long maxSeqId = initializeStores(reporter, status); 1028 this.mvcc.advanceTo(maxSeqId); 1029 if (!isRestoredRegion && ServerRegionReplicaUtil.shouldReplayRecoveredEdits(this)) { 1030 Collection<HStore> stores = this.stores.values(); 1031 try { 1032 // update the stores that we are replaying 1033 LOG.debug("replaying wal for " + this.getRegionInfo().getEncodedName()); 1034 stores.forEach(HStore::startReplayingFromWAL); 1035 // Recover any edits if available. 1036 maxSeqId = 1037 Math.max(maxSeqId, replayRecoveredEditsIfAny(maxSeqIdInStores, reporter, status)); 1038 // Recover any hfiles if available 1039 maxSeqId = Math.max(maxSeqId, loadRecoveredHFilesIfAny(stores)); 1040 // Make sure mvcc is up to max. 1041 this.mvcc.advanceTo(maxSeqId); 1042 } finally { 1043 LOG.debug("stopping wal replay for " + this.getRegionInfo().getEncodedName()); 1044 // update the stores that we are done replaying 1045 stores.forEach(HStore::stopReplayingFromWAL); 1046 } 1047 } 1048 this.lastReplayedOpenRegionSeqId = maxSeqId; 1049 1050 this.writestate.setReadOnly(ServerRegionReplicaUtil.isReadOnly(this)); 1051 this.writestate.flushRequested = false; 1052 this.writestate.compacting.set(0); 1053 1054 if (this.writestate.writesEnabled) { 1055 LOG.debug("Cleaning up temporary data for " + this.getRegionInfo().getEncodedName()); 1056 // Remove temporary data left over from old regions 1057 status.setStatus("Cleaning up temporary data from old regions"); 1058 fs.cleanupTempDir(); 1059 } 1060 1061 // Initialize split policy 1062 this.splitPolicy = RegionSplitPolicy.create(this, conf); 1063 1064 // Initialize split restriction 1065 splitRestriction = RegionSplitRestriction.create(getTableDescriptor(), conf); 1066 1067 // Initialize flush policy 1068 this.flushPolicy = FlushPolicyFactory.create(this, conf); 1069 1070 long lastFlushTime = EnvironmentEdgeManager.currentTime(); 1071 for (HStore store : stores.values()) { 1072 this.lastStoreFlushTimeMap.put(store, lastFlushTime); 1073 } 1074 1075 // Use maximum of log sequenceid or that which was found in stores 1076 // (particularly if no recovered edits, seqid will be -1). 1077 long nextSeqId = maxSeqId + 1; 1078 if (!isRestoredRegion) { 1079 // always get openSeqNum from the default replica, even if we are secondary replicas 1080 long maxSeqIdFromFile = WALSplitUtil.getMaxRegionSequenceId(conf, 1081 RegionReplicaUtil.getRegionInfoForDefaultReplica(getRegionInfo()), this::getFilesystem, 1082 this::getWalFileSystem); 1083 nextSeqId = Math.max(maxSeqId, maxSeqIdFromFile) + 1; 1084 // The openSeqNum will always be increase even for read only region, as we rely on it to 1085 // determine whether a region has been successfully reopened, so here we always need to update 1086 // the max sequence id file. 1087 if (RegionReplicaUtil.isDefaultReplica(getRegionInfo())) { 1088 LOG.debug("writing seq id for {}", this.getRegionInfo().getEncodedName()); 1089 WALSplitUtil.writeRegionSequenceIdFile(getWalFileSystem(), getWALRegionDir(), 1090 nextSeqId - 1); 1091 // This means we have replayed all the recovered edits and also written out the max sequence 1092 // id file, let's delete the wrong directories introduced in HBASE-20734, see HBASE-22617 1093 // for more details. 1094 Path wrongRegionWALDir = CommonFSUtils.getWrongWALRegionDir(conf, 1095 getRegionInfo().getTable(), getRegionInfo().getEncodedName()); 1096 FileSystem walFs = getWalFileSystem(); 1097 if (walFs.exists(wrongRegionWALDir)) { 1098 if (!walFs.delete(wrongRegionWALDir, true)) { 1099 LOG.debug("Failed to clean up wrong region WAL directory {}", wrongRegionWALDir); 1100 } 1101 } 1102 } else { 1103 lastReplayedSequenceId = nextSeqId - 1; 1104 replayLock = new ReentrantLock(); 1105 } 1106 initializeRegionReplicationSink(reporter, status); 1107 } 1108 1109 LOG.info("Opened {}; next sequenceid={}; {}, {}", this.getRegionInfo().getShortNameToLog(), 1110 nextSeqId, this.splitPolicy, this.flushPolicy); 1111 1112 // A region can be reopened if failed a split; reset flags 1113 this.closing.set(false); 1114 this.closed.set(false); 1115 1116 if (coprocessorHost != null) { 1117 LOG.debug("Running coprocessor post-open hooks for " + this.getRegionInfo().getEncodedName()); 1118 status.setStatus("Running coprocessor post-open hooks"); 1119 coprocessorHost.postOpen(); 1120 } 1121 1122 metricsTableRequests = new MetricsTableRequests(htableDescriptor.getTableName(), conf); 1123 1124 status.markComplete("Region opened successfully"); 1125 return nextSeqId; 1126 } 1127 1128 private void initializeRegionReplicationSink(CancelableProgressable reporter, 1129 MonitoredTask status) { 1130 RegionServerServices rss = getRegionServerServices(); 1131 TableDescriptor td = getTableDescriptor(); 1132 int regionReplication = td.getRegionReplication(); 1133 RegionInfo regionInfo = getRegionInfo(); 1134 if ( 1135 regionReplication <= 1 || !RegionReplicaUtil.isDefaultReplica(regionInfo) 1136 || !ServerRegionReplicaUtil.isRegionReplicaReplicationEnabled(conf, regionInfo.getTable()) 1137 || rss == null 1138 ) { 1139 regionReplicationSink = Optional.empty(); 1140 return; 1141 } 1142 status.setStatus("Initializaing region replication sink"); 1143 regionReplicationSink = Optional.of(new RegionReplicationSink(conf, regionInfo, td, 1144 rss.getRegionReplicationBufferManager(), () -> rss.getFlushRequester().requestFlush(this, 1145 new ArrayList<>(td.getColumnFamilyNames()), FlushLifeCycleTracker.DUMMY), 1146 rss.getAsyncClusterConnection())); 1147 } 1148 1149 /** 1150 * Open all Stores. 1151 * @return Highest sequenceId found out in a Store. 1152 */ 1153 private long initializeStores(CancelableProgressable reporter, MonitoredTask status) 1154 throws IOException { 1155 return initializeStores(reporter, status, false); 1156 } 1157 1158 private long initializeStores(CancelableProgressable reporter, MonitoredTask status, 1159 boolean warmup) throws IOException { 1160 // Load in all the HStores. 1161 long maxSeqId = -1; 1162 // initialized to -1 so that we pick up MemstoreTS from column families 1163 long maxMemstoreTS = -1; 1164 1165 if (htableDescriptor.getColumnFamilyCount() != 0) { 1166 // initialize the thread pool for opening stores in parallel. 1167 ThreadPoolExecutor storeOpenerThreadPool = 1168 getStoreOpenAndCloseThreadPool("StoreOpener-" + this.getRegionInfo().getShortNameToLog()); 1169 CompletionService<HStore> completionService = 1170 new ExecutorCompletionService<>(storeOpenerThreadPool); 1171 1172 // initialize each store in parallel 1173 for (final ColumnFamilyDescriptor family : htableDescriptor.getColumnFamilies()) { 1174 status.setStatus("Instantiating store for column family " + family); 1175 completionService.submit(new Callable<HStore>() { 1176 @Override 1177 public HStore call() throws IOException { 1178 return instantiateHStore(family, warmup); 1179 } 1180 }); 1181 } 1182 boolean allStoresOpened = false; 1183 boolean hasSloppyStores = false; 1184 try { 1185 for (int i = 0; i < htableDescriptor.getColumnFamilyCount(); i++) { 1186 Future<HStore> future = completionService.take(); 1187 HStore store = future.get(); 1188 this.stores.put(store.getColumnFamilyDescriptor().getName(), store); 1189 if (store.isSloppyMemStore()) { 1190 hasSloppyStores = true; 1191 } 1192 1193 long storeMaxSequenceId = store.getMaxSequenceId().orElse(0L); 1194 maxSeqIdInStores.put(Bytes.toBytes(store.getColumnFamilyName()), storeMaxSequenceId); 1195 if (maxSeqId == -1 || storeMaxSequenceId > maxSeqId) { 1196 maxSeqId = storeMaxSequenceId; 1197 } 1198 long maxStoreMemstoreTS = store.getMaxMemStoreTS().orElse(0L); 1199 if (maxStoreMemstoreTS > maxMemstoreTS) { 1200 maxMemstoreTS = maxStoreMemstoreTS; 1201 } 1202 } 1203 allStoresOpened = true; 1204 if (hasSloppyStores) { 1205 htableDescriptor = TableDescriptorBuilder.newBuilder(htableDescriptor) 1206 .setFlushPolicyClassName(FlushNonSloppyStoresFirstPolicy.class.getName()).build(); 1207 LOG.info("Setting FlushNonSloppyStoresFirstPolicy for the region=" + this); 1208 } 1209 } catch (InterruptedException e) { 1210 throw throwOnInterrupt(e); 1211 } catch (ExecutionException e) { 1212 throw new IOException(e.getCause()); 1213 } finally { 1214 storeOpenerThreadPool.shutdownNow(); 1215 if (!allStoresOpened) { 1216 // something went wrong, close all opened stores 1217 LOG.error("Could not initialize all stores for the region=" + this); 1218 for (HStore store : this.stores.values()) { 1219 try { 1220 store.close(); 1221 } catch (IOException e) { 1222 LOG.warn("close store {} failed in region {}", store.toString(), this, e); 1223 } 1224 } 1225 } 1226 } 1227 } 1228 return Math.max(maxSeqId, maxMemstoreTS + 1); 1229 } 1230 1231 private void initializeWarmup(final CancelableProgressable reporter) throws IOException { 1232 MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this); 1233 // Initialize all the HStores 1234 status.setStatus("Warmup all stores of " + this.getRegionInfo().getRegionNameAsString()); 1235 try { 1236 initializeStores(reporter, status, true); 1237 } finally { 1238 status.markComplete("Warmed up " + this.getRegionInfo().getRegionNameAsString()); 1239 } 1240 } 1241 1242 /** Returns Map of StoreFiles by column family */ 1243 private NavigableMap<byte[], List<Path>> getStoreFiles() { 1244 NavigableMap<byte[], List<Path>> allStoreFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR); 1245 for (HStore store : stores.values()) { 1246 Collection<HStoreFile> storeFiles = store.getStorefiles(); 1247 if (storeFiles == null) { 1248 continue; 1249 } 1250 List<Path> storeFileNames = new ArrayList<>(); 1251 for (HStoreFile storeFile : storeFiles) { 1252 storeFileNames.add(storeFile.getPath()); 1253 } 1254 allStoreFiles.put(store.getColumnFamilyDescriptor().getName(), storeFileNames); 1255 } 1256 return allStoreFiles; 1257 } 1258 1259 protected void writeRegionOpenMarker(WAL wal, long openSeqId) throws IOException { 1260 Map<byte[], List<Path>> storeFiles = getStoreFiles(); 1261 RegionEventDescriptor regionOpenDesc = 1262 ProtobufUtil.toRegionEventDescriptor(RegionEventDescriptor.EventType.REGION_OPEN, 1263 getRegionInfo(), openSeqId, getRegionServerServices().getServerName(), storeFiles); 1264 WALUtil.writeRegionEventMarker(wal, getReplicationScope(), getRegionInfo(), regionOpenDesc, 1265 mvcc, regionReplicationSink.orElse(null)); 1266 } 1267 1268 private void writeRegionCloseMarker(WAL wal) throws IOException { 1269 Map<byte[], List<Path>> storeFiles = getStoreFiles(); 1270 RegionEventDescriptor regionEventDesc = ProtobufUtil.toRegionEventDescriptor( 1271 RegionEventDescriptor.EventType.REGION_CLOSE, getRegionInfo(), mvcc.getReadPoint(), 1272 getRegionServerServices().getServerName(), storeFiles); 1273 // we do not care region close event at secondary replica side so just pass a null 1274 // RegionReplicationSink 1275 WALUtil.writeRegionEventMarker(wal, getReplicationScope(), getRegionInfo(), regionEventDesc, 1276 mvcc, null); 1277 1278 // Store SeqId in WAL FileSystem when a region closes 1279 // checking region folder exists is due to many tests which delete the table folder while a 1280 // table is still online 1281 if (getWalFileSystem().exists(getWALRegionDir())) { 1282 WALSplitUtil.writeRegionSequenceIdFile(getWalFileSystem(), getWALRegionDir(), 1283 mvcc.getReadPoint()); 1284 } 1285 } 1286 1287 /** Returns True if this region has references. */ 1288 public boolean hasReferences() { 1289 return stores.values().stream().anyMatch(HStore::hasReferences); 1290 } 1291 1292 public void blockUpdates() { 1293 this.updatesLock.writeLock().lock(); 1294 } 1295 1296 public void unblockUpdates() { 1297 this.updatesLock.writeLock().unlock(); 1298 } 1299 1300 public HDFSBlocksDistribution getHDFSBlocksDistribution() { 1301 HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution(); 1302 stores.values().stream().filter(s -> s.getStorefiles() != null) 1303 .flatMap(s -> s.getStorefiles().stream()).map(HStoreFile::getHDFSBlockDistribution) 1304 .forEachOrdered(hdfsBlocksDistribution::add); 1305 return hdfsBlocksDistribution; 1306 } 1307 1308 /** 1309 * This is a helper function to compute HDFS block distribution on demand 1310 * @param conf configuration 1311 * @param tableDescriptor TableDescriptor of the table 1312 * @param regionInfo encoded name of the region 1313 * @return The HDFS blocks distribution for the given region. 1314 */ 1315 public static HDFSBlocksDistribution computeHDFSBlocksDistribution(Configuration conf, 1316 TableDescriptor tableDescriptor, RegionInfo regionInfo) throws IOException { 1317 Path tablePath = 1318 CommonFSUtils.getTableDir(CommonFSUtils.getRootDir(conf), tableDescriptor.getTableName()); 1319 return computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo, tablePath); 1320 } 1321 1322 /** 1323 * This is a helper function to compute HDFS block distribution on demand 1324 * @param conf configuration 1325 * @param tableDescriptor TableDescriptor of the table 1326 * @param regionInfo encoded name of the region 1327 * @param tablePath the table directory 1328 * @return The HDFS blocks distribution for the given region. 1329 */ 1330 public static HDFSBlocksDistribution computeHDFSBlocksDistribution(Configuration conf, 1331 TableDescriptor tableDescriptor, RegionInfo regionInfo, Path tablePath) throws IOException { 1332 HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution(); 1333 FileSystem fs = tablePath.getFileSystem(conf); 1334 1335 HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tablePath, regionInfo); 1336 for (ColumnFamilyDescriptor family : tableDescriptor.getColumnFamilies()) { 1337 List<LocatedFileStatus> locatedFileStatusList = 1338 HRegionFileSystem.getStoreFilesLocatedStatus(regionFs, family.getNameAsString(), true); 1339 if (locatedFileStatusList == null) { 1340 continue; 1341 } 1342 1343 for (LocatedFileStatus status : locatedFileStatusList) { 1344 Path p = status.getPath(); 1345 if (StoreFileInfo.isReference(p) || HFileLink.isHFileLink(p)) { 1346 // Only construct StoreFileInfo object if its not a hfile, save obj 1347 // creation 1348 StoreFileInfo storeFileInfo = new StoreFileInfo(conf, fs, status); 1349 hdfsBlocksDistribution.add(storeFileInfo.computeHDFSBlocksDistribution(fs)); 1350 } else if (StoreFileInfo.isHFile(p)) { 1351 // If its a HFile, then lets just add to the block distribution 1352 // lets not create more objects here, not even another HDFSBlocksDistribution 1353 FSUtils.addToHDFSBlocksDistribution(hdfsBlocksDistribution, status.getBlockLocations()); 1354 } else { 1355 throw new IOException("path=" + p + " doesn't look like a valid StoreFile"); 1356 } 1357 } 1358 } 1359 return hdfsBlocksDistribution; 1360 } 1361 1362 /** 1363 * Increase the size of mem store in this region and the size of global mem store 1364 */ 1365 private void incMemStoreSize(MemStoreSize mss) { 1366 incMemStoreSize(mss.getDataSize(), mss.getHeapSize(), mss.getOffHeapSize(), 1367 mss.getCellsCount()); 1368 } 1369 1370 void incMemStoreSize(long dataSizeDelta, long heapSizeDelta, long offHeapSizeDelta, 1371 int cellsCountDelta) { 1372 if (this.rsAccounting != null) { 1373 rsAccounting.incGlobalMemStoreSize(dataSizeDelta, heapSizeDelta, offHeapSizeDelta); 1374 } 1375 long dataSize = this.memStoreSizing.incMemStoreSize(dataSizeDelta, heapSizeDelta, 1376 offHeapSizeDelta, cellsCountDelta); 1377 checkNegativeMemStoreDataSize(dataSize, dataSizeDelta); 1378 } 1379 1380 void decrMemStoreSize(MemStoreSize mss) { 1381 decrMemStoreSize(mss.getDataSize(), mss.getHeapSize(), mss.getOffHeapSize(), 1382 mss.getCellsCount()); 1383 } 1384 1385 private void decrMemStoreSize(long dataSizeDelta, long heapSizeDelta, long offHeapSizeDelta, 1386 int cellsCountDelta) { 1387 if (this.rsAccounting != null) { 1388 rsAccounting.decGlobalMemStoreSize(dataSizeDelta, heapSizeDelta, offHeapSizeDelta); 1389 } 1390 long dataSize = this.memStoreSizing.decMemStoreSize(dataSizeDelta, heapSizeDelta, 1391 offHeapSizeDelta, cellsCountDelta); 1392 checkNegativeMemStoreDataSize(dataSize, -dataSizeDelta); 1393 } 1394 1395 private void checkNegativeMemStoreDataSize(long memStoreDataSize, long delta) { 1396 // This is extremely bad if we make memStoreSizing negative. Log as much info on the offending 1397 // caller as possible. (memStoreSizing might be a negative value already -- freeing memory) 1398 if (memStoreDataSize < 0) { 1399 LOG.error("Asked to modify this region's (" + this.toString() 1400 + ") memStoreSizing to a negative value which is incorrect. Current memStoreSizing=" 1401 + (memStoreDataSize - delta) + ", delta=" + delta, new Exception()); 1402 } 1403 } 1404 1405 @Override 1406 public RegionInfo getRegionInfo() { 1407 return this.fs.getRegionInfo(); 1408 } 1409 1410 /** 1411 * Returns Instance of {@link RegionServerServices} used by this HRegion. Can be null. 1412 */ 1413 RegionServerServices getRegionServerServices() { 1414 return this.rsServices; 1415 } 1416 1417 @Override 1418 public long getReadRequestsCount() { 1419 return readRequestsCount.sum(); 1420 } 1421 1422 @Override 1423 public long getCpRequestsCount() { 1424 return cpRequestsCount.sum(); 1425 } 1426 1427 @Override 1428 public long getFilteredReadRequestsCount() { 1429 return filteredReadRequestsCount.sum(); 1430 } 1431 1432 @Override 1433 public long getWriteRequestsCount() { 1434 return writeRequestsCount.sum(); 1435 } 1436 1437 @Override 1438 public long getMemStoreDataSize() { 1439 return memStoreSizing.getDataSize(); 1440 } 1441 1442 @Override 1443 public long getMemStoreHeapSize() { 1444 return memStoreSizing.getHeapSize(); 1445 } 1446 1447 @Override 1448 public long getMemStoreOffHeapSize() { 1449 return memStoreSizing.getOffHeapSize(); 1450 } 1451 1452 /** Returns store services for this region, to access services required by store level needs */ 1453 public RegionServicesForStores getRegionServicesForStores() { 1454 return regionServicesForStores; 1455 } 1456 1457 @Override 1458 public long getNumMutationsWithoutWAL() { 1459 return numMutationsWithoutWAL.sum(); 1460 } 1461 1462 @Override 1463 public long getDataInMemoryWithoutWAL() { 1464 return dataInMemoryWithoutWAL.sum(); 1465 } 1466 1467 @Override 1468 public long getBlockedRequestsCount() { 1469 return blockedRequestsCount.sum(); 1470 } 1471 1472 @Override 1473 public long getCheckAndMutateChecksPassed() { 1474 return checkAndMutateChecksPassed.sum(); 1475 } 1476 1477 @Override 1478 public long getCheckAndMutateChecksFailed() { 1479 return checkAndMutateChecksFailed.sum(); 1480 } 1481 1482 // TODO Needs to check whether we should expose our metrics system to CPs. If CPs themselves doing 1483 // the op and bypassing the core, this might be needed? Should be stop supporting the bypass 1484 // feature? 1485 public MetricsRegion getMetrics() { 1486 return metricsRegion; 1487 } 1488 1489 @Override 1490 public boolean isClosed() { 1491 return this.closed.get(); 1492 } 1493 1494 @Override 1495 public boolean isClosing() { 1496 return this.closing.get(); 1497 } 1498 1499 @Override 1500 public boolean isReadOnly() { 1501 return this.writestate.isReadOnly(); 1502 } 1503 1504 @Override 1505 public boolean isAvailable() { 1506 return !isClosed() && !isClosing(); 1507 } 1508 1509 @Override 1510 public boolean isSplittable() { 1511 return splitPolicy.canSplit(); 1512 } 1513 1514 @Override 1515 public boolean isMergeable() { 1516 if (!isAvailable()) { 1517 LOG.debug("Region " + this + " is not mergeable because it is closing or closed"); 1518 return false; 1519 } 1520 if (hasReferences()) { 1521 LOG.debug("Region " + this + " is not mergeable because it has references"); 1522 return false; 1523 } 1524 1525 return true; 1526 } 1527 1528 public boolean areWritesEnabled() { 1529 synchronized (this.writestate) { 1530 return this.writestate.writesEnabled; 1531 } 1532 } 1533 1534 public MultiVersionConcurrencyControl getMVCC() { 1535 return mvcc; 1536 } 1537 1538 @Override 1539 public long getMaxFlushedSeqId() { 1540 return maxFlushedSeqId; 1541 } 1542 1543 /** Returns readpoint considering given IsolationLevel. Pass {@code null} for default */ 1544 public long getReadPoint(IsolationLevel isolationLevel) { 1545 if (isolationLevel != null && isolationLevel == IsolationLevel.READ_UNCOMMITTED) { 1546 // This scan can read even uncommitted transactions 1547 return Long.MAX_VALUE; 1548 } 1549 return mvcc.getReadPoint(); 1550 } 1551 1552 public boolean isLoadingCfsOnDemandDefault() { 1553 return this.isLoadingCfsOnDemandDefault; 1554 } 1555 1556 /** 1557 * Close down this HRegion. Flush the cache, shut down each HStore, don't service any more calls. 1558 * <p> 1559 * This method could take some time to execute, so don't call it from a time-sensitive thread. 1560 * @return Vector of all the storage files that the HRegion's component HStores make use of. It's 1561 * a list of all StoreFile objects. Returns empty vector if already closed and null if 1562 * judged that it should not close. 1563 * @throws IOException e 1564 * @throws DroppedSnapshotException Thrown when replay of wal is required because a Snapshot was 1565 * not properly persisted. The region is put in closing mode, and 1566 * the caller MUST abort after this. 1567 */ 1568 public Map<byte[], List<HStoreFile>> close() throws IOException { 1569 return close(false); 1570 } 1571 1572 private final Object closeLock = new Object(); 1573 1574 /** Conf key for fair locking policy */ 1575 public static final String FAIR_REENTRANT_CLOSE_LOCK = 1576 "hbase.regionserver.fair.region.close.lock"; 1577 public static final boolean DEFAULT_FAIR_REENTRANT_CLOSE_LOCK = true; 1578 /** Conf key for the periodic flush interval */ 1579 public static final String MEMSTORE_PERIODIC_FLUSH_INTERVAL = 1580 "hbase.regionserver.optionalcacheflushinterval"; 1581 /** Default interval for the memstore flush */ 1582 public static final int DEFAULT_CACHE_FLUSH_INTERVAL = 3600000; 1583 /** Default interval for System tables memstore flush */ 1584 public static final int SYSTEM_CACHE_FLUSH_INTERVAL = 300000; // 5 minutes 1585 1586 /** Conf key to force a flush if there are already enough changes for one region in memstore */ 1587 public static final String MEMSTORE_FLUSH_PER_CHANGES = "hbase.regionserver.flush.per.changes"; 1588 public static final long DEFAULT_FLUSH_PER_CHANGES = 30000000; // 30 millions 1589 /** 1590 * The following MAX_FLUSH_PER_CHANGES is large enough because each KeyValue has 20+ bytes 1591 * overhead. Therefore, even 1G empty KVs occupy at least 20GB memstore size for a single region 1592 */ 1593 public static final long MAX_FLUSH_PER_CHANGES = 1000000000; // 1G 1594 1595 public static final String CLOSE_WAIT_ABORT = "hbase.regionserver.close.wait.abort"; 1596 public static final boolean DEFAULT_CLOSE_WAIT_ABORT = true; 1597 public static final String CLOSE_WAIT_TIME = "hbase.regionserver.close.wait.time.ms"; 1598 public static final long DEFAULT_CLOSE_WAIT_TIME = 60000; // 1 minute 1599 public static final String CLOSE_WAIT_INTERVAL = "hbase.regionserver.close.wait.interval.ms"; 1600 public static final long DEFAULT_CLOSE_WAIT_INTERVAL = 10000; // 10 seconds 1601 1602 public Map<byte[], List<HStoreFile>> close(boolean abort) throws IOException { 1603 return close(abort, false); 1604 } 1605 1606 /** 1607 * Close this HRegion. 1608 * @param abort true if server is aborting (only during testing) 1609 * @param ignoreStatus true if ignore the status (won't be showed on task list) 1610 * @return Vector of all the storage files that the HRegion's component HStores make use of. It's 1611 * a list of StoreFile objects. Can be null if we are not to close at this time, or we are 1612 * already closed. 1613 * @throws IOException e 1614 * @throws DroppedSnapshotException Thrown when replay of wal is required because a Snapshot was 1615 * not properly persisted. The region is put in closing mode, and 1616 * the caller MUST abort after this. 1617 */ 1618 public Map<byte[], List<HStoreFile>> close(boolean abort, boolean ignoreStatus) 1619 throws IOException { 1620 return close(abort, ignoreStatus, false); 1621 } 1622 1623 /** 1624 * Close down this HRegion. Flush the cache unless abort parameter is true, Shut down each HStore, 1625 * don't service any more calls. This method could take some time to execute, so don't call it 1626 * from a time-sensitive thread. 1627 * @param abort true if server is aborting (only during testing) 1628 * @param ignoreStatus true if ignore the status (wont be showed on task list) 1629 * @param isGracefulStop true if region is being closed during graceful stop and the blocks in the 1630 * BucketCache should not be evicted. 1631 * @return Vector of all the storage files that the HRegion's component HStores make use of. It's 1632 * a list of StoreFile objects. Can be null if we are not to close at this time or we are 1633 * already closed. 1634 * @throws IOException e 1635 * @throws DroppedSnapshotException Thrown when replay of wal is required because a Snapshot was 1636 * not properly persisted. The region is put in closing mode, and 1637 * the caller MUST abort after this. 1638 */ 1639 public Map<byte[], List<HStoreFile>> close(boolean abort, boolean ignoreStatus, 1640 boolean isGracefulStop) throws IOException { 1641 // Only allow one thread to close at a time. Serialize them so dual 1642 // threads attempting to close will run up against each other. 1643 MonitoredTask status = TaskMonitor.get().createStatus( 1644 "Closing region " + this.getRegionInfo().getEncodedName() + (abort ? " due to abort" : ""), 1645 ignoreStatus, true); 1646 status.setStatus("Waiting for close lock"); 1647 try { 1648 synchronized (closeLock) { 1649 if (isGracefulStop && rsServices != null) { 1650 rsServices.getBlockCache().ifPresent(blockCache -> { 1651 if (blockCache instanceof CombinedBlockCache) { 1652 BlockCache l2 = ((CombinedBlockCache) blockCache).getSecondLevelCache(); 1653 if (l2 instanceof BucketCache) { 1654 if (((BucketCache) l2).isCachePersistenceEnabled()) { 1655 LOG.info( 1656 "Closing region {} during a graceful stop, and cache persistence is on, " 1657 + "so setting evict on close to false. ", 1658 this.getRegionInfo().getRegionNameAsString()); 1659 this.getStores().forEach(s -> s.getCacheConfig().setEvictOnClose(false)); 1660 } 1661 } 1662 } 1663 }); 1664 } 1665 return doClose(abort, status); 1666 } 1667 } finally { 1668 if (LOG.isDebugEnabled()) { 1669 LOG.debug("Region close journal for {}:\n{}", this.getRegionInfo().getEncodedName(), 1670 status.prettyPrintJournal()); 1671 } 1672 status.cleanup(); 1673 } 1674 } 1675 1676 /** 1677 * Exposed for some very specific unit tests. 1678 */ 1679 public void setClosing(boolean closing) { 1680 this.closing.set(closing); 1681 } 1682 1683 /** 1684 * The {@link HRegion#doClose} will block forever if someone tries proving the dead lock via the 1685 * unit test. Instead of blocking, the {@link HRegion#doClose} will throw exception if you set the 1686 * timeout. 1687 * @param timeoutForWriteLock the second time to wait for the write lock in 1688 * {@link HRegion#doClose} 1689 */ 1690 public void setTimeoutForWriteLock(long timeoutForWriteLock) { 1691 assert timeoutForWriteLock >= 0; 1692 this.timeoutForWriteLock = timeoutForWriteLock; 1693 } 1694 1695 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "UL_UNRELEASED_LOCK_EXCEPTION_PATH", 1696 justification = "I think FindBugs is confused") 1697 private Map<byte[], List<HStoreFile>> doClose(boolean abort, MonitoredTask status) 1698 throws IOException { 1699 if (isClosed()) { 1700 LOG.warn("Region " + this + " already closed"); 1701 return null; 1702 } 1703 1704 if (coprocessorHost != null) { 1705 status.setStatus("Running coprocessor pre-close hooks"); 1706 this.coprocessorHost.preClose(abort); 1707 } 1708 status.setStatus("Disabling compacts and flushes for region"); 1709 boolean canFlush = true; 1710 synchronized (writestate) { 1711 // Disable compacting and flushing by background threads for this 1712 // region. 1713 canFlush = !writestate.readOnly; 1714 writestate.writesEnabled = false; 1715 LOG.debug("Closing {}, disabling compactions & flushes", 1716 this.getRegionInfo().getEncodedName()); 1717 waitForFlushesAndCompactions(); 1718 } 1719 // If we were not just flushing, is it worth doing a preflush...one 1720 // that will clear out of the bulk of the memstore before we put up 1721 // the close flag? 1722 if (!abort && worthPreFlushing() && canFlush) { 1723 status.setStatus("Pre-flushing region before close"); 1724 LOG.info("Running close preflush of {}", this.getRegionInfo().getEncodedName()); 1725 try { 1726 internalFlushcache(status); 1727 } catch (IOException ioe) { 1728 // Failed to flush the region. Keep going. 1729 status.setStatus("Failed pre-flush " + this + "; " + ioe.getMessage()); 1730 } 1731 } 1732 if (regionReplicationSink.isPresent()) { 1733 // stop replicating to secondary replicas 1734 // the open event marker can make secondary replicas refresh store files and catch up 1735 // everything, so here we just give up replicating later edits, to speed up the reopen process 1736 RegionReplicationSink sink = regionReplicationSink.get(); 1737 sink.stop(); 1738 try { 1739 regionReplicationSink.get().waitUntilStopped(); 1740 } catch (InterruptedException e) { 1741 throw throwOnInterrupt(e); 1742 } 1743 } 1744 // Set the closing flag 1745 // From this point new arrivals at the region lock will get NSRE. 1746 1747 this.closing.set(true); 1748 LOG.info("Closing region {}", this); 1749 1750 // Acquire the close lock 1751 1752 // The configuration parameter CLOSE_WAIT_ABORT is overloaded to enable both 1753 // the new regionserver abort condition and interrupts for running requests. 1754 // If CLOSE_WAIT_ABORT is not enabled there is no change from earlier behavior, 1755 // we will not attempt to interrupt threads servicing requests nor crash out 1756 // the regionserver if something remains stubborn. 1757 1758 final boolean canAbort = conf.getBoolean(CLOSE_WAIT_ABORT, DEFAULT_CLOSE_WAIT_ABORT); 1759 boolean useTimedWait = false; 1760 if (timeoutForWriteLock != null && timeoutForWriteLock != Long.MAX_VALUE) { 1761 // convert legacy use of timeoutForWriteLock in seconds to new use in millis 1762 timeoutForWriteLock = TimeUnit.SECONDS.toMillis(timeoutForWriteLock); 1763 useTimedWait = true; 1764 } else if (canAbort) { 1765 timeoutForWriteLock = conf.getLong(CLOSE_WAIT_TIME, DEFAULT_CLOSE_WAIT_TIME); 1766 useTimedWait = true; 1767 } 1768 if (LOG.isDebugEnabled()) { 1769 LOG.debug((useTimedWait ? "Time limited wait" : "Waiting without time limit") 1770 + " for close lock on " + this); 1771 } 1772 final long closeWaitInterval = conf.getLong(CLOSE_WAIT_INTERVAL, DEFAULT_CLOSE_WAIT_INTERVAL); 1773 long elapsedWaitTime = 0; 1774 if (useTimedWait) { 1775 // Sanity check configuration 1776 long remainingWaitTime = timeoutForWriteLock; 1777 if (remainingWaitTime < closeWaitInterval) { 1778 LOG.warn("Time limit for close wait of " + timeoutForWriteLock 1779 + " ms is less than the configured lock acquisition wait interval " + closeWaitInterval 1780 + " ms, using wait interval as time limit"); 1781 remainingWaitTime = closeWaitInterval; 1782 } 1783 boolean acquired = false; 1784 do { 1785 long start = EnvironmentEdgeManager.currentTime(); 1786 try { 1787 acquired = lock.writeLock().tryLock(Math.min(remainingWaitTime, closeWaitInterval), 1788 TimeUnit.MILLISECONDS); 1789 } catch (InterruptedException e) { 1790 // Interrupted waiting for close lock. More likely the server is shutting down, not 1791 // normal operation, so aborting upon interrupt while waiting on this lock would not 1792 // provide much value. Throw an IOE (as IIOE) like we would in the case where we 1793 // fail to acquire the lock. 1794 String msg = "Interrupted while waiting for close lock on " + this; 1795 LOG.warn(msg, e); 1796 throw (InterruptedIOException) new InterruptedIOException(msg).initCause(e); 1797 } 1798 long elapsed = EnvironmentEdgeManager.currentTime() - start; 1799 elapsedWaitTime += elapsed; 1800 remainingWaitTime -= elapsed; 1801 if (canAbort && !acquired && remainingWaitTime > 0) { 1802 // Before we loop to wait again, interrupt all region operations that might 1803 // still be in progress, to encourage them to break out of waiting states or 1804 // inner loops, throw an exception to clients, and release the read lock via 1805 // endRegionOperation. 1806 if (LOG.isDebugEnabled()) { 1807 LOG.debug("Interrupting region operations after waiting for close lock for " 1808 + elapsedWaitTime + " ms on " + this + ", " + remainingWaitTime + " ms remaining"); 1809 } 1810 interruptRegionOperations(); 1811 } 1812 } while (!acquired && remainingWaitTime > 0); 1813 1814 // If we fail to acquire the lock, trigger an abort if we can; otherwise throw an IOE 1815 // to let the caller know we could not proceed with the close. 1816 if (!acquired) { 1817 String msg = 1818 "Failed to acquire close lock on " + this + " after waiting " + elapsedWaitTime + " ms"; 1819 LOG.error(msg); 1820 if (canAbort) { 1821 // If we failed to acquire the write lock, abort the server 1822 rsServices.abort(msg, null); 1823 } 1824 throw new IOException(msg); 1825 } 1826 1827 } else { 1828 1829 long start = EnvironmentEdgeManager.currentTime(); 1830 lock.writeLock().lock(); 1831 elapsedWaitTime = EnvironmentEdgeManager.currentTime() - start; 1832 1833 } 1834 1835 if (LOG.isDebugEnabled()) { 1836 LOG.debug("Acquired close lock on " + this + " after waiting " + elapsedWaitTime + " ms"); 1837 } 1838 1839 status.setStatus("Disabling writes for close"); 1840 try { 1841 if (this.isClosed()) { 1842 status.abort("Already got closed by another process"); 1843 // SplitTransaction handles the null 1844 return null; 1845 } 1846 LOG.debug("Updates disabled for region " + this); 1847 // Don't flush the cache if we are aborting 1848 if (!abort && canFlush) { 1849 int failedfFlushCount = 0; 1850 int flushCount = 0; 1851 long tmp = 0; 1852 long remainingSize = this.memStoreSizing.getDataSize(); 1853 while (remainingSize > 0) { 1854 try { 1855 internalFlushcache(status); 1856 if (flushCount > 0) { 1857 LOG.info("Running extra flush, " + flushCount + " (carrying snapshot?) " + this); 1858 } 1859 flushCount++; 1860 tmp = this.memStoreSizing.getDataSize(); 1861 if (tmp >= remainingSize) { 1862 failedfFlushCount++; 1863 } 1864 remainingSize = tmp; 1865 if (failedfFlushCount > 5) { 1866 // If we failed 5 times and are unable to clear memory, abort 1867 // so we do not lose data 1868 throw new DroppedSnapshotException("Failed clearing memory after " + flushCount 1869 + " attempts on region: " + Bytes.toStringBinary(getRegionInfo().getRegionName())); 1870 } 1871 } catch (IOException ioe) { 1872 status.setStatus("Failed flush " + this + ", putting online again"); 1873 synchronized (writestate) { 1874 writestate.writesEnabled = true; 1875 } 1876 // Have to throw to upper layers. I can't abort server from here. 1877 throw ioe; 1878 } 1879 } 1880 } 1881 1882 Map<byte[], List<HStoreFile>> result = new TreeMap<>(Bytes.BYTES_COMPARATOR); 1883 if (!stores.isEmpty()) { 1884 // initialize the thread pool for closing stores in parallel. 1885 ThreadPoolExecutor storeCloserThreadPool = 1886 getStoreOpenAndCloseThreadPool("StoreCloser-" + getRegionInfo().getRegionNameAsString()); 1887 CompletionService<Pair<byte[], Collection<HStoreFile>>> completionService = 1888 new ExecutorCompletionService<>(storeCloserThreadPool); 1889 1890 // close each store in parallel 1891 for (HStore store : stores.values()) { 1892 MemStoreSize mss = store.getFlushableSize(); 1893 if (!(abort || mss.getDataSize() == 0 || writestate.readOnly)) { 1894 if (getRegionServerServices() != null) { 1895 getRegionServerServices().abort("Assertion failed while closing store " 1896 + getRegionInfo().getRegionNameAsString() + " " + store 1897 + ". flushableSize expected=0, actual={" + mss + "}. Current memStoreSize=" 1898 + this.memStoreSizing.getMemStoreSize() + ". Maybe a coprocessor " 1899 + "operation failed and left the memstore in a partially updated state.", null); 1900 } 1901 } 1902 completionService.submit(new Callable<Pair<byte[], Collection<HStoreFile>>>() { 1903 @Override 1904 public Pair<byte[], Collection<HStoreFile>> call() throws IOException { 1905 return new Pair<>(store.getColumnFamilyDescriptor().getName(), store.close()); 1906 } 1907 }); 1908 } 1909 try { 1910 for (int i = 0; i < stores.size(); i++) { 1911 Future<Pair<byte[], Collection<HStoreFile>>> future = completionService.take(); 1912 Pair<byte[], Collection<HStoreFile>> storeFiles = future.get(); 1913 List<HStoreFile> familyFiles = result.get(storeFiles.getFirst()); 1914 if (familyFiles == null) { 1915 familyFiles = new ArrayList<>(); 1916 result.put(storeFiles.getFirst(), familyFiles); 1917 } 1918 familyFiles.addAll(storeFiles.getSecond()); 1919 } 1920 } catch (InterruptedException e) { 1921 throw throwOnInterrupt(e); 1922 } catch (ExecutionException e) { 1923 Throwable cause = e.getCause(); 1924 if (cause instanceof IOException) { 1925 throw (IOException) cause; 1926 } 1927 throw new IOException(cause); 1928 } finally { 1929 storeCloserThreadPool.shutdownNow(); 1930 } 1931 } 1932 1933 status.setStatus("Writing region close event to WAL"); 1934 // Always write close marker to wal even for read only table. This is not a big problem as we 1935 // do not write any data into the region; it is just a meta edit in the WAL file. 1936 if ( 1937 !abort && wal != null && getRegionServerServices() != null 1938 && RegionReplicaUtil.isDefaultReplica(getRegionInfo()) 1939 ) { 1940 writeRegionCloseMarker(wal); 1941 } 1942 this.closed.set(true); 1943 1944 // Decrease refCount of table latency metric registry. 1945 // Do this after closed#set to make sure only -1. 1946 if (metricsTableRequests != null) { 1947 metricsTableRequests.removeRegistry(); 1948 } 1949 1950 if (!canFlush) { 1951 decrMemStoreSize(this.memStoreSizing.getMemStoreSize()); 1952 } else if (this.memStoreSizing.getDataSize() != 0) { 1953 LOG.error("Memstore data size is {} in region {}", this.memStoreSizing.getDataSize(), this); 1954 } 1955 if (coprocessorHost != null) { 1956 status.setStatus("Running coprocessor post-close hooks"); 1957 this.coprocessorHost.postClose(abort); 1958 } 1959 if (this.metricsRegion != null) { 1960 this.metricsRegion.close(); 1961 } 1962 if (this.metricsRegionWrapper != null) { 1963 Closeables.close(this.metricsRegionWrapper, true); 1964 } 1965 status.markComplete("Closed"); 1966 LOG.info("Closed {}", this); 1967 return result; 1968 } finally { 1969 lock.writeLock().unlock(); 1970 } 1971 } 1972 1973 /** Wait for all current flushes and compactions of the region to complete */ 1974 // TODO HBASE-18906. Check the usage (if any) in Phoenix and expose this or give alternate way for 1975 // Phoenix needs. 1976 public void waitForFlushesAndCompactions() { 1977 synchronized (writestate) { 1978 if (this.writestate.readOnly) { 1979 // we should not wait for replayed flushed if we are read only (for example in case the 1980 // region is a secondary replica). 1981 return; 1982 } 1983 boolean interrupted = false; 1984 try { 1985 while (writestate.compacting.get() > 0 || writestate.flushing) { 1986 LOG.debug("waiting for " + writestate.compacting + " compactions" 1987 + (writestate.flushing ? " & cache flush" : "") + " to complete for region " + this); 1988 try { 1989 writestate.wait(); 1990 } catch (InterruptedException iex) { 1991 // essentially ignore and propagate the interrupt back up 1992 LOG.warn("Interrupted while waiting in region {}", this); 1993 interrupted = true; 1994 break; 1995 } 1996 } 1997 } finally { 1998 if (interrupted) { 1999 Thread.currentThread().interrupt(); 2000 } 2001 } 2002 } 2003 } 2004 2005 /** 2006 * Wait for all current flushes of the region to complete 2007 */ 2008 public void waitForFlushes() { 2009 waitForFlushes(0);// Unbound wait 2010 } 2011 2012 @Override 2013 public boolean waitForFlushes(long timeout) { 2014 synchronized (writestate) { 2015 if (this.writestate.readOnly) { 2016 // we should not wait for replayed flushed if we are read only (for example in case the 2017 // region is a secondary replica). 2018 return true; 2019 } 2020 if (!writestate.flushing) return true; 2021 long start = EnvironmentEdgeManager.currentTime(); 2022 long duration = 0; 2023 boolean interrupted = false; 2024 LOG.debug("waiting for cache flush to complete for region " + this); 2025 try { 2026 while (writestate.flushing) { 2027 if (timeout > 0 && duration >= timeout) break; 2028 try { 2029 long toWait = timeout == 0 ? 0 : (timeout - duration); 2030 writestate.wait(toWait); 2031 } catch (InterruptedException iex) { 2032 // essentially ignore and propagate the interrupt back up 2033 LOG.warn("Interrupted while waiting in region {}", this); 2034 interrupted = true; 2035 break; 2036 } finally { 2037 duration = EnvironmentEdgeManager.currentTime() - start; 2038 } 2039 } 2040 } finally { 2041 if (interrupted) { 2042 Thread.currentThread().interrupt(); 2043 } 2044 } 2045 LOG.debug("Waited {} ms for region {} flush to complete", duration, this); 2046 return !(writestate.flushing); 2047 } 2048 } 2049 2050 @Override 2051 public Configuration getReadOnlyConfiguration() { 2052 return new ReadOnlyConfiguration(this.conf); 2053 } 2054 2055 @Override 2056 public int getMinBlockSizeBytes() { 2057 return minBlockSizeBytes; 2058 } 2059 2060 private ThreadPoolExecutor getStoreOpenAndCloseThreadPool(final String threadNamePrefix) { 2061 int numStores = Math.max(1, this.htableDescriptor.getColumnFamilyCount()); 2062 int maxThreads = Math.min(numStores, conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX, 2063 HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX)); 2064 return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix); 2065 } 2066 2067 ThreadPoolExecutor getStoreFileOpenAndCloseThreadPool(final String threadNamePrefix) { 2068 int numStores = Math.max(1, this.htableDescriptor.getColumnFamilyCount()); 2069 int maxThreads = Math.max(1, conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX, 2070 HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX) / numStores); 2071 return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix); 2072 } 2073 2074 private static ThreadPoolExecutor getOpenAndCloseThreadPool(int maxThreads, 2075 final String threadNamePrefix) { 2076 return Threads.getBoundedCachedThreadPool(maxThreads, 30L, TimeUnit.SECONDS, 2077 new ThreadFactory() { 2078 private int count = 1; 2079 2080 @Override 2081 public Thread newThread(Runnable r) { 2082 return new Thread(r, threadNamePrefix + "-" + count++); 2083 } 2084 }); 2085 } 2086 2087 /** Returns True if its worth doing a flush before we put up the close flag. */ 2088 private boolean worthPreFlushing() { 2089 return this.memStoreSizing.getDataSize() 2090 > this.conf.getLong("hbase.hregion.preclose.flush.size", 1024 * 1024 * 5); 2091 } 2092 2093 ////////////////////////////////////////////////////////////////////////////// 2094 // HRegion accessors 2095 ////////////////////////////////////////////////////////////////////////////// 2096 2097 @Override 2098 public TableDescriptor getTableDescriptor() { 2099 return this.htableDescriptor; 2100 } 2101 2102 public void setTableDescriptor(TableDescriptor desc) { 2103 htableDescriptor = desc; 2104 } 2105 2106 /** Returns WAL in use for this region */ 2107 public WAL getWAL() { 2108 return this.wal; 2109 } 2110 2111 public BlockCache getBlockCache() { 2112 return this.blockCache; 2113 } 2114 2115 /** 2116 * Only used for unit test which doesn't start region server. 2117 */ 2118 public void setBlockCache(BlockCache blockCache) { 2119 this.blockCache = blockCache; 2120 } 2121 2122 public MobFileCache getMobFileCache() { 2123 return this.mobFileCache; 2124 } 2125 2126 /** 2127 * Only used for unit test which doesn't start region server. 2128 */ 2129 public void setMobFileCache(MobFileCache mobFileCache) { 2130 this.mobFileCache = mobFileCache; 2131 } 2132 2133 /** Returns split policy for this region. */ 2134 RegionSplitPolicy getSplitPolicy() { 2135 return this.splitPolicy; 2136 } 2137 2138 /** 2139 * A split takes the config from the parent region & passes it to the daughter region's 2140 * constructor. If 'conf' was passed, you would end up using the HTD of the parent region in 2141 * addition to the new daughter HTD. Pass 'baseConf' to the daughter regions to avoid this tricky 2142 * dedupe problem. 2143 * @return Configuration object 2144 */ 2145 Configuration getBaseConf() { 2146 return this.baseConf; 2147 } 2148 2149 /** Returns {@link FileSystem} being used by this region */ 2150 public FileSystem getFilesystem() { 2151 return fs.getFileSystem(); 2152 } 2153 2154 /** Returns the {@link HRegionFileSystem} used by this region */ 2155 public HRegionFileSystem getRegionFileSystem() { 2156 return this.fs; 2157 } 2158 2159 /** Returns the WAL {@link HRegionFileSystem} used by this region */ 2160 HRegionWALFileSystem getRegionWALFileSystem() throws IOException { 2161 return new HRegionWALFileSystem(conf, getWalFileSystem(), 2162 CommonFSUtils.getWALTableDir(conf, htableDescriptor.getTableName()), fs.getRegionInfo()); 2163 } 2164 2165 /** Returns the WAL {@link FileSystem} being used by this region */ 2166 FileSystem getWalFileSystem() throws IOException { 2167 if (walFS == null) { 2168 walFS = CommonFSUtils.getWALFileSystem(conf); 2169 } 2170 return walFS; 2171 } 2172 2173 /** 2174 * @return the Region directory under WALRootDirectory 2175 * @throws IOException if there is an error getting WALRootDir 2176 */ 2177 public Path getWALRegionDir() throws IOException { 2178 if (regionWalDir == null) { 2179 regionWalDir = CommonFSUtils.getWALRegionDir(conf, getRegionInfo().getTable(), 2180 getRegionInfo().getEncodedName()); 2181 } 2182 return regionWalDir; 2183 } 2184 2185 @Override 2186 public long getEarliestFlushTimeForAllStores() { 2187 return Collections.min(lastStoreFlushTimeMap.values()); 2188 } 2189 2190 @Override 2191 public long getOldestHfileTs(boolean majorCompactionOnly) throws IOException { 2192 long result = Long.MAX_VALUE; 2193 for (HStore store : stores.values()) { 2194 Collection<HStoreFile> storeFiles = store.getStorefiles(); 2195 if (storeFiles == null) { 2196 continue; 2197 } 2198 for (HStoreFile file : storeFiles) { 2199 StoreFileReader sfReader = file.getReader(); 2200 if (sfReader == null) { 2201 continue; 2202 } 2203 HFile.Reader reader = sfReader.getHFileReader(); 2204 if (reader == null) { 2205 continue; 2206 } 2207 if (majorCompactionOnly) { 2208 byte[] val = reader.getHFileInfo().get(MAJOR_COMPACTION_KEY); 2209 if (val == null || !Bytes.toBoolean(val)) { 2210 continue; 2211 } 2212 } 2213 result = Math.min(result, reader.getFileContext().getFileCreateTime()); 2214 } 2215 } 2216 return result == Long.MAX_VALUE ? 0 : result; 2217 } 2218 2219 RegionLoad.Builder setCompleteSequenceId(RegionLoad.Builder regionLoadBldr) { 2220 long lastFlushOpSeqIdLocal = this.lastFlushOpSeqId; 2221 byte[] encodedRegionName = this.getRegionInfo().getEncodedNameAsBytes(); 2222 regionLoadBldr.clearStoreCompleteSequenceId(); 2223 for (byte[] familyName : this.stores.keySet()) { 2224 long earliest = this.wal.getEarliestMemStoreSeqNum(encodedRegionName, familyName); 2225 // Subtract - 1 to go earlier than the current oldest, unflushed edit in memstore; this will 2226 // give us a sequence id that is for sure flushed. We want edit replay to start after this 2227 // sequence id in this region. If NO_SEQNUM, use the regions maximum flush id. 2228 long csid = (earliest == HConstants.NO_SEQNUM) ? lastFlushOpSeqIdLocal : earliest - 1; 2229 regionLoadBldr.addStoreCompleteSequenceId(StoreSequenceId.newBuilder() 2230 .setFamilyName(UnsafeByteOperations.unsafeWrap(familyName)).setSequenceId(csid).build()); 2231 } 2232 return regionLoadBldr.setCompleteSequenceId(getMaxFlushedSeqId()); 2233 } 2234 2235 ////////////////////////////////////////////////////////////////////////////// 2236 // HRegion maintenance. 2237 // 2238 // These methods are meant to be called periodically by the HRegionServer for 2239 // upkeep. 2240 ////////////////////////////////////////////////////////////////////////////// 2241 /** 2242 * Do preparation for pending compaction. 2243 */ 2244 protected void doRegionCompactionPrep() throws IOException { 2245 } 2246 2247 /** 2248 * Synchronously compact all stores in the region. 2249 * <p> 2250 * This operation could block for a long time, so don't call it from a time-sensitive thread. 2251 * <p> 2252 * Note that no locks are taken to prevent possible conflicts between compaction and splitting 2253 * activities. The regionserver does not normally compact and split in parallel. However by 2254 * calling this method you may introduce unexpected and unhandled concurrency. Don't do this 2255 * unless you know what you are doing. 2256 * @param majorCompaction True to force a major compaction regardless of thresholds 2257 */ 2258 public void compact(boolean majorCompaction) throws IOException { 2259 if (majorCompaction) { 2260 stores.values().forEach(HStore::triggerMajorCompaction); 2261 } 2262 for (HStore s : stores.values()) { 2263 Optional<CompactionContext> compaction = s.requestCompaction(); 2264 if (compaction.isPresent()) { 2265 ThroughputController controller = null; 2266 if (rsServices != null) { 2267 controller = CompactionThroughputControllerFactory.create(rsServices, conf); 2268 } 2269 if (controller == null) { 2270 controller = NoLimitThroughputController.INSTANCE; 2271 } 2272 compact(compaction.get(), s, controller, null); 2273 } 2274 } 2275 } 2276 2277 /** 2278 * This is a helper function that compact all the stores synchronously. 2279 * <p> 2280 * It is used by utilities and testing 2281 */ 2282 public void compactStores() throws IOException { 2283 for (HStore s : stores.values()) { 2284 Optional<CompactionContext> compaction = s.requestCompaction(); 2285 if (compaction.isPresent()) { 2286 compact(compaction.get(), s, NoLimitThroughputController.INSTANCE, null); 2287 } 2288 } 2289 } 2290 2291 /** 2292 * This is a helper function that compact the given store. 2293 * <p> 2294 * It is used by utilities and testing 2295 */ 2296 void compactStore(byte[] family, ThroughputController throughputController) throws IOException { 2297 HStore s = getStore(family); 2298 Optional<CompactionContext> compaction = s.requestCompaction(); 2299 if (compaction.isPresent()) { 2300 compact(compaction.get(), s, throughputController, null); 2301 } 2302 } 2303 2304 /** 2305 * Called by compaction thread and after region is opened to compact the HStores if necessary. 2306 * <p> 2307 * This operation could block for a long time, so don't call it from a time-sensitive thread. Note 2308 * that no locking is necessary at this level because compaction only conflicts with a region 2309 * split, and that cannot happen because the region server does them sequentially and not in 2310 * parallel. 2311 * @param compaction Compaction details, obtained by requestCompaction() 2312 * @return whether the compaction completed 2313 */ 2314 public boolean compact(CompactionContext compaction, HStore store, 2315 ThroughputController throughputController) throws IOException { 2316 return compact(compaction, store, throughputController, null); 2317 } 2318 2319 private boolean shouldForbidMajorCompaction() { 2320 if (rsServices != null && rsServices.getReplicationSourceService() != null) { 2321 return rsServices.getReplicationSourceService().getSyncReplicationPeerInfoProvider() 2322 .checkState(getRegionInfo().getTable(), ForbidMajorCompactionChecker.get()); 2323 } 2324 return false; 2325 } 2326 2327 /** 2328 * We are trying to remove / relax the region read lock for compaction. Let's see what are the 2329 * potential race conditions among the operations (user scan, region split, region close and 2330 * region bulk load). user scan ---> region read lock region split --> region close first --> 2331 * region write lock region close --> region write lock region bulk load --> region write lock 2332 * read lock is compatible with read lock. ---> no problem with user scan/read region bulk load 2333 * does not cause problem for compaction (no consistency problem, store lock will help the store 2334 * file accounting). They can run almost concurrently at the region level. The only remaining race 2335 * condition is between the region close and compaction. So we will evaluate, below, how region 2336 * close intervenes with compaction if compaction does not acquire region read lock. Here are the 2337 * steps for compaction: 1. obtain list of StoreFile's 2. create StoreFileScanner's based on list 2338 * from #1 3. perform compaction and save resulting files under tmp dir 4. swap in compacted files 2339 * #1 is guarded by store lock. This patch does not change this --> no worse or better For #2, we 2340 * obtain smallest read point (for region) across all the Scanners (for both default compactor and 2341 * stripe compactor). The read points are for user scans. Region keeps the read points for all 2342 * currently open user scanners. Compaction needs to know the smallest read point so that during 2343 * re-write of the hfiles, it can remove the mvcc points for the cells if their mvccs are older 2344 * than the smallest since they are not needed anymore. This will not conflict with compaction. 2345 * For #3, it can be performed in parallel to other operations. For #4 bulk load and compaction 2346 * don't conflict with each other on the region level (for multi-family atomicy). Region close and 2347 * compaction are guarded pretty well by the 'writestate'. In HRegion#doClose(), we have : 2348 * synchronized (writestate) { // Disable compacting and flushing by background threads for this 2349 * // region. canFlush = !writestate.readOnly; writestate.writesEnabled = false; 2350 * LOG.debug("Closing " + this + ": disabling compactions & flushes"); 2351 * waitForFlushesAndCompactions(); } waitForFlushesAndCompactions() would wait for 2352 * writestate.compacting to come down to 0. and in HRegion.compact() try { synchronized 2353 * (writestate) { if (writestate.writesEnabled) { wasStateSet = true; ++writestate.compacting; } 2354 * else { String msg = "NOT compacting region " + this + ". Writes disabled."; LOG.info(msg); 2355 * status.abort(msg); return false; } } Also in compactor.performCompaction(): check periodically 2356 * to see if a system stop is requested if (closeChecker != null && 2357 * closeChecker.isTimeLimit(store, now)) { progress.cancel(); return false; } if (closeChecker != 2358 * null && closeChecker.isSizeLimit(store, len)) { progress.cancel(); return false; } 2359 */ 2360 public boolean compact(CompactionContext compaction, HStore store, 2361 ThroughputController throughputController, User user) throws IOException { 2362 assert compaction != null && compaction.hasSelection(); 2363 assert !compaction.getRequest().getFiles().isEmpty(); 2364 if (this.closing.get() || this.closed.get()) { 2365 LOG.debug("Skipping compaction on " + this + " because closing/closed"); 2366 store.cancelRequestedCompaction(compaction); 2367 return false; 2368 } 2369 2370 if (compaction.getRequest().isAllFiles() && shouldForbidMajorCompaction()) { 2371 LOG.warn("Skipping major compaction on " + this 2372 + " because this cluster is transiting sync replication state" 2373 + " from STANDBY to DOWNGRADE_ACTIVE"); 2374 store.cancelRequestedCompaction(compaction); 2375 return false; 2376 } 2377 2378 MonitoredTask status = null; 2379 boolean requestNeedsCancellation = true; 2380 try { 2381 byte[] cf = Bytes.toBytes(store.getColumnFamilyName()); 2382 if (stores.get(cf) != store) { 2383 LOG.warn("Store " + store.getColumnFamilyName() + " on region " + this 2384 + " has been re-instantiated, cancel this compaction request. " 2385 + " It may be caused by the roll back of split transaction"); 2386 return false; 2387 } 2388 2389 status = TaskMonitor.get().createStatus("Compacting " + store + " in " + this); 2390 if (this.closed.get()) { 2391 String msg = "Skipping compaction on " + this + " because closed"; 2392 LOG.debug(msg); 2393 status.abort(msg); 2394 return false; 2395 } 2396 boolean wasStateSet = false; 2397 try { 2398 synchronized (writestate) { 2399 if (writestate.writesEnabled) { 2400 wasStateSet = true; 2401 writestate.compacting.incrementAndGet(); 2402 } else { 2403 String msg = "NOT compacting region " + this + ". Writes disabled."; 2404 LOG.info(msg); 2405 status.abort(msg); 2406 return false; 2407 } 2408 } 2409 LOG.info("Starting compaction of {} in {}{}", store, this, 2410 (compaction.getRequest().isOffPeak() ? " as an off-peak compaction" : "")); 2411 doRegionCompactionPrep(); 2412 try { 2413 status.setStatus("Compacting store " + store); 2414 // We no longer need to cancel the request on the way out of this 2415 // method because Store#compact will clean up unconditionally 2416 requestNeedsCancellation = false; 2417 store.compact(compaction, throughputController, user); 2418 } catch (InterruptedIOException iioe) { 2419 String msg = "region " + this + " compaction interrupted"; 2420 LOG.info(msg, iioe); 2421 status.abort(msg); 2422 return false; 2423 } 2424 } finally { 2425 if (wasStateSet) { 2426 synchronized (writestate) { 2427 writestate.compacting.decrementAndGet(); 2428 if (writestate.compacting.get() <= 0) { 2429 writestate.notifyAll(); 2430 } 2431 } 2432 } 2433 } 2434 status.markComplete("Compaction complete"); 2435 return true; 2436 } finally { 2437 if (requestNeedsCancellation) store.cancelRequestedCompaction(compaction); 2438 if (status != null) { 2439 LOG.debug("Compaction status journal for {}:\n{}", this.getRegionInfo().getEncodedName(), 2440 status.prettyPrintJournal()); 2441 status.cleanup(); 2442 } 2443 } 2444 } 2445 2446 /** 2447 * Flush the cache. 2448 * <p> 2449 * When this method is called the cache will be flushed unless: 2450 * <ol> 2451 * <li>the cache is empty</li> 2452 * <li>the region is closed.</li> 2453 * <li>a flush is already in progress</li> 2454 * <li>writes are disabled</li> 2455 * </ol> 2456 * <p> 2457 * This method may block for some time, so it should not be called from a time-sensitive thread. 2458 * @param flushAllStores whether we want to force a flush of all stores 2459 * @return FlushResult indicating whether the flush was successful or not and if the region needs 2460 * compacting 2461 * @throws IOException general io exceptions because a snapshot was not properly persisted. 2462 */ 2463 // TODO HBASE-18905. We might have to expose a requestFlush API for CPs 2464 public FlushResult flush(boolean flushAllStores) throws IOException { 2465 return flushcache(flushAllStores, false, FlushLifeCycleTracker.DUMMY); 2466 } 2467 2468 public interface FlushResult { 2469 enum Result { 2470 FLUSHED_NO_COMPACTION_NEEDED, 2471 FLUSHED_COMPACTION_NEEDED, 2472 // Special case where a flush didn't run because there's nothing in the memstores. Used when 2473 // bulk loading to know when we can still load even if a flush didn't happen. 2474 CANNOT_FLUSH_MEMSTORE_EMPTY, 2475 CANNOT_FLUSH 2476 } 2477 2478 /** Returns the detailed result code */ 2479 Result getResult(); 2480 2481 /** Returns true if the memstores were flushed, else false */ 2482 boolean isFlushSucceeded(); 2483 2484 /** Returns True if the flush requested a compaction, else false */ 2485 boolean isCompactionNeeded(); 2486 } 2487 2488 public FlushResultImpl flushcache(boolean flushAllStores, boolean writeFlushRequestWalMarker, 2489 FlushLifeCycleTracker tracker) throws IOException { 2490 List<byte[]> families = null; 2491 if (flushAllStores) { 2492 families = new ArrayList<>(); 2493 families.addAll(this.getTableDescriptor().getColumnFamilyNames()); 2494 } 2495 return this.flushcache(families, writeFlushRequestWalMarker, tracker); 2496 } 2497 2498 /** 2499 * Flush the cache. When this method is called the cache will be flushed unless: 2500 * <ol> 2501 * <li>the cache is empty</li> 2502 * <li>the region is closed.</li> 2503 * <li>a flush is already in progress</li> 2504 * <li>writes are disabled</li> 2505 * </ol> 2506 * <p> 2507 * This method may block for some time, so it should not be called from a time-sensitive thread. 2508 * @param families stores of region to flush. 2509 * @param writeFlushRequestWalMarker whether to write the flush request marker to WAL 2510 * @param tracker used to track the life cycle of this flush 2511 * @return whether the flush is success and whether the region needs compacting 2512 * @throws IOException general io exceptions 2513 * @throws DroppedSnapshotException Thrown when replay of wal is required because a Snapshot was 2514 * not properly persisted. The region is put in closing mode, and 2515 * the caller MUST abort after this. 2516 */ 2517 public FlushResultImpl flushcache(List<byte[]> families, boolean writeFlushRequestWalMarker, 2518 FlushLifeCycleTracker tracker) throws IOException { 2519 // fail-fast instead of waiting on the lock 2520 if (this.closing.get()) { 2521 String msg = "Skipping flush on " + this + " because closing"; 2522 LOG.debug(msg); 2523 return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false); 2524 } 2525 MonitoredTask status = TaskMonitor.get().createStatus("Flushing " + this); 2526 status.setStatus("Acquiring readlock on region"); 2527 // block waiting for the lock for flushing cache 2528 lock.readLock().lock(); 2529 boolean flushed = true; 2530 try { 2531 if (this.closed.get()) { 2532 String msg = "Skipping flush on " + this + " because closed"; 2533 LOG.debug(msg); 2534 status.abort(msg); 2535 flushed = false; 2536 return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false); 2537 } 2538 if (coprocessorHost != null) { 2539 status.setStatus("Running coprocessor pre-flush hooks"); 2540 coprocessorHost.preFlush(tracker); 2541 } 2542 // TODO: this should be managed within memstore with the snapshot, updated only after flush 2543 // successful 2544 if (numMutationsWithoutWAL.sum() > 0) { 2545 numMutationsWithoutWAL.reset(); 2546 dataInMemoryWithoutWAL.reset(); 2547 } 2548 synchronized (writestate) { 2549 if (!writestate.flushing && writestate.writesEnabled) { 2550 this.writestate.flushing = true; 2551 } else { 2552 String msg = "NOT flushing " + this + " as " 2553 + (writestate.flushing ? "already flushing" : "writes are not enabled"); 2554 LOG.debug(msg); 2555 status.abort(msg); 2556 flushed = false; 2557 return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false); 2558 } 2559 } 2560 2561 try { 2562 // The reason that we do not always use flushPolicy is, when the flush is 2563 // caused by logRoller, we should select stores which must be flushed 2564 // rather than could be flushed. 2565 Collection<HStore> specificStoresToFlush = null; 2566 if (families != null) { 2567 specificStoresToFlush = getSpecificStores(families); 2568 } else { 2569 specificStoresToFlush = flushPolicy.selectStoresToFlush(); 2570 } 2571 FlushResultImpl fs = 2572 internalFlushcache(specificStoresToFlush, status, writeFlushRequestWalMarker, tracker); 2573 2574 if (coprocessorHost != null) { 2575 status.setStatus("Running post-flush coprocessor hooks"); 2576 coprocessorHost.postFlush(tracker); 2577 } 2578 2579 if (fs.isFlushSucceeded()) { 2580 flushesQueued.reset(); 2581 } 2582 2583 status.markComplete("Flush successful " + fs.toString()); 2584 return fs; 2585 } finally { 2586 synchronized (writestate) { 2587 writestate.flushing = false; 2588 this.writestate.flushRequested = false; 2589 writestate.notifyAll(); 2590 } 2591 } 2592 } finally { 2593 lock.readLock().unlock(); 2594 if (flushed) { 2595 // Don't log this journal stuff if no flush -- confusing. 2596 LOG.debug("Flush status journal for {}:\n{}", this.getRegionInfo().getEncodedName(), 2597 status.prettyPrintJournal()); 2598 } 2599 status.cleanup(); 2600 } 2601 } 2602 2603 /** 2604 * get stores which matches the specified families 2605 * @return the stores need to be flushed. 2606 */ 2607 private Collection<HStore> getSpecificStores(List<byte[]> families) { 2608 Collection<HStore> specificStoresToFlush = new ArrayList<>(); 2609 for (byte[] family : families) { 2610 specificStoresToFlush.add(stores.get(family)); 2611 } 2612 return specificStoresToFlush; 2613 } 2614 2615 /** 2616 * Should the store be flushed because it is old enough. 2617 * <p> 2618 * Every FlushPolicy should call this to determine whether a store is old enough to flush (except 2619 * that you always flush all stores). Otherwise the method will always returns true which will 2620 * make a lot of flush requests. 2621 */ 2622 boolean shouldFlushStore(HStore store) { 2623 long earliest = this.wal.getEarliestMemStoreSeqNum(getRegionInfo().getEncodedNameAsBytes(), 2624 store.getColumnFamilyDescriptor().getName()) - 1; 2625 if (earliest > 0 && earliest + flushPerChanges < mvcc.getReadPoint()) { 2626 if (LOG.isDebugEnabled()) { 2627 LOG.debug("Flush column family " + store.getColumnFamilyName() + " of " 2628 + getRegionInfo().getEncodedName() + " because unflushed sequenceid=" + earliest 2629 + " is > " + this.flushPerChanges + " from current=" + mvcc.getReadPoint()); 2630 } 2631 return true; 2632 } 2633 if (this.flushCheckInterval <= 0) { 2634 return false; 2635 } 2636 long now = EnvironmentEdgeManager.currentTime(); 2637 if (store.timeOfOldestEdit() < now - this.flushCheckInterval) { 2638 if (LOG.isDebugEnabled()) { 2639 LOG.debug("Flush column family: " + store.getColumnFamilyName() + " of " 2640 + getRegionInfo().getEncodedName() + " because time of oldest edit=" 2641 + store.timeOfOldestEdit() + " is > " + this.flushCheckInterval + " from now =" + now); 2642 } 2643 return true; 2644 } 2645 return false; 2646 } 2647 2648 /** 2649 * Should the memstore be flushed now 2650 */ 2651 boolean shouldFlush(final StringBuilder whyFlush) { 2652 whyFlush.setLength(0); 2653 // This is a rough measure. 2654 if ( 2655 this.maxFlushedSeqId > 0 2656 && (this.maxFlushedSeqId + this.flushPerChanges < this.mvcc.getReadPoint()) 2657 ) { 2658 whyFlush.append("more than max edits, " + this.flushPerChanges + ", since last flush"); 2659 return true; 2660 } 2661 long modifiedFlushCheckInterval = flushCheckInterval; 2662 if ( 2663 getRegionInfo().getTable().isSystemTable() 2664 && getRegionInfo().getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID 2665 ) { 2666 modifiedFlushCheckInterval = SYSTEM_CACHE_FLUSH_INTERVAL; 2667 } 2668 if (modifiedFlushCheckInterval <= 0) { // disabled 2669 return false; 2670 } 2671 long now = EnvironmentEdgeManager.currentTime(); 2672 // if we flushed in the recent past, we don't need to do again now 2673 if ((now - getEarliestFlushTimeForAllStores() < modifiedFlushCheckInterval)) { 2674 return false; 2675 } 2676 // since we didn't flush in the recent past, flush now if certain conditions 2677 // are met. Return true on first such memstore hit. 2678 for (HStore s : stores.values()) { 2679 if (s.timeOfOldestEdit() < now - modifiedFlushCheckInterval) { 2680 // we have an old enough edit in the memstore, flush 2681 whyFlush.append(s.toString() + " has an old edit so flush to free WALs"); 2682 return true; 2683 } 2684 } 2685 return false; 2686 } 2687 2688 /** 2689 * Flushing all stores. 2690 * @see #internalFlushcache(Collection, MonitoredTask, boolean, FlushLifeCycleTracker) 2691 */ 2692 private FlushResult internalFlushcache(MonitoredTask status) throws IOException { 2693 return internalFlushcache(stores.values(), status, false, FlushLifeCycleTracker.DUMMY); 2694 } 2695 2696 /** 2697 * Flushing given stores. 2698 * @see #internalFlushcache(WAL, long, Collection, MonitoredTask, boolean, FlushLifeCycleTracker) 2699 */ 2700 private FlushResultImpl internalFlushcache(Collection<HStore> storesToFlush, MonitoredTask status, 2701 boolean writeFlushWalMarker, FlushLifeCycleTracker tracker) throws IOException { 2702 return internalFlushcache(this.wal, HConstants.NO_SEQNUM, storesToFlush, status, 2703 writeFlushWalMarker, tracker); 2704 } 2705 2706 /** 2707 * Flush the memstore. Flushing the memstore is a little tricky. We have a lot of updates in the 2708 * memstore, all of which have also been written to the wal. We need to write those updates in the 2709 * memstore out to disk, while being able to process reads/writes as much as possible during the 2710 * flush operation. 2711 * <p> 2712 * This method may block for some time. Every time you call it, we up the regions sequence id even 2713 * if we don't flush; i.e. the returned region id will be at least one larger than the last edit 2714 * applied to this region. The returned id does not refer to an actual edit. The returned id can 2715 * be used for say installing a bulk loaded file just ahead of the last hfile that was the result 2716 * of this flush, etc. 2717 * @param wal Null if we're NOT to go via wal. 2718 * @param myseqid The seqid to use if <code>wal</code> is null writing out flush file. 2719 * @param storesToFlush The list of stores to flush. 2720 * @return object describing the flush's state 2721 * @throws IOException general io exceptions 2722 * @throws DroppedSnapshotException Thrown when replay of WAL is required. 2723 */ 2724 protected FlushResultImpl internalFlushcache(WAL wal, long myseqid, 2725 Collection<HStore> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker, 2726 FlushLifeCycleTracker tracker) throws IOException { 2727 PrepareFlushResult result = 2728 internalPrepareFlushCache(wal, myseqid, storesToFlush, status, writeFlushWalMarker, tracker); 2729 if (result.result == null) { 2730 return internalFlushCacheAndCommit(wal, status, result, storesToFlush); 2731 } else { 2732 return result.result; // early exit due to failure from prepare stage 2733 } 2734 } 2735 2736 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "DLS_DEAD_LOCAL_STORE", 2737 justification = "FindBugs seems confused about trxId") 2738 protected PrepareFlushResult internalPrepareFlushCache(WAL wal, long myseqid, 2739 Collection<HStore> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker, 2740 FlushLifeCycleTracker tracker) throws IOException { 2741 if (this.rsServices != null && this.rsServices.isAborted()) { 2742 // Don't flush when server aborting, it's unsafe 2743 throw new IOException("Aborting flush because server is aborted..."); 2744 } 2745 final long startTime = EnvironmentEdgeManager.currentTime(); 2746 // If nothing to flush, return, but return with a valid unused sequenceId. 2747 // Its needed by bulk upload IIRC. It flushes until no edits in memory so it can insert a 2748 // bulk loaded file between memory and existing hfiles. It wants a good seqeunceId that belongs 2749 // to no other that it can use to associate with the bulk load. Hence this little dance below 2750 // to go get one. 2751 if (this.memStoreSizing.getDataSize() <= 0) { 2752 // Take an update lock so no edits can come into memory just yet. 2753 this.updatesLock.writeLock().lock(); 2754 WriteEntry writeEntry = null; 2755 try { 2756 if (this.memStoreSizing.getDataSize() <= 0) { 2757 // Presume that if there are still no edits in the memstore, then there are no edits for 2758 // this region out in the WAL subsystem so no need to do any trickery clearing out 2759 // edits in the WAL sub-system. Up the sequence number so the resulting flush id is for 2760 // sure just beyond the last appended region edit and not associated with any edit 2761 // (useful as marker when bulk loading, etc.). 2762 if (wal != null) { 2763 writeEntry = mvcc.begin(); 2764 long flushOpSeqId = writeEntry.getWriteNumber(); 2765 FlushResultImpl flushResult = new FlushResultImpl( 2766 FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, flushOpSeqId, "Nothing to flush", 2767 writeCanNotFlushMarkerToWAL(writeEntry, wal, writeFlushWalMarker)); 2768 mvcc.completeAndWait(writeEntry); 2769 // Set to null so we don't complete it again down in finally block. 2770 writeEntry = null; 2771 return new PrepareFlushResult(flushResult, myseqid); 2772 } else { 2773 return new PrepareFlushResult(new FlushResultImpl( 2774 FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, "Nothing to flush", false), myseqid); 2775 } 2776 } 2777 } finally { 2778 if (writeEntry != null) { 2779 // If writeEntry is non-null, this operation failed; the mvcc transaction failed... 2780 // but complete it anyways so it doesn't block the mvcc queue. 2781 mvcc.complete(writeEntry); 2782 } 2783 this.updatesLock.writeLock().unlock(); 2784 } 2785 } 2786 logFatLineOnFlush(storesToFlush, myseqid); 2787 // Stop updates while we snapshot the memstore of all of these regions' stores. We only have 2788 // to do this for a moment. It is quick. We also set the memstore size to zero here before we 2789 // allow updates again so its value will represent the size of the updates received 2790 // during flush 2791 2792 // We have to take an update lock during snapshot, or else a write could end up in both snapshot 2793 // and memstore (makes it difficult to do atomic rows then) 2794 status.setStatus("Obtaining lock to block concurrent updates"); 2795 // block waiting for the lock for internal flush 2796 this.updatesLock.writeLock().lock(); 2797 status.setStatus("Preparing flush snapshotting stores in " + getRegionInfo().getEncodedName()); 2798 MemStoreSizing totalSizeOfFlushableStores = new NonThreadSafeMemStoreSizing(); 2799 2800 Map<byte[], Long> flushedFamilyNamesToSeq = new HashMap<>(); 2801 for (HStore store : storesToFlush) { 2802 flushedFamilyNamesToSeq.put(store.getColumnFamilyDescriptor().getName(), 2803 store.preFlushSeqIDEstimation()); 2804 } 2805 2806 TreeMap<byte[], StoreFlushContext> storeFlushCtxs = new TreeMap<>(Bytes.BYTES_COMPARATOR); 2807 TreeMap<byte[], List<Path>> committedFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR); 2808 TreeMap<byte[], MemStoreSize> storeFlushableSize = new TreeMap<>(Bytes.BYTES_COMPARATOR); 2809 // The sequence id of this flush operation which is used to log FlushMarker and pass to 2810 // createFlushContext to use as the store file's sequence id. It can be in advance of edits 2811 // still in the memstore, edits that are in other column families yet to be flushed. 2812 long flushOpSeqId = HConstants.NO_SEQNUM; 2813 // The max flushed sequence id after this flush operation completes. All edits in memstore 2814 // will be in advance of this sequence id. 2815 long flushedSeqId = HConstants.NO_SEQNUM; 2816 byte[] encodedRegionName = getRegionInfo().getEncodedNameAsBytes(); 2817 try { 2818 if (wal != null) { 2819 Long earliestUnflushedSequenceIdForTheRegion = 2820 wal.startCacheFlush(encodedRegionName, flushedFamilyNamesToSeq); 2821 if (earliestUnflushedSequenceIdForTheRegion == null) { 2822 // This should never happen. This is how startCacheFlush signals flush cannot proceed. 2823 String msg = this.getRegionInfo().getEncodedName() + " flush aborted; WAL closing."; 2824 status.setStatus(msg); 2825 return new PrepareFlushResult( 2826 new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false), myseqid); 2827 } 2828 flushOpSeqId = getNextSequenceId(wal); 2829 // Back up 1, minus 1 from oldest sequence id in memstore to get last 'flushed' edit 2830 flushedSeqId = earliestUnflushedSequenceIdForTheRegion.longValue() == HConstants.NO_SEQNUM 2831 ? flushOpSeqId 2832 : earliestUnflushedSequenceIdForTheRegion.longValue() - 1; 2833 } else { 2834 // use the provided sequence Id as WAL is not being used for this flush. 2835 flushedSeqId = flushOpSeqId = myseqid; 2836 } 2837 2838 for (HStore s : storesToFlush) { 2839 storeFlushCtxs.put(s.getColumnFamilyDescriptor().getName(), 2840 s.createFlushContext(flushOpSeqId, tracker)); 2841 // for writing stores to WAL 2842 committedFiles.put(s.getColumnFamilyDescriptor().getName(), null); 2843 } 2844 2845 // write the snapshot start to WAL 2846 if (wal != null && !writestate.readOnly) { 2847 FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.START_FLUSH, 2848 getRegionInfo(), flushOpSeqId, committedFiles); 2849 // No sync. Sync is below where no updates lock and we do FlushAction.COMMIT_FLUSH 2850 WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, false, 2851 mvcc, regionReplicationSink.orElse(null)); 2852 } 2853 2854 // Prepare flush (take a snapshot) 2855 storeFlushCtxs.forEach((name, flush) -> { 2856 MemStoreSize snapshotSize = flush.prepare(); 2857 totalSizeOfFlushableStores.incMemStoreSize(snapshotSize); 2858 storeFlushableSize.put(name, snapshotSize); 2859 }); 2860 } catch (IOException ex) { 2861 doAbortFlushToWAL(wal, flushOpSeqId, committedFiles); 2862 throw ex; 2863 } finally { 2864 this.updatesLock.writeLock().unlock(); 2865 } 2866 String s = "Finished memstore snapshotting " + this + ", syncing WAL and waiting on mvcc, " 2867 + "flushsize=" + totalSizeOfFlushableStores; 2868 status.setStatus(s); 2869 doSyncOfUnflushedWALChanges(wal, getRegionInfo()); 2870 return new PrepareFlushResult(storeFlushCtxs, committedFiles, storeFlushableSize, startTime, 2871 flushOpSeqId, flushedSeqId, totalSizeOfFlushableStores); 2872 } 2873 2874 /** 2875 * Utility method broken out of internalPrepareFlushCache so that method is smaller. 2876 */ 2877 private void logFatLineOnFlush(Collection<HStore> storesToFlush, long sequenceId) { 2878 if (!LOG.isInfoEnabled()) { 2879 return; 2880 } 2881 // Log a fat line detailing what is being flushed. 2882 StringBuilder perCfExtras = null; 2883 if (!isAllFamilies(storesToFlush)) { 2884 perCfExtras = new StringBuilder(); 2885 for (HStore store : storesToFlush) { 2886 MemStoreSize mss = store.getFlushableSize(); 2887 perCfExtras.append("; ").append(store.getColumnFamilyName()); 2888 perCfExtras.append("={dataSize=").append(StringUtils.byteDesc(mss.getDataSize())); 2889 perCfExtras.append(", heapSize=").append(StringUtils.byteDesc(mss.getHeapSize())); 2890 perCfExtras.append(", offHeapSize=").append(StringUtils.byteDesc(mss.getOffHeapSize())); 2891 perCfExtras.append("}"); 2892 } 2893 } 2894 MemStoreSize mss = this.memStoreSizing.getMemStoreSize(); 2895 LOG.info("Flushing " + this.getRegionInfo().getEncodedName() + " " + storesToFlush.size() + "/" 2896 + stores.size() + " column families," + " dataSize=" + StringUtils.byteDesc(mss.getDataSize()) 2897 + " heapSize=" + StringUtils.byteDesc(mss.getHeapSize()) 2898 + ((perCfExtras != null && perCfExtras.length() > 0) ? perCfExtras.toString() : "") 2899 + ((wal != null) ? "" : "; WAL is null, using passed sequenceid=" + sequenceId)); 2900 } 2901 2902 private void doAbortFlushToWAL(final WAL wal, final long flushOpSeqId, 2903 final Map<byte[], List<Path>> committedFiles) { 2904 if (wal == null) return; 2905 try { 2906 FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH, 2907 getRegionInfo(), flushOpSeqId, committedFiles); 2908 WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, false, mvcc, 2909 null); 2910 } catch (Throwable t) { 2911 LOG.warn("Received unexpected exception trying to write ABORT_FLUSH marker to WAL: {} in " 2912 + " region {}", StringUtils.stringifyException(t), this); 2913 // ignore this since we will be aborting the RS with DSE. 2914 } 2915 // we have called wal.startCacheFlush(), now we have to abort it 2916 wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes()); 2917 } 2918 2919 /** 2920 * Sync unflushed WAL changes. See HBASE-8208 for details 2921 */ 2922 private static void doSyncOfUnflushedWALChanges(final WAL wal, final RegionInfo hri) 2923 throws IOException { 2924 if (wal == null) { 2925 return; 2926 } 2927 try { 2928 wal.sync(); // ensure that flush marker is sync'ed 2929 } catch (IOException ioe) { 2930 wal.abortCacheFlush(hri.getEncodedNameAsBytes()); 2931 throw ioe; 2932 } 2933 } 2934 2935 /** Returns True if passed Set is all families in the region. */ 2936 private boolean isAllFamilies(Collection<HStore> families) { 2937 return families == null || this.stores.size() == families.size(); 2938 } 2939 2940 /** 2941 * This method is only used when we flush but the memstore is empty,if writeFlushWalMarker is 2942 * true,we write the {@link FlushAction#CANNOT_FLUSH} flush marker to WAL when the memstore is 2943 * empty. Ignores exceptions from WAL. Returns whether the write succeeded. 2944 * @return whether WAL write was successful 2945 */ 2946 private boolean writeCanNotFlushMarkerToWAL(WriteEntry flushOpSeqIdMVCCEntry, WAL wal, 2947 boolean writeFlushWalMarker) { 2948 FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.CANNOT_FLUSH, getRegionInfo(), 2949 -1, new TreeMap<>(Bytes.BYTES_COMPARATOR)); 2950 RegionReplicationSink sink = regionReplicationSink.orElse(null); 2951 2952 if (sink != null && !writeFlushWalMarker) { 2953 /** 2954 * Here for replication to secondary region replica could use {@link FlushAction#CANNOT_FLUSH} 2955 * to recover when writeFlushWalMarker is false, we create {@link WALEdit} for 2956 * {@link FlushDescriptor} and attach the {@link RegionReplicationSink#add} to the 2957 * flushOpSeqIdMVCCEntry,see HBASE-26960 for more details. 2958 */ 2959 this.attachRegionReplicationToFlushOpSeqIdMVCCEntry(flushOpSeqIdMVCCEntry, desc, sink); 2960 return false; 2961 } 2962 2963 if (writeFlushWalMarker && wal != null && !writestate.readOnly) { 2964 try { 2965 WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, true, mvcc, 2966 sink); 2967 return true; 2968 } catch (IOException e) { 2969 LOG.warn(getRegionInfo().getEncodedName() + " : " 2970 + "Received exception while trying to write the flush request to wal", e); 2971 } 2972 } 2973 return false; 2974 } 2975 2976 /** 2977 * Create {@link WALEdit} for {@link FlushDescriptor} and attach {@link RegionReplicationSink#add} 2978 * to the flushOpSeqIdMVCCEntry. 2979 */ 2980 private void attachRegionReplicationToFlushOpSeqIdMVCCEntry(WriteEntry flushOpSeqIdMVCCEntry, 2981 FlushDescriptor desc, RegionReplicationSink sink) { 2982 assert !flushOpSeqIdMVCCEntry.getCompletionAction().isPresent(); 2983 WALEdit flushMarkerWALEdit = WALEdit.createFlushWALEdit(getRegionInfo(), desc); 2984 WALKeyImpl walKey = 2985 WALUtil.createWALKey(getRegionInfo(), mvcc, this.getReplicationScope(), null); 2986 walKey.setWriteEntry(flushOpSeqIdMVCCEntry); 2987 /** 2988 * Here the {@link ServerCall} is null for {@link RegionReplicationSink#add} because the 2989 * flushMarkerWALEdit is created by ourselves, not from rpc. 2990 */ 2991 flushOpSeqIdMVCCEntry.attachCompletionAction(() -> sink.add(walKey, flushMarkerWALEdit, null)); 2992 } 2993 2994 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY", 2995 justification = "Intentional; notify is about completed flush") 2996 FlushResultImpl internalFlushCacheAndCommit(WAL wal, MonitoredTask status, 2997 PrepareFlushResult prepareResult, Collection<HStore> storesToFlush) throws IOException { 2998 // prepare flush context is carried via PrepareFlushResult 2999 TreeMap<byte[], StoreFlushContext> storeFlushCtxs = prepareResult.storeFlushCtxs; 3000 TreeMap<byte[], List<Path>> committedFiles = prepareResult.committedFiles; 3001 long startTime = prepareResult.startTime; 3002 long flushOpSeqId = prepareResult.flushOpSeqId; 3003 long flushedSeqId = prepareResult.flushedSeqId; 3004 3005 String s = "Flushing stores of " + this; 3006 status.setStatus(s); 3007 if (LOG.isTraceEnabled()) LOG.trace(s); 3008 3009 // Any failure from here on out will be catastrophic requiring server 3010 // restart so wal content can be replayed and put back into the memstore. 3011 // Otherwise, the snapshot content while backed up in the wal, it will not 3012 // be part of the current running servers state. 3013 boolean compactionRequested = false; 3014 long flushedOutputFileSize = 0; 3015 try { 3016 // A. Flush memstore to all the HStores. 3017 // Keep running vector of all store files that includes both old and the 3018 // just-made new flush store file. The new flushed file is still in the 3019 // tmp directory. 3020 3021 for (StoreFlushContext flush : storeFlushCtxs.values()) { 3022 flush.flushCache(status); 3023 } 3024 3025 // Switch snapshot (in memstore) -> new hfile (thus causing 3026 // all the store scanners to reset/reseek). 3027 for (Map.Entry<byte[], StoreFlushContext> flushEntry : storeFlushCtxs.entrySet()) { 3028 StoreFlushContext sfc = flushEntry.getValue(); 3029 boolean needsCompaction = sfc.commit(status); 3030 if (needsCompaction) { 3031 compactionRequested = true; 3032 } 3033 byte[] storeName = flushEntry.getKey(); 3034 List<Path> storeCommittedFiles = sfc.getCommittedFiles(); 3035 committedFiles.put(storeName, storeCommittedFiles); 3036 // Flush committed no files, indicating flush is empty or flush was canceled 3037 if (storeCommittedFiles == null || storeCommittedFiles.isEmpty()) { 3038 MemStoreSize storeFlushableSize = prepareResult.storeFlushableSize.get(storeName); 3039 prepareResult.totalFlushableSize.decMemStoreSize(storeFlushableSize); 3040 } 3041 flushedOutputFileSize += sfc.getOutputFileSize(); 3042 } 3043 storeFlushCtxs.clear(); 3044 3045 // Set down the memstore size by amount of flush. 3046 MemStoreSize mss = prepareResult.totalFlushableSize.getMemStoreSize(); 3047 this.decrMemStoreSize(mss); 3048 3049 // Increase the size of this Region for the purposes of quota. Noop if quotas are disabled. 3050 // During startup, quota manager may not be initialized yet. 3051 if (rsServices != null) { 3052 RegionServerSpaceQuotaManager quotaManager = rsServices.getRegionServerSpaceQuotaManager(); 3053 if (quotaManager != null) { 3054 quotaManager.getRegionSizeStore().incrementRegionSize(this.getRegionInfo(), 3055 flushedOutputFileSize); 3056 } 3057 } 3058 3059 if (wal != null) { 3060 // write flush marker to WAL. If fail, we should throw DroppedSnapshotException 3061 FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.COMMIT_FLUSH, 3062 getRegionInfo(), flushOpSeqId, committedFiles); 3063 WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, true, mvcc, 3064 regionReplicationSink.orElse(null)); 3065 } 3066 } catch (Throwable t) { 3067 // An exception here means that the snapshot was not persisted. 3068 // The wal needs to be replayed so its content is restored to memstore. 3069 // Currently, only a server restart will do this. 3070 // We used to only catch IOEs but its possible that we'd get other 3071 // exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch 3072 // all and sundry. 3073 if (wal != null) { 3074 try { 3075 FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH, 3076 getRegionInfo(), flushOpSeqId, committedFiles); 3077 WALUtil.writeFlushMarker(wal, this.replicationScope, getRegionInfo(), desc, false, mvcc, 3078 null); 3079 } catch (Throwable ex) { 3080 LOG.warn( 3081 getRegionInfo().getEncodedName() + " : " + "failed writing ABORT_FLUSH marker to WAL", 3082 ex); 3083 // ignore this since we will be aborting the RS with DSE. 3084 } 3085 wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes()); 3086 } 3087 DroppedSnapshotException dse = new DroppedSnapshotException( 3088 "region: " + Bytes.toStringBinary(getRegionInfo().getRegionName()), t); 3089 status.abort("Flush failed: " + StringUtils.stringifyException(t)); 3090 3091 // Callers for flushcache() should catch DroppedSnapshotException and abort the region server. 3092 // However, since we may have the region read lock, we cannot call close(true) here since 3093 // we cannot promote to a write lock. Instead we are setting closing so that all other region 3094 // operations except for close will be rejected. 3095 this.closing.set(true); 3096 3097 if (rsServices != null) { 3098 // This is a safeguard against the case where the caller fails to explicitly handle aborting 3099 rsServices.abort("Replay of WAL required. Forcing server shutdown", dse); 3100 } 3101 3102 throw dse; 3103 } 3104 3105 // If we get to here, the HStores have been written. 3106 if (wal != null) { 3107 wal.completeCacheFlush(this.getRegionInfo().getEncodedNameAsBytes(), flushedSeqId); 3108 } 3109 3110 // Record latest flush time 3111 for (HStore store : storesToFlush) { 3112 this.lastStoreFlushTimeMap.put(store, startTime); 3113 } 3114 3115 this.maxFlushedSeqId = flushedSeqId; 3116 this.lastFlushOpSeqId = flushOpSeqId; 3117 3118 // C. Finally notify anyone waiting on memstore to clear: 3119 // e.g. checkResources(). 3120 synchronized (this) { 3121 notifyAll(); // FindBugs NN_NAKED_NOTIFY 3122 } 3123 3124 long time = EnvironmentEdgeManager.currentTime() - startTime; 3125 MemStoreSize mss = prepareResult.totalFlushableSize.getMemStoreSize(); 3126 long memstoresize = this.memStoreSizing.getMemStoreSize().getDataSize(); 3127 String msg = "Finished flush of" + " dataSize ~" + StringUtils.byteDesc(mss.getDataSize()) + "/" 3128 + mss.getDataSize() + ", heapSize ~" + StringUtils.byteDesc(mss.getHeapSize()) + "/" 3129 + mss.getHeapSize() + ", currentSize=" + StringUtils.byteDesc(memstoresize) + "/" 3130 + memstoresize + " for " + this.getRegionInfo().getEncodedName() + " in " + time 3131 + "ms, sequenceid=" + flushOpSeqId + ", compaction requested=" + compactionRequested 3132 + ((wal == null) ? "; wal=null" : ""); 3133 LOG.info(msg); 3134 status.setStatus(msg); 3135 3136 if (rsServices != null && rsServices.getMetrics() != null) { 3137 rsServices.getMetrics().updateFlush(getTableDescriptor().getTableName().getNameAsString(), 3138 time, mss.getDataSize(), flushedOutputFileSize); 3139 } 3140 3141 return new FlushResultImpl(compactionRequested 3142 ? FlushResult.Result.FLUSHED_COMPACTION_NEEDED 3143 : FlushResult.Result.FLUSHED_NO_COMPACTION_NEEDED, flushOpSeqId); 3144 } 3145 3146 /** 3147 * Method to safely get the next sequence number. 3148 * @return Next sequence number unassociated with any actual edit. 3149 */ 3150 protected long getNextSequenceId(final WAL wal) throws IOException { 3151 WriteEntry we = mvcc.begin(); 3152 mvcc.completeAndWait(we); 3153 return we.getWriteNumber(); 3154 } 3155 3156 ////////////////////////////////////////////////////////////////////////////// 3157 // get() methods for client use. 3158 ////////////////////////////////////////////////////////////////////////////// 3159 3160 @Override 3161 public RegionScannerImpl getScanner(Scan scan) throws IOException { 3162 return getScanner(scan, null); 3163 } 3164 3165 @Override 3166 public RegionScannerImpl getScanner(Scan scan, List<KeyValueScanner> additionalScanners) 3167 throws IOException { 3168 return getScanner(scan, additionalScanners, HConstants.NO_NONCE, HConstants.NO_NONCE); 3169 } 3170 3171 private RegionScannerImpl getScanner(Scan scan, List<KeyValueScanner> additionalScanners, 3172 long nonceGroup, long nonce) throws IOException { 3173 return TraceUtil.trace(() -> { 3174 startRegionOperation(Operation.SCAN); 3175 try { 3176 // Verify families are all valid 3177 if (!scan.hasFamilies()) { 3178 // Adding all families to scanner 3179 for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) { 3180 scan.addFamily(family); 3181 } 3182 } else { 3183 for (byte[] family : scan.getFamilyMap().keySet()) { 3184 checkFamily(family); 3185 } 3186 } 3187 return instantiateRegionScanner(scan, additionalScanners, nonceGroup, nonce); 3188 } finally { 3189 closeRegionOperation(Operation.SCAN); 3190 } 3191 }, () -> createRegionSpan("Region.getScanner")); 3192 } 3193 3194 protected RegionScannerImpl instantiateRegionScanner(Scan scan, 3195 List<KeyValueScanner> additionalScanners, long nonceGroup, long nonce) throws IOException { 3196 if (scan.isReversed()) { 3197 if (scan.getFilter() != null) { 3198 scan.getFilter().setReversed(true); 3199 } 3200 return new ReversedRegionScannerImpl(scan, additionalScanners, this, nonceGroup, nonce); 3201 } 3202 return new RegionScannerImpl(scan, additionalScanners, this, nonceGroup, nonce); 3203 } 3204 3205 /** 3206 * Prepare a delete for a row mutation processor 3207 * @param delete The passed delete is modified by this method. WARNING! 3208 */ 3209 private void prepareDelete(Delete delete) throws IOException { 3210 // Check to see if this is a deleteRow insert 3211 if (delete.getFamilyCellMap().isEmpty()) { 3212 for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) { 3213 // Don't eat the timestamp 3214 delete.addFamily(family, delete.getTimestamp()); 3215 } 3216 } else { 3217 for (byte[] family : delete.getFamilyCellMap().keySet()) { 3218 if (family == null) { 3219 throw new NoSuchColumnFamilyException("Empty family is invalid"); 3220 } 3221 checkFamily(family, delete.getDurability()); 3222 } 3223 } 3224 } 3225 3226 @Override 3227 public void delete(Delete delete) throws IOException { 3228 TraceUtil.trace(() -> { 3229 checkReadOnly(); 3230 checkResources(); 3231 startRegionOperation(Operation.DELETE); 3232 try { 3233 // All edits for the given row (across all column families) must happen atomically. 3234 return mutate(delete); 3235 } finally { 3236 closeRegionOperation(Operation.DELETE); 3237 } 3238 }, () -> createRegionSpan("Region.delete")); 3239 } 3240 3241 /** 3242 * Set up correct timestamps in the KVs in Delete object. 3243 * <p/> 3244 * Caller should have the row and region locks. 3245 */ 3246 private void prepareDeleteTimestamps(Mutation mutation, Map<byte[], List<Cell>> familyMap, 3247 byte[] byteNow) throws IOException { 3248 for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) { 3249 3250 byte[] family = e.getKey(); 3251 List<Cell> cells = e.getValue(); 3252 assert cells instanceof RandomAccess; 3253 3254 Map<byte[], Integer> kvCount = new TreeMap<>(Bytes.BYTES_COMPARATOR); 3255 int listSize = cells.size(); 3256 for (int i = 0; i < listSize; i++) { 3257 Cell cell = cells.get(i); 3258 // Check if time is LATEST, change to time of most recent addition if so 3259 // This is expensive. 3260 if ( 3261 cell.getTimestamp() == HConstants.LATEST_TIMESTAMP && PrivateCellUtil.isDeleteType(cell) 3262 ) { 3263 byte[] qual = CellUtil.cloneQualifier(cell); 3264 3265 Integer count = kvCount.get(qual); 3266 if (count == null) { 3267 kvCount.put(qual, 1); 3268 } else { 3269 kvCount.put(qual, count + 1); 3270 } 3271 count = kvCount.get(qual); 3272 3273 Get get = new Get(CellUtil.cloneRow(cell)); 3274 get.readVersions(count); 3275 get.addColumn(family, qual); 3276 if (coprocessorHost != null) { 3277 if ( 3278 !coprocessorHost.prePrepareTimeStampForDeleteVersion(mutation, cell, byteNow, get) 3279 ) { 3280 updateDeleteLatestVersionTimestamp(cell, get, count, byteNow); 3281 } 3282 } else { 3283 updateDeleteLatestVersionTimestamp(cell, get, count, byteNow); 3284 } 3285 } else { 3286 PrivateCellUtil.updateLatestStamp(cell, byteNow); 3287 } 3288 } 3289 } 3290 } 3291 3292 private void updateDeleteLatestVersionTimestamp(Cell cell, Get get, int count, byte[] byteNow) 3293 throws IOException { 3294 try (RegionScanner scanner = getScanner(new Scan(get))) { 3295 // NOTE: Please don't use HRegion.get() instead, 3296 // because it will copy cells to heap. See HBASE-26036 3297 List<Cell> result = new ArrayList<>(); 3298 scanner.next(result); 3299 3300 if (result.size() < count) { 3301 // Nothing to delete 3302 PrivateCellUtil.updateLatestStamp(cell, byteNow); 3303 return; 3304 } 3305 if (result.size() > count) { 3306 throw new RuntimeException("Unexpected size: " + result.size()); 3307 } 3308 Cell getCell = result.get(count - 1); 3309 PrivateCellUtil.setTimestamp(cell, getCell.getTimestamp()); 3310 } 3311 } 3312 3313 @Override 3314 public void put(Put put) throws IOException { 3315 TraceUtil.trace(() -> { 3316 checkReadOnly(); 3317 3318 // Do a rough check that we have resources to accept a write. The check is 3319 // 'rough' in that between the resource check and the call to obtain a 3320 // read lock, resources may run out. For now, the thought is that this 3321 // will be extremely rare; we'll deal with it when it happens. 3322 checkResources(); 3323 startRegionOperation(Operation.PUT); 3324 try { 3325 // All edits for the given row (across all column families) must happen atomically. 3326 return mutate(put); 3327 } finally { 3328 closeRegionOperation(Operation.PUT); 3329 } 3330 }, () -> createRegionSpan("Region.put")); 3331 } 3332 3333 /** 3334 * Class that tracks the progress of a batch operations, accumulating status codes and tracking 3335 * the index at which processing is proceeding. These batch operations may get split into 3336 * mini-batches for processing. 3337 */ 3338 private abstract static class BatchOperation<T> { 3339 protected final T[] operations; 3340 protected final OperationStatus[] retCodeDetails; 3341 protected final WALEdit[] walEditsFromCoprocessors; 3342 // reference family cell maps directly so coprocessors can mutate them if desired 3343 protected final Map<byte[], List<Cell>>[] familyCellMaps; 3344 // For Increment/Append operations 3345 protected final Result[] results; 3346 3347 protected final HRegion region; 3348 protected int nextIndexToProcess = 0; 3349 protected final ObservedExceptionsInBatch observedExceptions; 3350 // Durability of the batch (highest durability of all operations) 3351 protected Durability durability; 3352 protected boolean atomic = false; 3353 3354 public BatchOperation(final HRegion region, T[] operations) { 3355 this.operations = operations; 3356 this.retCodeDetails = new OperationStatus[operations.length]; 3357 Arrays.fill(this.retCodeDetails, OperationStatus.NOT_RUN); 3358 this.walEditsFromCoprocessors = new WALEdit[operations.length]; 3359 familyCellMaps = new Map[operations.length]; 3360 this.results = new Result[operations.length]; 3361 3362 this.region = region; 3363 observedExceptions = new ObservedExceptionsInBatch(); 3364 durability = Durability.USE_DEFAULT; 3365 } 3366 3367 /** 3368 * Visitor interface for batch operations 3369 */ 3370 @FunctionalInterface 3371 interface Visitor { 3372 /** 3373 * @param index operation index 3374 * @return If true continue visiting remaining entries, break otherwise 3375 */ 3376 boolean visit(int index) throws IOException; 3377 } 3378 3379 /** 3380 * Helper method for visiting pending/ all batch operations 3381 */ 3382 public void visitBatchOperations(boolean pendingOnly, int lastIndexExclusive, Visitor visitor) 3383 throws IOException { 3384 assert lastIndexExclusive <= this.size(); 3385 for (int i = nextIndexToProcess; i < lastIndexExclusive; i++) { 3386 if (!pendingOnly || isOperationPending(i)) { 3387 if (!visitor.visit(i)) { 3388 break; 3389 } 3390 } 3391 } 3392 } 3393 3394 public abstract Mutation getMutation(int index); 3395 3396 public abstract long getNonceGroup(int index); 3397 3398 public abstract long getNonce(int index); 3399 3400 /** 3401 * This method is potentially expensive and useful mostly for non-replay CP path. 3402 */ 3403 public abstract Mutation[] getMutationsForCoprocs(); 3404 3405 public abstract boolean isInReplay(); 3406 3407 public abstract long getOrigLogSeqNum(); 3408 3409 public abstract void startRegionOperation() throws IOException; 3410 3411 public abstract void closeRegionOperation() throws IOException; 3412 3413 /** 3414 * Validates each mutation and prepares a batch for write. If necessary (non-replay case), runs 3415 * CP prePut()/preDelete()/preIncrement()/preAppend() hooks for all mutations in a batch. This 3416 * is intended to operate on entire batch and will be called from outside of class to check and 3417 * prepare batch. This can be implemented by calling helper method 3418 * {@link #checkAndPrepareMutation(int, long)} in a 'for' loop over mutations. 3419 */ 3420 public abstract void checkAndPrepare() throws IOException; 3421 3422 /** 3423 * Implement any Put request specific check and prepare logic here. Please refer to 3424 * {@link #checkAndPrepareMutation(Mutation, long)} for how its used. 3425 */ 3426 protected abstract void checkAndPreparePut(final Put p) throws IOException; 3427 3428 /** 3429 * If necessary, calls preBatchMutate() CP hook for a mini-batch and updates metrics, cell 3430 * count, tags and timestamp for all cells of all operations in a mini-batch. 3431 */ 3432 public abstract void prepareMiniBatchOperations( 3433 MiniBatchOperationInProgress<Mutation> miniBatchOp, long timestamp, 3434 final List<RowLock> acquiredRowLocks) throws IOException; 3435 3436 /** 3437 * Write mini-batch operations to MemStore 3438 */ 3439 public abstract WriteEntry writeMiniBatchOperationsToMemStore( 3440 final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry, 3441 long now) throws IOException; 3442 3443 protected void writeMiniBatchOperationsToMemStore( 3444 final MiniBatchOperationInProgress<Mutation> miniBatchOp, final long writeNumber) 3445 throws IOException { 3446 MemStoreSizing memStoreAccounting = new NonThreadSafeMemStoreSizing(); 3447 visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> { 3448 // We need to update the sequence id for following reasons. 3449 // 1) If the op is in replay mode, FSWALEntry#stampRegionSequenceId won't stamp sequence id. 3450 // 2) If no WAL, FSWALEntry won't be used 3451 // we use durability of the original mutation for the mutation passed by CP. 3452 if (isInReplay() || getMutation(index).getDurability() == Durability.SKIP_WAL) { 3453 region.updateSequenceId(familyCellMaps[index].values(), writeNumber); 3454 } 3455 applyFamilyMapToMemStore(familyCellMaps[index], memStoreAccounting); 3456 return true; 3457 }); 3458 // update memStore size 3459 region.incMemStoreSize(memStoreAccounting.getDataSize(), memStoreAccounting.getHeapSize(), 3460 memStoreAccounting.getOffHeapSize(), memStoreAccounting.getCellsCount()); 3461 } 3462 3463 public boolean isDone() { 3464 return nextIndexToProcess == operations.length; 3465 } 3466 3467 public int size() { 3468 return operations.length; 3469 } 3470 3471 public boolean isOperationPending(int index) { 3472 return retCodeDetails[index].getOperationStatusCode() == OperationStatusCode.NOT_RUN; 3473 } 3474 3475 public List<UUID> getClusterIds() { 3476 assert size() != 0; 3477 return getMutation(0).getClusterIds(); 3478 } 3479 3480 boolean isAtomic() { 3481 return atomic; 3482 } 3483 3484 /** 3485 * Helper method that checks and prepares only one mutation. This can be used to implement 3486 * {@link #checkAndPrepare()} for entire Batch. NOTE: As CP 3487 * prePut()/preDelete()/preIncrement()/preAppend() hooks may modify mutations, this method 3488 * should be called after prePut()/preDelete()/preIncrement()/preAppend() CP hooks are run for 3489 * the mutation 3490 */ 3491 protected void checkAndPrepareMutation(Mutation mutation, final long timestamp) 3492 throws IOException { 3493 region.checkRow(mutation.getRow(), "batchMutate"); 3494 if (mutation instanceof Put) { 3495 // Check the families in the put. If bad, skip this one. 3496 checkAndPreparePut((Put) mutation); 3497 region.checkTimestamps(mutation.getFamilyCellMap(), timestamp); 3498 } else if (mutation instanceof Delete) { 3499 region.prepareDelete((Delete) mutation); 3500 } else if (mutation instanceof Increment || mutation instanceof Append) { 3501 region.checkFamilies(mutation.getFamilyCellMap().keySet(), mutation.getDurability()); 3502 } 3503 } 3504 3505 protected void checkAndPrepareMutation(int index, long timestamp) throws IOException { 3506 Mutation mutation = getMutation(index); 3507 try { 3508 this.checkAndPrepareMutation(mutation, timestamp); 3509 3510 if (mutation instanceof Put || mutation instanceof Delete) { 3511 // store the family map reference to allow for mutations 3512 familyCellMaps[index] = mutation.getFamilyCellMap(); 3513 } 3514 3515 // store durability for the batch (highest durability of all operations in the batch) 3516 Durability tmpDur = region.getEffectiveDurability(mutation.getDurability()); 3517 if (tmpDur.ordinal() > durability.ordinal()) { 3518 durability = tmpDur; 3519 } 3520 } catch (NoSuchColumnFamilyException nscfe) { 3521 final String msg = "No such column family in batch mutation in region " + this; 3522 if (observedExceptions.hasSeenNoSuchFamily()) { 3523 LOG.warn(msg + nscfe.getMessage()); 3524 } else { 3525 LOG.warn(msg, nscfe); 3526 observedExceptions.sawNoSuchFamily(); 3527 } 3528 retCodeDetails[index] = 3529 new OperationStatus(OperationStatusCode.BAD_FAMILY, nscfe.getMessage()); 3530 if (isAtomic()) { // fail, atomic means all or none 3531 throw nscfe; 3532 } 3533 } catch (FailedSanityCheckException fsce) { 3534 final String msg = "Batch Mutation did not pass sanity check in region " + this; 3535 if (observedExceptions.hasSeenFailedSanityCheck()) { 3536 LOG.warn(msg + fsce.getMessage()); 3537 } else { 3538 LOG.warn(msg, fsce); 3539 observedExceptions.sawFailedSanityCheck(); 3540 } 3541 retCodeDetails[index] = 3542 new OperationStatus(OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage()); 3543 if (isAtomic()) { 3544 throw fsce; 3545 } 3546 } catch (WrongRegionException we) { 3547 final String msg = "Batch mutation had a row that does not belong to this region " + this; 3548 if (observedExceptions.hasSeenWrongRegion()) { 3549 LOG.warn(msg + we.getMessage()); 3550 } else { 3551 LOG.warn(msg, we); 3552 observedExceptions.sawWrongRegion(); 3553 } 3554 retCodeDetails[index] = 3555 new OperationStatus(OperationStatusCode.SANITY_CHECK_FAILURE, we.getMessage()); 3556 if (isAtomic()) { 3557 throw we; 3558 } 3559 } 3560 } 3561 3562 /** 3563 * Creates Mini-batch of all operations [nextIndexToProcess, lastIndexExclusive) for which a row 3564 * lock can be acquired. All mutations with locked rows are considered to be In-progress 3565 * operations and hence the name {@link MiniBatchOperationInProgress}. Mini batch is window over 3566 * {@link BatchOperation} and contains contiguous pending operations. 3567 * @param acquiredRowLocks keeps track of rowLocks acquired. 3568 */ 3569 public MiniBatchOperationInProgress<Mutation> 3570 lockRowsAndBuildMiniBatch(List<RowLock> acquiredRowLocks) throws IOException { 3571 int readyToWriteCount = 0; 3572 int lastIndexExclusive = 0; 3573 RowLock prevRowLock = null; 3574 for (; lastIndexExclusive < size(); lastIndexExclusive++) { 3575 // It reaches the miniBatchSize, stop here and process the miniBatch 3576 // This only applies to non-atomic batch operations. 3577 if (!isAtomic() && (readyToWriteCount == region.miniBatchSize)) { 3578 break; 3579 } 3580 3581 if (!isOperationPending(lastIndexExclusive)) { 3582 continue; 3583 } 3584 3585 // HBASE-19389 Limit concurrency of put with dense (hundreds) columns to avoid exhausting 3586 // RS handlers, covering both MutationBatchOperation and ReplayBatchOperation 3587 // The BAD_FAMILY/SANITY_CHECK_FAILURE cases are handled in checkAndPrepare phase and won't 3588 // pass the isOperationPending check 3589 Map<byte[], List<Cell>> curFamilyCellMap = 3590 getMutation(lastIndexExclusive).getFamilyCellMap(); 3591 try { 3592 // start the protector before acquiring row lock considering performance, and will finish 3593 // it when encountering exception 3594 region.storeHotnessProtector.start(curFamilyCellMap); 3595 } catch (RegionTooBusyException rtbe) { 3596 region.storeHotnessProtector.finish(curFamilyCellMap); 3597 if (isAtomic()) { 3598 throw rtbe; 3599 } 3600 retCodeDetails[lastIndexExclusive] = 3601 new OperationStatus(OperationStatusCode.STORE_TOO_BUSY, rtbe.getMessage()); 3602 continue; 3603 } 3604 3605 Mutation mutation = getMutation(lastIndexExclusive); 3606 // If we haven't got any rows in our batch, we should block to get the next one. 3607 RowLock rowLock = null; 3608 boolean throwException = false; 3609 try { 3610 // if atomic then get exclusive lock, else shared lock 3611 rowLock = region.getRowLock(mutation.getRow(), !isAtomic(), prevRowLock); 3612 } catch (TimeoutIOException | InterruptedIOException e) { 3613 // NOTE: We will retry when other exceptions, but we should stop if we receive 3614 // TimeoutIOException or InterruptedIOException as operation has timed out or 3615 // interrupted respectively. 3616 throwException = true; 3617 throw e; 3618 } catch (IOException ioe) { 3619 LOG.warn("Failed getting lock, row={}, in region {}", 3620 Bytes.toStringBinary(mutation.getRow()), this, ioe); 3621 if (isAtomic()) { // fail, atomic means all or none 3622 throwException = true; 3623 throw ioe; 3624 } 3625 } catch (Throwable throwable) { 3626 throwException = true; 3627 throw throwable; 3628 } finally { 3629 if (throwException) { 3630 region.storeHotnessProtector.finish(curFamilyCellMap); 3631 } 3632 } 3633 if (rowLock == null) { 3634 // We failed to grab another lock 3635 if (isAtomic()) { 3636 region.storeHotnessProtector.finish(curFamilyCellMap); 3637 throw new IOException("Can't apply all operations atomically!"); 3638 } 3639 break; // Stop acquiring more rows for this batch 3640 } else { 3641 if (rowLock != prevRowLock) { 3642 // It is a different row now, add this to the acquiredRowLocks and 3643 // set prevRowLock to the new returned rowLock 3644 acquiredRowLocks.add(rowLock); 3645 prevRowLock = rowLock; 3646 } 3647 } 3648 3649 readyToWriteCount++; 3650 } 3651 return createMiniBatch(lastIndexExclusive, readyToWriteCount); 3652 } 3653 3654 protected MiniBatchOperationInProgress<Mutation> createMiniBatch(final int lastIndexExclusive, 3655 final int readyToWriteCount) { 3656 return new MiniBatchOperationInProgress<>(getMutationsForCoprocs(), retCodeDetails, 3657 walEditsFromCoprocessors, nextIndexToProcess, lastIndexExclusive, readyToWriteCount); 3658 } 3659 3660 protected WALEdit createWALEdit(final MiniBatchOperationInProgress<Mutation> miniBatchOp) { 3661 return new WALEdit(miniBatchOp.getCellCount(), isInReplay()); 3662 } 3663 3664 /** 3665 * Builds separate WALEdit per nonce by applying input mutations. If WALEdits from CP are 3666 * present, they are merged to result WALEdit. 3667 */ 3668 public List<Pair<NonceKey, WALEdit>> 3669 buildWALEdits(final MiniBatchOperationInProgress<Mutation> miniBatchOp) throws IOException { 3670 List<Pair<NonceKey, WALEdit>> walEdits = new ArrayList<>(); 3671 3672 visitBatchOperations(true, nextIndexToProcess + miniBatchOp.size(), new Visitor() { 3673 private Pair<NonceKey, WALEdit> curWALEditForNonce; 3674 3675 @Override 3676 public boolean visit(int index) throws IOException { 3677 Mutation m = getMutation(index); 3678 // we use durability of the original mutation for the mutation passed by CP. 3679 if (region.getEffectiveDurability(m.getDurability()) == Durability.SKIP_WAL) { 3680 region.recordMutationWithoutWal(m.getFamilyCellMap()); 3681 /** 3682 * Here is for HBASE-26993,in order to make the new framework for region replication 3683 * could work for SKIP_WAL, we save the {@link Mutation} which 3684 * {@link Mutation#getDurability} is {@link Durability#SKIP_WAL} in miniBatchOp. 3685 */ 3686 cacheSkipWALMutationForRegionReplication(miniBatchOp, walEdits, familyCellMaps[index]); 3687 return true; 3688 } 3689 3690 // the batch may contain multiple nonce keys (replay case). If so, write WALEdit for each. 3691 // Given how nonce keys are originally written, these should be contiguous. 3692 // They don't have to be, it will still work, just write more WALEdits than needed. 3693 long nonceGroup = getNonceGroup(index); 3694 long nonce = getNonce(index); 3695 if ( 3696 curWALEditForNonce == null 3697 || curWALEditForNonce.getFirst().getNonceGroup() != nonceGroup 3698 || curWALEditForNonce.getFirst().getNonce() != nonce 3699 ) { 3700 curWALEditForNonce = 3701 new Pair<>(new NonceKey(nonceGroup, nonce), createWALEdit(miniBatchOp)); 3702 walEdits.add(curWALEditForNonce); 3703 } 3704 WALEdit walEdit = curWALEditForNonce.getSecond(); 3705 3706 // Add WAL edits from CPs. 3707 WALEdit fromCP = walEditsFromCoprocessors[index]; 3708 List<Cell> cellsFromCP = fromCP == null ? Collections.emptyList() : fromCP.getCells(); 3709 addNonSkipWALMutationsToWALEdit(miniBatchOp, walEdit, cellsFromCP, familyCellMaps[index]); 3710 return true; 3711 } 3712 }); 3713 return walEdits; 3714 } 3715 3716 protected void addNonSkipWALMutationsToWALEdit( 3717 final MiniBatchOperationInProgress<Mutation> miniBatchOp, WALEdit walEdit, 3718 List<Cell> cellsFromCP, Map<byte[], List<Cell>> familyCellMap) { 3719 doAddCellsToWALEdit(walEdit, cellsFromCP, familyCellMap); 3720 } 3721 3722 protected static void doAddCellsToWALEdit(WALEdit walEdit, List<Cell> cellsFromCP, 3723 Map<byte[], List<Cell>> familyCellMap) { 3724 walEdit.add(cellsFromCP); 3725 walEdit.add(familyCellMap); 3726 } 3727 3728 protected abstract void cacheSkipWALMutationForRegionReplication( 3729 final MiniBatchOperationInProgress<Mutation> miniBatchOp, 3730 List<Pair<NonceKey, WALEdit>> walEdits, Map<byte[], List<Cell>> familyCellMap); 3731 3732 /** 3733 * This method completes mini-batch operations by calling postBatchMutate() CP hook (if 3734 * required) and completing mvcc. 3735 */ 3736 public void completeMiniBatchOperations( 3737 final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry) 3738 throws IOException { 3739 if (writeEntry != null) { 3740 region.mvcc.completeAndWait(writeEntry); 3741 } 3742 } 3743 3744 public void doPostOpCleanupForMiniBatch( 3745 final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WALEdit walEdit, 3746 boolean success) throws IOException { 3747 doFinishHotnessProtector(miniBatchOp); 3748 } 3749 3750 private void 3751 doFinishHotnessProtector(final MiniBatchOperationInProgress<Mutation> miniBatchOp) { 3752 // check and return if the protector is not enabled 3753 if (!region.storeHotnessProtector.isEnable()) { 3754 return; 3755 } 3756 // miniBatchOp is null, if and only if lockRowsAndBuildMiniBatch throwing exception. 3757 // This case was handled. 3758 if (miniBatchOp == null) { 3759 return; 3760 } 3761 3762 final int finalLastIndexExclusive = miniBatchOp.getLastIndexExclusive(); 3763 3764 for (int i = nextIndexToProcess; i < finalLastIndexExclusive; i++) { 3765 switch (retCodeDetails[i].getOperationStatusCode()) { 3766 case SUCCESS: 3767 case FAILURE: 3768 region.storeHotnessProtector.finish(getMutation(i).getFamilyCellMap()); 3769 break; 3770 default: 3771 // do nothing 3772 // We won't start the protector for NOT_RUN/BAD_FAMILY/SANITY_CHECK_FAILURE and the 3773 // STORE_TOO_BUSY case is handled in StoreHotnessProtector#start 3774 break; 3775 } 3776 } 3777 } 3778 3779 /** 3780 * Atomically apply the given map of family->edits to the memstore. This handles the consistency 3781 * control on its own, but the caller should already have locked updatesLock.readLock(). This 3782 * also does <b>not</b> check the families for validity. 3783 * @param familyMap Map of Cells by family 3784 */ 3785 protected void applyFamilyMapToMemStore(Map<byte[], List<Cell>> familyMap, 3786 MemStoreSizing memstoreAccounting) { 3787 for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) { 3788 byte[] family = e.getKey(); 3789 List<Cell> cells = e.getValue(); 3790 assert cells instanceof RandomAccess; 3791 region.applyToMemStore(region.getStore(family), cells, false, memstoreAccounting); 3792 } 3793 } 3794 } 3795 3796 /** 3797 * Batch of mutation operations. Base class is shared with {@link ReplayBatchOperation} as most of 3798 * the logic is same. 3799 */ 3800 private static class MutationBatchOperation extends BatchOperation<Mutation> { 3801 3802 // For nonce operations 3803 private long nonceGroup; 3804 private long nonce; 3805 protected boolean canProceed; 3806 private boolean regionReplicateEnable; 3807 3808 public MutationBatchOperation(final HRegion region, Mutation[] operations, boolean atomic, 3809 long nonceGroup, long nonce) { 3810 super(region, operations); 3811 this.atomic = atomic; 3812 this.nonceGroup = nonceGroup; 3813 this.nonce = nonce; 3814 this.regionReplicateEnable = region.regionReplicationSink.isPresent(); 3815 } 3816 3817 @Override 3818 public Mutation getMutation(int index) { 3819 return this.operations[index]; 3820 } 3821 3822 @Override 3823 public long getNonceGroup(int index) { 3824 return nonceGroup; 3825 } 3826 3827 @Override 3828 public long getNonce(int index) { 3829 return nonce; 3830 } 3831 3832 @Override 3833 public Mutation[] getMutationsForCoprocs() { 3834 return this.operations; 3835 } 3836 3837 @Override 3838 public boolean isInReplay() { 3839 return false; 3840 } 3841 3842 @Override 3843 public long getOrigLogSeqNum() { 3844 return SequenceId.NO_SEQUENCE_ID; 3845 } 3846 3847 @Override 3848 public void startRegionOperation() throws IOException { 3849 region.startRegionOperation(Operation.BATCH_MUTATE); 3850 } 3851 3852 @Override 3853 public void closeRegionOperation() throws IOException { 3854 region.closeRegionOperation(Operation.BATCH_MUTATE); 3855 } 3856 3857 @Override 3858 public void checkAndPreparePut(Put p) throws IOException { 3859 region.checkFamilies(p.getFamilyCellMap().keySet(), p.getDurability()); 3860 } 3861 3862 @Override 3863 public void checkAndPrepare() throws IOException { 3864 // index 0: puts, index 1: deletes, index 2: increments, index 3: append 3865 final int[] metrics = { 0, 0, 0, 0 }; 3866 3867 visitBatchOperations(true, this.size(), new Visitor() { 3868 private long now = EnvironmentEdgeManager.currentTime(); 3869 private WALEdit walEdit; 3870 3871 @Override 3872 public boolean visit(int index) throws IOException { 3873 // Run coprocessor pre hook outside of locks to avoid deadlock 3874 if (region.coprocessorHost != null) { 3875 if (walEdit == null) { 3876 walEdit = new WALEdit(); 3877 } 3878 callPreMutateCPHook(index, walEdit, metrics); 3879 if (!walEdit.isEmpty()) { 3880 walEditsFromCoprocessors[index] = walEdit; 3881 walEdit = null; 3882 } 3883 } 3884 if (isOperationPending(index)) { 3885 // TODO: Currently validation is done with current time before acquiring locks and 3886 // updates are done with different timestamps after acquiring locks. This behavior is 3887 // inherited from the code prior to this change. Can this be changed? 3888 checkAndPrepareMutation(index, now); 3889 } 3890 return true; 3891 } 3892 }); 3893 3894 // FIXME: we may update metrics twice! here for all operations bypassed by CP and later in 3895 // normal processing. 3896 // Update metrics in same way as it is done when we go the normal processing route (we now 3897 // update general metrics though a Coprocessor did the work). 3898 if (region.metricsRegion != null) { 3899 if (metrics[0] > 0) { 3900 // There were some Puts in the batch. 3901 region.metricsRegion.updatePut(); 3902 } 3903 if (metrics[1] > 0) { 3904 // There were some Deletes in the batch. 3905 region.metricsRegion.updateDelete(); 3906 } 3907 if (metrics[2] > 0) { 3908 // There were some Increment in the batch. 3909 region.metricsRegion.updateIncrement(); 3910 } 3911 if (metrics[3] > 0) { 3912 // There were some Append in the batch. 3913 region.metricsRegion.updateAppend(); 3914 } 3915 } 3916 } 3917 3918 @Override 3919 public void prepareMiniBatchOperations(MiniBatchOperationInProgress<Mutation> miniBatchOp, 3920 long timestamp, final List<RowLock> acquiredRowLocks) throws IOException { 3921 // For nonce operations 3922 canProceed = startNonceOperation(); 3923 3924 visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> { 3925 Mutation mutation = getMutation(index); 3926 if (mutation instanceof Put) { 3927 HRegion.updateCellTimestamps(familyCellMaps[index].values(), Bytes.toBytes(timestamp)); 3928 miniBatchOp.incrementNumOfPuts(); 3929 } else if (mutation instanceof Delete) { 3930 region.prepareDeleteTimestamps(mutation, familyCellMaps[index], Bytes.toBytes(timestamp)); 3931 miniBatchOp.incrementNumOfDeletes(); 3932 } else if (mutation instanceof Increment || mutation instanceof Append) { 3933 boolean returnResults; 3934 if (mutation instanceof Increment) { 3935 returnResults = ((Increment) mutation).isReturnResults(); 3936 } else { 3937 returnResults = ((Append) mutation).isReturnResults(); 3938 } 3939 3940 // For nonce operations 3941 if (!canProceed) { 3942 Result result; 3943 if (returnResults) { 3944 // convert duplicate increment/append to get 3945 List<Cell> results = region.get(toGet(mutation), false, nonceGroup, nonce); 3946 result = Result.create(results); 3947 } else { 3948 result = Result.EMPTY_RESULT; 3949 } 3950 retCodeDetails[index] = new OperationStatus(OperationStatusCode.SUCCESS, result); 3951 return true; 3952 } 3953 3954 Result result = null; 3955 if (region.coprocessorHost != null) { 3956 if (mutation instanceof Increment) { 3957 result = region.coprocessorHost.preIncrementAfterRowLock((Increment) mutation); 3958 } else { 3959 result = region.coprocessorHost.preAppendAfterRowLock((Append) mutation); 3960 } 3961 } 3962 if (result != null) { 3963 retCodeDetails[index] = new OperationStatus(OperationStatusCode.SUCCESS, 3964 returnResults ? result : Result.EMPTY_RESULT); 3965 return true; 3966 } 3967 3968 List<Cell> results = returnResults ? new ArrayList<>(mutation.size()) : null; 3969 familyCellMaps[index] = reckonDeltas(mutation, results, timestamp); 3970 this.results[index] = results != null ? Result.create(results) : Result.EMPTY_RESULT; 3971 3972 if (mutation instanceof Increment) { 3973 miniBatchOp.incrementNumOfIncrements(); 3974 } else { 3975 miniBatchOp.incrementNumOfAppends(); 3976 } 3977 } 3978 region.rewriteCellTags(familyCellMaps[index], mutation); 3979 3980 // update cell count 3981 if (region.getEffectiveDurability(mutation.getDurability()) != Durability.SKIP_WAL) { 3982 for (List<Cell> cells : mutation.getFamilyCellMap().values()) { 3983 miniBatchOp.addCellCount(cells.size()); 3984 } 3985 } 3986 3987 WALEdit fromCP = walEditsFromCoprocessors[index]; 3988 if (fromCP != null) { 3989 miniBatchOp.addCellCount(fromCP.size()); 3990 } 3991 return true; 3992 }); 3993 3994 if (region.coprocessorHost != null) { 3995 // calling the pre CP hook for batch mutation 3996 region.coprocessorHost.preBatchMutate(miniBatchOp); 3997 checkAndMergeCPMutations(miniBatchOp, acquiredRowLocks, timestamp); 3998 } 3999 } 4000 4001 /** 4002 * Starts the nonce operation for a mutation, if needed. 4003 * @return whether to proceed this mutation. 4004 */ 4005 private boolean startNonceOperation() throws IOException { 4006 if ( 4007 region.rsServices == null || region.rsServices.getNonceManager() == null 4008 || nonce == HConstants.NO_NONCE 4009 ) { 4010 return true; 4011 } 4012 boolean canProceed; 4013 try { 4014 canProceed = 4015 region.rsServices.getNonceManager().startOperation(nonceGroup, nonce, region.rsServices); 4016 } catch (InterruptedException ex) { 4017 throw new InterruptedIOException("Nonce start operation interrupted"); 4018 } 4019 return canProceed; 4020 } 4021 4022 /** 4023 * Ends nonce operation for a mutation, if needed. 4024 * @param success Whether the operation for this nonce has succeeded. 4025 */ 4026 private void endNonceOperation(boolean success) { 4027 if ( 4028 region.rsServices != null && region.rsServices.getNonceManager() != null 4029 && nonce != HConstants.NO_NONCE 4030 ) { 4031 region.rsServices.getNonceManager().endOperation(nonceGroup, nonce, success); 4032 } 4033 } 4034 4035 private static Get toGet(final Mutation mutation) throws IOException { 4036 assert mutation instanceof Increment || mutation instanceof Append; 4037 Get get = new Get(mutation.getRow()); 4038 CellScanner cellScanner = mutation.cellScanner(); 4039 while (!cellScanner.advance()) { 4040 Cell cell = cellScanner.current(); 4041 get.addColumn(CellUtil.cloneFamily(cell), CellUtil.cloneQualifier(cell)); 4042 } 4043 if (mutation instanceof Increment) { 4044 // Increment 4045 Increment increment = (Increment) mutation; 4046 get.setTimeRange(increment.getTimeRange().getMin(), increment.getTimeRange().getMax()); 4047 } else { 4048 // Append 4049 Append append = (Append) mutation; 4050 get.setTimeRange(append.getTimeRange().getMin(), append.getTimeRange().getMax()); 4051 } 4052 for (Entry<String, byte[]> entry : mutation.getAttributesMap().entrySet()) { 4053 get.setAttribute(entry.getKey(), entry.getValue()); 4054 } 4055 return get; 4056 } 4057 4058 private Map<byte[], List<Cell>> reckonDeltas(Mutation mutation, List<Cell> results, long now) 4059 throws IOException { 4060 assert mutation instanceof Increment || mutation instanceof Append; 4061 Map<byte[], List<Cell>> ret = new TreeMap<>(Bytes.BYTES_COMPARATOR); 4062 // Process a Store/family at a time. 4063 for (Map.Entry<byte[], List<Cell>> entry : mutation.getFamilyCellMap().entrySet()) { 4064 final byte[] columnFamilyName = entry.getKey(); 4065 List<Cell> deltas = entry.getValue(); 4066 // Reckon for the Store what to apply to WAL and MemStore. 4067 List<Cell> toApply = 4068 reckonDeltasByStore(region.stores.get(columnFamilyName), mutation, now, deltas, results); 4069 if (!toApply.isEmpty()) { 4070 for (Cell cell : toApply) { 4071 HStore store = region.getStore(cell); 4072 if (store == null) { 4073 region.checkFamily(CellUtil.cloneFamily(cell)); 4074 } else { 4075 ret.computeIfAbsent(store.getColumnFamilyDescriptor().getName(), 4076 key -> new ArrayList<>()).add(cell); 4077 } 4078 } 4079 } 4080 } 4081 return ret; 4082 } 4083 4084 /** 4085 * Reckon the Cells to apply to WAL, memstore, and to return to the Client in passed column 4086 * family/Store. Does Get of current value and then adds passed in deltas for this Store 4087 * returning the result. 4088 * @param mutation The encompassing Mutation object 4089 * @param deltas Changes to apply to this Store; either increment amount or data to append 4090 * @param results In here we accumulate all the Cells we are to return to the client. If null, 4091 * client doesn't want results returned. 4092 * @return Resulting Cells after <code>deltas</code> have been applied to current values. Side 4093 * effect is our filling out of the <code>results</code> List. 4094 */ 4095 private List<Cell> reckonDeltasByStore(HStore store, Mutation mutation, long now, 4096 List<Cell> deltas, List<Cell> results) throws IOException { 4097 assert mutation instanceof Increment || mutation instanceof Append; 4098 byte[] columnFamily = store.getColumnFamilyDescriptor().getName(); 4099 List<Pair<Cell, Cell>> cellPairs = new ArrayList<>(deltas.size()); 4100 4101 // Sort the cells so that they match the order that they appear in the Get results. 4102 // Otherwise, we won't be able to find the existing values if the cells are not specified 4103 // in order by the client since cells are in an array list. 4104 deltas.sort(store.getComparator()); 4105 4106 // Get previous values for all columns in this family. 4107 Get get = new Get(mutation.getRow()); 4108 for (Cell cell : deltas) { 4109 get.addColumn(columnFamily, CellUtil.cloneQualifier(cell)); 4110 } 4111 TimeRange tr; 4112 if (mutation instanceof Increment) { 4113 tr = ((Increment) mutation).getTimeRange(); 4114 } else { 4115 tr = ((Append) mutation).getTimeRange(); 4116 } 4117 4118 if (tr != null) { 4119 get.setTimeRange(tr.getMin(), tr.getMax()); 4120 } 4121 4122 try (RegionScanner scanner = region.getScanner(new Scan(get))) { 4123 // NOTE: Please don't use HRegion.get() instead, 4124 // because it will copy cells to heap. See HBASE-26036 4125 List<Cell> currentValues = new ArrayList<>(); 4126 scanner.next(currentValues); 4127 // Iterate the input columns and update existing values if they were found, otherwise 4128 // add new column initialized to the delta amount 4129 int currentValuesIndex = 0; 4130 for (int i = 0; i < deltas.size(); i++) { 4131 Cell delta = deltas.get(i); 4132 Cell currentValue = null; 4133 if ( 4134 currentValuesIndex < currentValues.size() 4135 && CellUtil.matchingQualifier(currentValues.get(currentValuesIndex), delta) 4136 ) { 4137 currentValue = currentValues.get(currentValuesIndex); 4138 if (i < (deltas.size() - 1) && !CellUtil.matchingQualifier(delta, deltas.get(i + 1))) { 4139 currentValuesIndex++; 4140 } 4141 } 4142 // Switch on whether this an increment or an append building the new Cell to apply. 4143 Cell newCell; 4144 if (mutation instanceof Increment) { 4145 long deltaAmount = getLongValue(delta); 4146 final long newValue = 4147 currentValue == null ? deltaAmount : getLongValue(currentValue) + deltaAmount; 4148 newCell = reckonDelta(delta, currentValue, columnFamily, now, mutation, 4149 (oldCell) -> Bytes.toBytes(newValue)); 4150 } else { 4151 newCell = reckonDelta(delta, currentValue, columnFamily, now, mutation, 4152 (oldCell) -> ByteBuffer 4153 .wrap(new byte[delta.getValueLength() + oldCell.getValueLength()]) 4154 .put(oldCell.getValueArray(), oldCell.getValueOffset(), oldCell.getValueLength()) 4155 .put(delta.getValueArray(), delta.getValueOffset(), delta.getValueLength()) 4156 .array()); 4157 } 4158 if (region.maxCellSize > 0) { 4159 int newCellSize = PrivateCellUtil.estimatedSerializedSizeOf(newCell); 4160 if (newCellSize > region.maxCellSize) { 4161 String msg = "Cell with size " + newCellSize + " exceeds limit of " 4162 + region.maxCellSize + " bytes in region " + this; 4163 LOG.debug(msg); 4164 throw new DoNotRetryIOException(msg); 4165 } 4166 } 4167 cellPairs.add(new Pair<>(currentValue, newCell)); 4168 // Add to results to get returned to the Client. If null, cilent does not want results. 4169 if (results != null) { 4170 results.add(newCell); 4171 } 4172 } 4173 // Give coprocessors a chance to update the new cells before apply to WAL or memstore 4174 if (region.coprocessorHost != null) { 4175 // Here the operation must be increment or append. 4176 cellPairs = mutation instanceof Increment 4177 ? region.coprocessorHost.postIncrementBeforeWAL(mutation, cellPairs) 4178 : region.coprocessorHost.postAppendBeforeWAL(mutation, cellPairs); 4179 } 4180 } 4181 return cellPairs.stream().map(Pair::getSecond).collect(Collectors.toList()); 4182 } 4183 4184 private static Cell reckonDelta(final Cell delta, final Cell currentCell, 4185 final byte[] columnFamily, final long now, Mutation mutation, Function<Cell, byte[]> supplier) 4186 throws IOException { 4187 // Forward any tags found on the delta. 4188 List<Tag> tags = TagUtil.carryForwardTags(delta); 4189 if (currentCell != null) { 4190 tags = TagUtil.carryForwardTags(tags, currentCell); 4191 tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL()); 4192 byte[] newValue = supplier.apply(currentCell); 4193 return ExtendedCellBuilderFactory.create(CellBuilderType.SHALLOW_COPY) 4194 .setRow(mutation.getRow(), 0, mutation.getRow().length) 4195 .setFamily(columnFamily, 0, columnFamily.length) 4196 // copy the qualifier if the cell is located in shared memory. 4197 .setQualifier(CellUtil.cloneQualifier(delta)) 4198 .setTimestamp(Math.max(currentCell.getTimestamp() + 1, now)) 4199 .setType(KeyValue.Type.Put.getCode()).setValue(newValue, 0, newValue.length) 4200 .setTags(TagUtil.fromList(tags)).build(); 4201 } else { 4202 tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL()); 4203 PrivateCellUtil.updateLatestStamp(delta, now); 4204 return CollectionUtils.isEmpty(tags) ? delta : PrivateCellUtil.createCell(delta, tags); 4205 } 4206 } 4207 4208 /** Returns Get the long out of the passed in Cell */ 4209 private static long getLongValue(final Cell cell) throws DoNotRetryIOException { 4210 int len = cell.getValueLength(); 4211 if (len != Bytes.SIZEOF_LONG) { 4212 // throw DoNotRetryIOException instead of IllegalArgumentException 4213 throw new DoNotRetryIOException("Field is not a long, it's " + len + " bytes wide"); 4214 } 4215 return PrivateCellUtil.getValueAsLong(cell); 4216 } 4217 4218 @Override 4219 public List<Pair<NonceKey, WALEdit>> 4220 buildWALEdits(final MiniBatchOperationInProgress<Mutation> miniBatchOp) throws IOException { 4221 List<Pair<NonceKey, WALEdit>> walEdits = super.buildWALEdits(miniBatchOp); 4222 // for MutationBatchOperation, more than one nonce is not allowed 4223 if (walEdits.size() > 1) { 4224 throw new IOException("Found multiple nonce keys per batch!"); 4225 } 4226 return walEdits; 4227 } 4228 4229 /** 4230 * Here is for HBASE-26993,in order to make the new framework for region replication could work 4231 * for SKIP_WAL, we save the {@link Mutation} which {@link Mutation#getDurability} is 4232 * {@link Durability#SKIP_WAL} in miniBatchOp. 4233 */ 4234 @Override 4235 protected void cacheSkipWALMutationForRegionReplication( 4236 MiniBatchOperationInProgress<Mutation> miniBatchOp, 4237 List<Pair<NonceKey, WALEdit>> nonceKeyAndWALEdits, Map<byte[], List<Cell>> familyCellMap) { 4238 if (!this.regionReplicateEnable) { 4239 return; 4240 } 4241 4242 WALEdit walEditForReplicateIfExistsSkipWAL = 4243 miniBatchOp.getWalEditForReplicateIfExistsSkipWAL(); 4244 /** 4245 * When there is a SKIP_WAL {@link Mutation},we create a new {@link WALEdit} for replicating 4246 * to region replica,first we fill the existing {@link WALEdit} to it and then add the 4247 * {@link Mutation} which is SKIP_WAL to it. 4248 */ 4249 if (walEditForReplicateIfExistsSkipWAL == null) { 4250 walEditForReplicateIfExistsSkipWAL = 4251 this.createWALEditForReplicateSkipWAL(miniBatchOp, nonceKeyAndWALEdits); 4252 miniBatchOp.setWalEditForReplicateIfExistsSkipWAL(walEditForReplicateIfExistsSkipWAL); 4253 } 4254 walEditForReplicateIfExistsSkipWAL.add(familyCellMap); 4255 4256 } 4257 4258 private WALEdit createWALEditForReplicateSkipWAL( 4259 MiniBatchOperationInProgress<Mutation> miniBatchOp, 4260 List<Pair<NonceKey, WALEdit>> nonceKeyAndWALEdits) { 4261 if (nonceKeyAndWALEdits.isEmpty()) { 4262 return this.createWALEdit(miniBatchOp); 4263 } 4264 // for MutationBatchOperation, more than one nonce is not allowed 4265 assert nonceKeyAndWALEdits.size() == 1; 4266 WALEdit currentWALEdit = nonceKeyAndWALEdits.get(0).getSecond(); 4267 return new WALEdit(currentWALEdit); 4268 } 4269 4270 @Override 4271 protected void addNonSkipWALMutationsToWALEdit( 4272 final MiniBatchOperationInProgress<Mutation> miniBatchOp, WALEdit walEdit, 4273 List<Cell> cellsFromCP, Map<byte[], List<Cell>> familyCellMap) { 4274 4275 super.addNonSkipWALMutationsToWALEdit(miniBatchOp, walEdit, cellsFromCP, familyCellMap); 4276 WALEdit walEditForReplicateIfExistsSkipWAL = 4277 miniBatchOp.getWalEditForReplicateIfExistsSkipWAL(); 4278 if (walEditForReplicateIfExistsSkipWAL == null) { 4279 return; 4280 } 4281 /** 4282 * When walEditForReplicateIfExistsSkipWAL is not null,it means there exists SKIP_WAL 4283 * {@link Mutation} and we create a new {@link WALEdit} in 4284 * {@link MutationBatchOperation#cacheSkipWALMutationForReplicateRegionReplica} for 4285 * replicating to region replica, so here we also add non SKIP_WAL{@link Mutation}s to 4286 * walEditForReplicateIfExistsSkipWAL. 4287 */ 4288 doAddCellsToWALEdit(walEditForReplicateIfExistsSkipWAL, cellsFromCP, familyCellMap); 4289 } 4290 4291 @Override 4292 public WriteEntry writeMiniBatchOperationsToMemStore( 4293 final MiniBatchOperationInProgress<Mutation> miniBatchOp, @Nullable WriteEntry writeEntry, 4294 long now) throws IOException { 4295 boolean newWriteEntry = false; 4296 if (writeEntry == null) { 4297 writeEntry = region.mvcc.begin(); 4298 newWriteEntry = true; 4299 } 4300 super.writeMiniBatchOperationsToMemStore(miniBatchOp, writeEntry.getWriteNumber()); 4301 if (newWriteEntry) { 4302 /** 4303 * Here is for HBASE-26993 case 2,all {@link Mutation}s are {@link Durability#SKIP_WAL}. In 4304 * order to make the new framework for region replication could work for SKIP_WAL,because 4305 * there is no {@link RegionReplicationSink#add} attached in {@link HRegion#doWALAppend},so 4306 * here we get {@link WALEdit} from 4307 * {@link MiniBatchOperationInProgress#getWalEditForReplicateIfExistsSkipWAL} and attach 4308 * {@link RegionReplicationSink#add} to the new mvcc writeEntry. 4309 */ 4310 attachRegionReplicationToMVCCEntry(miniBatchOp, writeEntry, now); 4311 } 4312 return writeEntry; 4313 } 4314 4315 private WALKeyImpl createWALKey(long now) { 4316 // for MutationBatchOperation,isReplay is false. 4317 return this.region.createWALKeyForWALAppend(false, this, now, this.nonceGroup, this.nonce); 4318 } 4319 4320 /** 4321 * Create {@link WALKeyImpl} and get {@link WALEdit} from miniBatchOp and attach 4322 * {@link RegionReplicationSink#add} to the mvccWriteEntry. 4323 */ 4324 private void attachRegionReplicationToMVCCEntry( 4325 final MiniBatchOperationInProgress<Mutation> miniBatchOp, WriteEntry mvccWriteEntry, long now) 4326 throws IOException { 4327 if (!this.regionReplicateEnable) { 4328 return; 4329 } 4330 assert !mvccWriteEntry.getCompletionAction().isPresent(); 4331 4332 final WALKeyImpl walKey = this.createWALKey(now); 4333 walKey.setWriteEntry(mvccWriteEntry); 4334 region.doAttachReplicateRegionReplicaAction(walKey, 4335 miniBatchOp.getWalEditForReplicateIfExistsSkipWAL(), mvccWriteEntry); 4336 } 4337 4338 @Override 4339 public void completeMiniBatchOperations( 4340 final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry) 4341 throws IOException { 4342 // TODO: can it be done after completing mvcc? 4343 // calling the post CP hook for batch mutation 4344 if (region.coprocessorHost != null) { 4345 region.coprocessorHost.postBatchMutate(miniBatchOp); 4346 } 4347 super.completeMiniBatchOperations(miniBatchOp, writeEntry); 4348 4349 if (nonce != HConstants.NO_NONCE) { 4350 if (region.rsServices != null && region.rsServices.getNonceManager() != null) { 4351 region.rsServices.getNonceManager().addMvccToOperationContext(nonceGroup, nonce, 4352 writeEntry.getWriteNumber()); 4353 } 4354 } 4355 } 4356 4357 @Override 4358 public void doPostOpCleanupForMiniBatch(MiniBatchOperationInProgress<Mutation> miniBatchOp, 4359 final WALEdit walEdit, boolean success) throws IOException { 4360 4361 super.doPostOpCleanupForMiniBatch(miniBatchOp, walEdit, success); 4362 if (miniBatchOp != null) { 4363 // synced so that the coprocessor contract is adhered to. 4364 if (region.coprocessorHost != null) { 4365 visitBatchOperations(false, miniBatchOp.getLastIndexExclusive(), (int i) -> { 4366 // only for successful puts/deletes/increments/appends 4367 if (retCodeDetails[i].getOperationStatusCode() == OperationStatusCode.SUCCESS) { 4368 Mutation m = getMutation(i); 4369 if (m instanceof Put) { 4370 region.coprocessorHost.postPut((Put) m, walEdit); 4371 } else if (m instanceof Delete) { 4372 region.coprocessorHost.postDelete((Delete) m, walEdit); 4373 } else if (m instanceof Increment) { 4374 Result result = 4375 region.getCoprocessorHost().postIncrement((Increment) m, results[i], walEdit); 4376 if (result != results[i]) { 4377 retCodeDetails[i] = 4378 new OperationStatus(retCodeDetails[i].getOperationStatusCode(), result); 4379 } 4380 } else if (m instanceof Append) { 4381 Result result = 4382 region.getCoprocessorHost().postAppend((Append) m, results[i], walEdit); 4383 if (result != results[i]) { 4384 retCodeDetails[i] = 4385 new OperationStatus(retCodeDetails[i].getOperationStatusCode(), result); 4386 } 4387 } 4388 } 4389 return true; 4390 }); 4391 } 4392 4393 // For nonce operations 4394 if (canProceed && nonce != HConstants.NO_NONCE) { 4395 boolean[] areAllIncrementsAndAppendsSuccessful = new boolean[] { true }; 4396 visitBatchOperations(false, miniBatchOp.getLastIndexExclusive(), (int i) -> { 4397 Mutation mutation = getMutation(i); 4398 if (mutation instanceof Increment || mutation instanceof Append) { 4399 if (retCodeDetails[i].getOperationStatusCode() != OperationStatusCode.SUCCESS) { 4400 areAllIncrementsAndAppendsSuccessful[0] = false; 4401 return false; 4402 } 4403 } 4404 return true; 4405 }); 4406 endNonceOperation(areAllIncrementsAndAppendsSuccessful[0]); 4407 } 4408 4409 // See if the column families were consistent through the whole thing. 4410 // if they were then keep them. If they were not then pass a null. 4411 // null will be treated as unknown. 4412 // Total time taken might be involving Puts, Deletes, Increments and Appends. 4413 // Split the time for puts and deletes based on the total number of Puts, Deletes, 4414 // Increments and Appends. 4415 if (region.metricsRegion != null) { 4416 if (miniBatchOp.getNumOfPuts() > 0) { 4417 // There were some Puts in the batch. 4418 region.metricsRegion.updatePut(); 4419 } 4420 if (miniBatchOp.getNumOfDeletes() > 0) { 4421 // There were some Deletes in the batch. 4422 region.metricsRegion.updateDelete(); 4423 } 4424 if (miniBatchOp.getNumOfIncrements() > 0) { 4425 // There were some Increments in the batch. 4426 region.metricsRegion.updateIncrement(); 4427 } 4428 if (miniBatchOp.getNumOfAppends() > 0) { 4429 // There were some Appends in the batch. 4430 region.metricsRegion.updateAppend(); 4431 } 4432 } 4433 } 4434 4435 if (region.coprocessorHost != null) { 4436 // call the coprocessor hook to do any finalization steps after the put is done 4437 region.coprocessorHost.postBatchMutateIndispensably( 4438 miniBatchOp != null ? miniBatchOp : createMiniBatch(size(), 0), success); 4439 } 4440 } 4441 4442 /** 4443 * Runs prePut/preDelete/preIncrement/preAppend coprocessor hook for input mutation in a batch 4444 * @param metrics Array of 2 ints. index 0: count of puts, index 1: count of deletes, index 2: 4445 * count of increments and 3: count of appends 4446 */ 4447 private void callPreMutateCPHook(int index, final WALEdit walEdit, final int[] metrics) 4448 throws IOException { 4449 Mutation m = getMutation(index); 4450 if (m instanceof Put) { 4451 if (region.coprocessorHost.prePut((Put) m, walEdit)) { 4452 // pre hook says skip this Put 4453 // mark as success and skip in doMiniBatchMutation 4454 metrics[0]++; 4455 retCodeDetails[index] = OperationStatus.SUCCESS; 4456 } 4457 } else if (m instanceof Delete) { 4458 Delete curDel = (Delete) m; 4459 if (curDel.getFamilyCellMap().isEmpty()) { 4460 // handle deleting a row case 4461 // TODO: prepareDelete() has been called twice, before and after preDelete() CP hook. 4462 // Can this be avoided? 4463 region.prepareDelete(curDel); 4464 } 4465 if (region.coprocessorHost.preDelete(curDel, walEdit)) { 4466 // pre hook says skip this Delete 4467 // mark as success and skip in doMiniBatchMutation 4468 metrics[1]++; 4469 retCodeDetails[index] = OperationStatus.SUCCESS; 4470 } 4471 } else if (m instanceof Increment) { 4472 Increment increment = (Increment) m; 4473 Result result = region.coprocessorHost.preIncrement(increment, walEdit); 4474 if (result != null) { 4475 // pre hook says skip this Increment 4476 // mark as success and skip in doMiniBatchMutation 4477 metrics[2]++; 4478 retCodeDetails[index] = new OperationStatus(OperationStatusCode.SUCCESS, result); 4479 } 4480 } else if (m instanceof Append) { 4481 Append append = (Append) m; 4482 Result result = region.coprocessorHost.preAppend(append, walEdit); 4483 if (result != null) { 4484 // pre hook says skip this Append 4485 // mark as success and skip in doMiniBatchMutation 4486 metrics[3]++; 4487 retCodeDetails[index] = new OperationStatus(OperationStatusCode.SUCCESS, result); 4488 } 4489 } else { 4490 String msg = "Put/Delete/Increment/Append mutations only supported in a batch"; 4491 retCodeDetails[index] = new OperationStatus(OperationStatusCode.FAILURE, msg); 4492 if (isAtomic()) { // fail, atomic means all or none 4493 throw new IOException(msg); 4494 } 4495 } 4496 } 4497 4498 // TODO Support Increment/Append operations 4499 private void checkAndMergeCPMutations(final MiniBatchOperationInProgress<Mutation> miniBatchOp, 4500 final List<RowLock> acquiredRowLocks, final long timestamp) throws IOException { 4501 visitBatchOperations(true, nextIndexToProcess + miniBatchOp.size(), (int i) -> { 4502 // we pass (i - firstIndex) below since the call expects a relative index 4503 Mutation[] cpMutations = miniBatchOp.getOperationsFromCoprocessors(i - nextIndexToProcess); 4504 if (cpMutations == null) { 4505 return true; 4506 } 4507 // Else Coprocessor added more Mutations corresponding to the Mutation at this index. 4508 Mutation mutation = getMutation(i); 4509 for (Mutation cpMutation : cpMutations) { 4510 this.checkAndPrepareMutation(cpMutation, timestamp); 4511 4512 // Acquire row locks. If not, the whole batch will fail. 4513 acquiredRowLocks.add(region.getRowLock(cpMutation.getRow(), true, null)); 4514 4515 // Returned mutations from coprocessor correspond to the Mutation at index i. We can 4516 // directly add the cells from those mutations to the familyMaps of this mutation. 4517 Map<byte[], List<Cell>> cpFamilyMap = cpMutation.getFamilyCellMap(); 4518 region.rewriteCellTags(cpFamilyMap, mutation); 4519 // will get added to the memStore later 4520 mergeFamilyMaps(familyCellMaps[i], cpFamilyMap); 4521 4522 // The durability of returned mutation is replaced by the corresponding mutation. 4523 // If the corresponding mutation contains the SKIP_WAL, we shouldn't count the 4524 // cells of returned mutation. 4525 if (region.getEffectiveDurability(mutation.getDurability()) != Durability.SKIP_WAL) { 4526 for (List<Cell> cells : cpFamilyMap.values()) { 4527 miniBatchOp.addCellCount(cells.size()); 4528 } 4529 } 4530 } 4531 return true; 4532 }); 4533 } 4534 4535 private void mergeFamilyMaps(Map<byte[], List<Cell>> familyMap, 4536 Map<byte[], List<Cell>> toBeMerged) { 4537 for (Map.Entry<byte[], List<Cell>> entry : toBeMerged.entrySet()) { 4538 List<Cell> cells = familyMap.get(entry.getKey()); 4539 if (cells == null) { 4540 familyMap.put(entry.getKey(), entry.getValue()); 4541 } else { 4542 cells.addAll(entry.getValue()); 4543 } 4544 } 4545 } 4546 } 4547 4548 /** 4549 * Batch of mutations for replay. Base class is shared with {@link MutationBatchOperation} as most 4550 * of the logic is same. 4551 * @deprecated Since 3.0.0, will be removed in 4.0.0. Now we will not use this operation to apply 4552 * edits at secondary replica side. 4553 */ 4554 @Deprecated 4555 private static final class ReplayBatchOperation extends BatchOperation<MutationReplay> { 4556 4557 private long origLogSeqNum = 0; 4558 4559 public ReplayBatchOperation(final HRegion region, MutationReplay[] operations, 4560 long origLogSeqNum) { 4561 super(region, operations); 4562 this.origLogSeqNum = origLogSeqNum; 4563 } 4564 4565 @Override 4566 public Mutation getMutation(int index) { 4567 return this.operations[index].mutation; 4568 } 4569 4570 @Override 4571 public long getNonceGroup(int index) { 4572 return this.operations[index].nonceGroup; 4573 } 4574 4575 @Override 4576 public long getNonce(int index) { 4577 return this.operations[index].nonce; 4578 } 4579 4580 @Override 4581 public Mutation[] getMutationsForCoprocs() { 4582 return null; 4583 } 4584 4585 @Override 4586 public boolean isInReplay() { 4587 return true; 4588 } 4589 4590 @Override 4591 public long getOrigLogSeqNum() { 4592 return this.origLogSeqNum; 4593 } 4594 4595 @Override 4596 public void startRegionOperation() throws IOException { 4597 region.startRegionOperation(Operation.REPLAY_BATCH_MUTATE); 4598 } 4599 4600 @Override 4601 public void closeRegionOperation() throws IOException { 4602 region.closeRegionOperation(Operation.REPLAY_BATCH_MUTATE); 4603 } 4604 4605 /** 4606 * During replay, there could exist column families which are removed between region server 4607 * failure and replay 4608 */ 4609 @Override 4610 protected void checkAndPreparePut(Put p) throws IOException { 4611 Map<byte[], List<Cell>> familyCellMap = p.getFamilyCellMap(); 4612 List<byte[]> nonExistentList = null; 4613 for (byte[] family : familyCellMap.keySet()) { 4614 if (!region.htableDescriptor.hasColumnFamily(family)) { 4615 if (nonExistentList == null) { 4616 nonExistentList = new ArrayList<>(); 4617 } 4618 nonExistentList.add(family); 4619 } 4620 } 4621 if (nonExistentList != null) { 4622 for (byte[] family : nonExistentList) { 4623 // Perhaps schema was changed between crash and replay 4624 LOG.info("No family for {} omit from reply in region {}.", Bytes.toString(family), this); 4625 familyCellMap.remove(family); 4626 } 4627 } 4628 } 4629 4630 @Override 4631 public void checkAndPrepare() throws IOException { 4632 long now = EnvironmentEdgeManager.currentTime(); 4633 visitBatchOperations(true, this.size(), (int index) -> { 4634 checkAndPrepareMutation(index, now); 4635 return true; 4636 }); 4637 } 4638 4639 @Override 4640 public void prepareMiniBatchOperations(MiniBatchOperationInProgress<Mutation> miniBatchOp, 4641 long timestamp, final List<RowLock> acquiredRowLocks) throws IOException { 4642 visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> { 4643 // update cell count 4644 for (List<Cell> cells : getMutation(index).getFamilyCellMap().values()) { 4645 miniBatchOp.addCellCount(cells.size()); 4646 } 4647 return true; 4648 }); 4649 } 4650 4651 @Override 4652 public WriteEntry writeMiniBatchOperationsToMemStore( 4653 final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry, 4654 long now) throws IOException { 4655 super.writeMiniBatchOperationsToMemStore(miniBatchOp, getOrigLogSeqNum()); 4656 return writeEntry; 4657 } 4658 4659 @Override 4660 public void completeMiniBatchOperations( 4661 final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry) 4662 throws IOException { 4663 super.completeMiniBatchOperations(miniBatchOp, writeEntry); 4664 region.mvcc.advanceTo(getOrigLogSeqNum()); 4665 } 4666 4667 @Override 4668 protected void cacheSkipWALMutationForRegionReplication( 4669 MiniBatchOperationInProgress<Mutation> miniBatchOp, List<Pair<NonceKey, WALEdit>> walEdits, 4670 Map<byte[], List<Cell>> familyCellMap) { 4671 // There is no action to do if current region is secondary replica 4672 } 4673 4674 } 4675 4676 public OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic, long nonceGroup, 4677 long nonce) throws IOException { 4678 // As it stands, this is used for 3 things 4679 // * batchMutate with single mutation - put/delete/increment/append, separate or from 4680 // checkAndMutate. 4681 // * coprocessor calls (see ex. BulkDeleteEndpoint). 4682 // So nonces are not really ever used by HBase. They could be by coprocs, and checkAnd... 4683 return batchMutate(new MutationBatchOperation(this, mutations, atomic, nonceGroup, nonce)); 4684 } 4685 4686 @Override 4687 public OperationStatus[] batchMutate(Mutation[] mutations) throws IOException { 4688 // If the mutations has any Increment/Append operations, we need to do batchMutate atomically 4689 boolean atomic = 4690 Arrays.stream(mutations).anyMatch(m -> m instanceof Increment || m instanceof Append); 4691 return batchMutate(mutations, atomic); 4692 } 4693 4694 OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic) throws IOException { 4695 return TraceUtil.trace( 4696 () -> batchMutate(mutations, atomic, HConstants.NO_NONCE, HConstants.NO_NONCE), 4697 () -> createRegionSpan("Region.batchMutate")); 4698 } 4699 4700 /** 4701 * @deprecated Since 3.0.0, will be removed in 4.0.0. Now we use 4702 * {@link #replayWALEntry(WALEntry, CellScanner)} for replaying edits at secondary 4703 * replica side. 4704 */ 4705 @Deprecated 4706 OperationStatus[] batchReplay(MutationReplay[] mutations, long replaySeqId) throws IOException { 4707 if ( 4708 !RegionReplicaUtil.isDefaultReplica(getRegionInfo()) 4709 && replaySeqId < lastReplayedOpenRegionSeqId 4710 ) { 4711 // if it is a secondary replica we should ignore these entries silently 4712 // since they are coming out of order 4713 if (LOG.isTraceEnabled()) { 4714 LOG.trace(getRegionInfo().getEncodedName() + " : " + "Skipping " + mutations.length 4715 + " mutations with replaySeqId=" + replaySeqId 4716 + " which is < than lastReplayedOpenRegionSeqId=" + lastReplayedOpenRegionSeqId); 4717 for (MutationReplay mut : mutations) { 4718 LOG.trace(getRegionInfo().getEncodedName() + " : Skipping : " + mut.mutation); 4719 } 4720 } 4721 4722 OperationStatus[] statuses = new OperationStatus[mutations.length]; 4723 for (int i = 0; i < statuses.length; i++) { 4724 statuses[i] = OperationStatus.SUCCESS; 4725 } 4726 return statuses; 4727 } 4728 return batchMutate(new ReplayBatchOperation(this, mutations, replaySeqId)); 4729 } 4730 4731 /** 4732 * Perform a batch of mutations. 4733 * <p/> 4734 * Operations in a batch are stored with highest durability specified of for all operations in a 4735 * batch, except for {@link Durability#SKIP_WAL}. 4736 * <p/> 4737 * This function is called from {@link #batchReplay(WALSplitUtil.MutationReplay[], long)} with 4738 * {@link ReplayBatchOperation} instance and {@link #batchMutate(Mutation[])} with 4739 * {@link MutationBatchOperation} instance as an argument. As the processing of replay batch and 4740 * mutation batch is very similar, lot of code is shared by providing generic methods in base 4741 * class {@link BatchOperation}. The logic for this method and 4742 * {@link #doMiniBatchMutate(BatchOperation)} is implemented using methods in base class which are 4743 * overridden by derived classes to implement special behavior. 4744 * @param batchOp contains the list of mutations 4745 * @return an array of OperationStatus which internally contains the OperationStatusCode and the 4746 * exceptionMessage if any. 4747 * @throws IOException if an IO problem is encountered 4748 */ 4749 private OperationStatus[] batchMutate(BatchOperation<?> batchOp) throws IOException { 4750 boolean initialized = false; 4751 batchOp.startRegionOperation(); 4752 try { 4753 while (!batchOp.isDone()) { 4754 if (!batchOp.isInReplay()) { 4755 checkReadOnly(); 4756 } 4757 checkResources(); 4758 4759 if (!initialized) { 4760 this.writeRequestsCount.add(batchOp.size()); 4761 // validate and prepare batch for write, for MutationBatchOperation it also calls CP 4762 // prePut()/preDelete()/preIncrement()/preAppend() hooks 4763 batchOp.checkAndPrepare(); 4764 initialized = true; 4765 } 4766 doMiniBatchMutate(batchOp); 4767 requestFlushIfNeeded(); 4768 } 4769 } finally { 4770 if (rsServices != null && rsServices.getMetrics() != null) { 4771 rsServices.getMetrics().updateWriteQueryMeter(this, batchOp.size()); 4772 } 4773 batchOp.closeRegionOperation(); 4774 } 4775 return batchOp.retCodeDetails; 4776 } 4777 4778 /** 4779 * Called to do a piece of the batch that came in to {@link #batchMutate(Mutation[])} In here we 4780 * also handle replay of edits on region recover. Also gets change in size brought about by 4781 * applying {@code batchOp}. 4782 */ 4783 private void doMiniBatchMutate(BatchOperation<?> batchOp) throws IOException { 4784 boolean success = false; 4785 WALEdit walEdit = null; 4786 WriteEntry writeEntry = null; 4787 boolean locked = false; 4788 // We try to set up a batch in the range [batchOp.nextIndexToProcess,lastIndexExclusive) 4789 MiniBatchOperationInProgress<Mutation> miniBatchOp = null; 4790 /** Keep track of the locks we hold so we can release them in finally clause */ 4791 List<RowLock> acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.size()); 4792 4793 // Check for thread interrupt status in case we have been signaled from 4794 // #interruptRegionOperation. 4795 checkInterrupt(); 4796 4797 try { 4798 // STEP 1. Try to acquire as many locks as we can and build mini-batch of operations with 4799 // locked rows 4800 miniBatchOp = batchOp.lockRowsAndBuildMiniBatch(acquiredRowLocks); 4801 4802 // We've now grabbed as many mutations off the list as we can 4803 // Ensure we acquire at least one. 4804 if (miniBatchOp.getReadyToWriteCount() <= 0) { 4805 // Nothing to put/delete/increment/append -- an exception in the above such as 4806 // NoSuchColumnFamily? 4807 return; 4808 } 4809 4810 // Check for thread interrupt status in case we have been signaled from 4811 // #interruptRegionOperation. Do it before we take the lock and disable interrupts for 4812 // the WAL append. 4813 checkInterrupt(); 4814 4815 lock(this.updatesLock.readLock(), miniBatchOp.getReadyToWriteCount()); 4816 locked = true; 4817 4818 // From this point until memstore update this operation should not be interrupted. 4819 disableInterrupts(); 4820 4821 // STEP 2. Update mini batch of all operations in progress with LATEST_TIMESTAMP timestamp 4822 // We should record the timestamp only after we have acquired the rowLock, 4823 // otherwise, newer puts/deletes/increment/append are not guaranteed to have a newer 4824 // timestamp 4825 4826 long now = EnvironmentEdgeManager.currentTime(); 4827 batchOp.prepareMiniBatchOperations(miniBatchOp, now, acquiredRowLocks); 4828 4829 // STEP 3. Build WAL edit 4830 4831 List<Pair<NonceKey, WALEdit>> walEdits = batchOp.buildWALEdits(miniBatchOp); 4832 4833 // STEP 4. Append the WALEdits to WAL and sync. 4834 4835 for (Iterator<Pair<NonceKey, WALEdit>> it = walEdits.iterator(); it.hasNext();) { 4836 Pair<NonceKey, WALEdit> nonceKeyWALEditPair = it.next(); 4837 walEdit = nonceKeyWALEditPair.getSecond(); 4838 NonceKey nonceKey = nonceKeyWALEditPair.getFirst(); 4839 4840 if (walEdit != null && !walEdit.isEmpty()) { 4841 writeEntry = doWALAppend(walEdit, batchOp, miniBatchOp, now, nonceKey); 4842 } 4843 4844 // Complete mvcc for all but last writeEntry (for replay case) 4845 if (it.hasNext() && writeEntry != null) { 4846 mvcc.complete(writeEntry); 4847 writeEntry = null; 4848 } 4849 } 4850 4851 // STEP 5. Write back to memStore 4852 // NOTE: writeEntry can be null here 4853 writeEntry = batchOp.writeMiniBatchOperationsToMemStore(miniBatchOp, writeEntry, now); 4854 4855 // STEP 6. Complete MiniBatchOperations: If required calls postBatchMutate() CP hook and 4856 // complete mvcc for last writeEntry 4857 batchOp.completeMiniBatchOperations(miniBatchOp, writeEntry); 4858 writeEntry = null; 4859 success = true; 4860 } finally { 4861 // Call complete rather than completeAndWait because we probably had error if walKey != null 4862 if (writeEntry != null) mvcc.complete(writeEntry); 4863 4864 if (locked) { 4865 this.updatesLock.readLock().unlock(); 4866 } 4867 releaseRowLocks(acquiredRowLocks); 4868 4869 enableInterrupts(); 4870 4871 final int finalLastIndexExclusive = 4872 miniBatchOp != null ? miniBatchOp.getLastIndexExclusive() : batchOp.size(); 4873 final boolean finalSuccess = success; 4874 batchOp.visitBatchOperations(true, finalLastIndexExclusive, (int i) -> { 4875 Mutation mutation = batchOp.getMutation(i); 4876 if (mutation instanceof Increment || mutation instanceof Append) { 4877 if (finalSuccess) { 4878 batchOp.retCodeDetails[i] = 4879 new OperationStatus(OperationStatusCode.SUCCESS, batchOp.results[i]); 4880 } else { 4881 batchOp.retCodeDetails[i] = OperationStatus.FAILURE; 4882 } 4883 } else { 4884 batchOp.retCodeDetails[i] = 4885 finalSuccess ? OperationStatus.SUCCESS : OperationStatus.FAILURE; 4886 } 4887 return true; 4888 }); 4889 4890 batchOp.doPostOpCleanupForMiniBatch(miniBatchOp, walEdit, finalSuccess); 4891 4892 batchOp.nextIndexToProcess = finalLastIndexExclusive; 4893 } 4894 } 4895 4896 /** 4897 * Returns effective durability from the passed durability and the table descriptor. 4898 */ 4899 private Durability getEffectiveDurability(Durability d) { 4900 return d == Durability.USE_DEFAULT ? this.regionDurability : d; 4901 } 4902 4903 @Override 4904 @Deprecated 4905 public boolean checkAndMutate(byte[] row, byte[] family, byte[] qualifier, CompareOperator op, 4906 ByteArrayComparable comparator, TimeRange timeRange, Mutation mutation) throws IOException { 4907 CheckAndMutate checkAndMutate; 4908 try { 4909 CheckAndMutate.Builder builder = CheckAndMutate.newBuilder(row) 4910 .ifMatches(family, qualifier, op, comparator.getValue()).timeRange(timeRange); 4911 if (mutation instanceof Put) { 4912 checkAndMutate = builder.build((Put) mutation); 4913 } else if (mutation instanceof Delete) { 4914 checkAndMutate = builder.build((Delete) mutation); 4915 } else { 4916 throw new DoNotRetryIOException( 4917 "Unsupported mutate type: " + mutation.getClass().getSimpleName().toUpperCase()); 4918 } 4919 } catch (IllegalArgumentException e) { 4920 throw new DoNotRetryIOException(e.getMessage()); 4921 } 4922 return checkAndMutate(checkAndMutate).isSuccess(); 4923 } 4924 4925 @Override 4926 @Deprecated 4927 public boolean checkAndMutate(byte[] row, Filter filter, TimeRange timeRange, Mutation mutation) 4928 throws IOException { 4929 CheckAndMutate checkAndMutate; 4930 try { 4931 CheckAndMutate.Builder builder = 4932 CheckAndMutate.newBuilder(row).ifMatches(filter).timeRange(timeRange); 4933 if (mutation instanceof Put) { 4934 checkAndMutate = builder.build((Put) mutation); 4935 } else if (mutation instanceof Delete) { 4936 checkAndMutate = builder.build((Delete) mutation); 4937 } else { 4938 throw new DoNotRetryIOException( 4939 "Unsupported mutate type: " + mutation.getClass().getSimpleName().toUpperCase()); 4940 } 4941 } catch (IllegalArgumentException e) { 4942 throw new DoNotRetryIOException(e.getMessage()); 4943 } 4944 return checkAndMutate(checkAndMutate).isSuccess(); 4945 } 4946 4947 @Override 4948 @Deprecated 4949 public boolean checkAndRowMutate(byte[] row, byte[] family, byte[] qualifier, CompareOperator op, 4950 ByteArrayComparable comparator, TimeRange timeRange, RowMutations rm) throws IOException { 4951 CheckAndMutate checkAndMutate; 4952 try { 4953 checkAndMutate = CheckAndMutate.newBuilder(row) 4954 .ifMatches(family, qualifier, op, comparator.getValue()).timeRange(timeRange).build(rm); 4955 } catch (IllegalArgumentException e) { 4956 throw new DoNotRetryIOException(e.getMessage()); 4957 } 4958 return checkAndMutate(checkAndMutate).isSuccess(); 4959 } 4960 4961 @Override 4962 @Deprecated 4963 public boolean checkAndRowMutate(byte[] row, Filter filter, TimeRange timeRange, RowMutations rm) 4964 throws IOException { 4965 CheckAndMutate checkAndMutate; 4966 try { 4967 checkAndMutate = 4968 CheckAndMutate.newBuilder(row).ifMatches(filter).timeRange(timeRange).build(rm); 4969 } catch (IllegalArgumentException e) { 4970 throw new DoNotRetryIOException(e.getMessage()); 4971 } 4972 return checkAndMutate(checkAndMutate).isSuccess(); 4973 } 4974 4975 @Override 4976 public CheckAndMutateResult checkAndMutate(CheckAndMutate checkAndMutate) throws IOException { 4977 return checkAndMutate(checkAndMutate, HConstants.NO_NONCE, HConstants.NO_NONCE); 4978 } 4979 4980 public CheckAndMutateResult checkAndMutate(CheckAndMutate checkAndMutate, long nonceGroup, 4981 long nonce) throws IOException { 4982 return TraceUtil.trace(() -> checkAndMutateInternal(checkAndMutate, nonceGroup, nonce), 4983 () -> createRegionSpan("Region.checkAndMutate")); 4984 } 4985 4986 private CheckAndMutateResult checkAndMutateInternal(CheckAndMutate checkAndMutate, 4987 long nonceGroup, long nonce) throws IOException { 4988 byte[] row = checkAndMutate.getRow(); 4989 Filter filter = null; 4990 byte[] family = null; 4991 byte[] qualifier = null; 4992 CompareOperator op = null; 4993 ByteArrayComparable comparator = null; 4994 if (checkAndMutate.hasFilter()) { 4995 filter = checkAndMutate.getFilter(); 4996 } else { 4997 family = checkAndMutate.getFamily(); 4998 qualifier = checkAndMutate.getQualifier(); 4999 op = checkAndMutate.getCompareOp(); 5000 comparator = new BinaryComparator(checkAndMutate.getValue()); 5001 } 5002 TimeRange timeRange = checkAndMutate.getTimeRange(); 5003 5004 Mutation mutation = null; 5005 RowMutations rowMutations = null; 5006 if (checkAndMutate.getAction() instanceof Mutation) { 5007 mutation = (Mutation) checkAndMutate.getAction(); 5008 } else { 5009 rowMutations = (RowMutations) checkAndMutate.getAction(); 5010 } 5011 5012 if (mutation != null) { 5013 checkMutationType(mutation); 5014 checkRow(mutation, row); 5015 } else { 5016 checkRow(rowMutations, row); 5017 } 5018 checkReadOnly(); 5019 // TODO, add check for value length also move this check to the client 5020 checkResources(); 5021 startRegionOperation(); 5022 try { 5023 Get get = new Get(row); 5024 if (family != null) { 5025 checkFamily(family); 5026 get.addColumn(family, qualifier); 5027 } 5028 if (filter != null) { 5029 get.setFilter(filter); 5030 } 5031 if (timeRange != null) { 5032 get.setTimeRange(timeRange.getMin(), timeRange.getMax()); 5033 } 5034 // Lock row - note that doBatchMutate will relock this row if called 5035 checkRow(row, "doCheckAndRowMutate"); 5036 RowLock rowLock = getRowLock(get.getRow(), false, null); 5037 try { 5038 if (this.getCoprocessorHost() != null) { 5039 CheckAndMutateResult result = 5040 getCoprocessorHost().preCheckAndMutateAfterRowLock(checkAndMutate); 5041 if (result != null) { 5042 return result; 5043 } 5044 } 5045 5046 // NOTE: We used to wait here until mvcc caught up: mvcc.await(); 5047 // Supposition is that now all changes are done under row locks, then when we go to read, 5048 // we'll get the latest on this row. 5049 boolean matches = false; 5050 long cellTs = 0; 5051 try (RegionScanner scanner = getScanner(new Scan(get))) { 5052 // NOTE: Please don't use HRegion.get() instead, 5053 // because it will copy cells to heap. See HBASE-26036 5054 List<Cell> result = new ArrayList<>(1); 5055 scanner.next(result); 5056 if (filter != null) { 5057 if (!result.isEmpty()) { 5058 matches = true; 5059 cellTs = result.get(0).getTimestamp(); 5060 } 5061 } else { 5062 boolean valueIsNull = 5063 comparator.getValue() == null || comparator.getValue().length == 0; 5064 if (result.isEmpty() && valueIsNull) { 5065 matches = op != CompareOperator.NOT_EQUAL; 5066 } else if (result.size() > 0 && valueIsNull) { 5067 matches = (result.get(0).getValueLength() == 0) == (op != CompareOperator.NOT_EQUAL); 5068 cellTs = result.get(0).getTimestamp(); 5069 } else if (result.size() == 1) { 5070 Cell kv = result.get(0); 5071 cellTs = kv.getTimestamp(); 5072 int compareResult = PrivateCellUtil.compareValue(kv, comparator); 5073 matches = matches(op, compareResult); 5074 } 5075 } 5076 } 5077 5078 // If matches, perform the mutation or the rowMutations 5079 if (matches) { 5080 // We have acquired the row lock already. If the system clock is NOT monotonically 5081 // non-decreasing (see HBASE-14070) we should make sure that the mutation has a 5082 // larger timestamp than what was observed via Get. doBatchMutate already does this, but 5083 // there is no way to pass the cellTs. See HBASE-14054. 5084 long now = EnvironmentEdgeManager.currentTime(); 5085 long ts = Math.max(now, cellTs); // ensure write is not eclipsed 5086 byte[] byteTs = Bytes.toBytes(ts); 5087 if (mutation != null) { 5088 if (mutation instanceof Put) { 5089 updateCellTimestamps(mutation.getFamilyCellMap().values(), byteTs); 5090 } 5091 // And else 'delete' is not needed since it already does a second get, and sets the 5092 // timestamp from get (see prepareDeleteTimestamps). 5093 } else { 5094 for (Mutation m : rowMutations.getMutations()) { 5095 if (m instanceof Put) { 5096 updateCellTimestamps(m.getFamilyCellMap().values(), byteTs); 5097 } 5098 } 5099 // And else 'delete' is not needed since it already does a second get, and sets the 5100 // timestamp from get (see prepareDeleteTimestamps). 5101 } 5102 // All edits for the given row (across all column families) must happen atomically. 5103 Result r; 5104 if (mutation != null) { 5105 r = mutate(mutation, true, nonceGroup, nonce).getResult(); 5106 } else { 5107 r = mutateRow(rowMutations, nonceGroup, nonce); 5108 } 5109 this.checkAndMutateChecksPassed.increment(); 5110 return new CheckAndMutateResult(true, r); 5111 } 5112 this.checkAndMutateChecksFailed.increment(); 5113 return new CheckAndMutateResult(false, null); 5114 } finally { 5115 rowLock.release(); 5116 } 5117 } finally { 5118 closeRegionOperation(); 5119 } 5120 } 5121 5122 private void checkMutationType(final Mutation mutation) throws DoNotRetryIOException { 5123 if ( 5124 !(mutation instanceof Put) && !(mutation instanceof Delete) 5125 && !(mutation instanceof Increment) && !(mutation instanceof Append) 5126 ) { 5127 throw new org.apache.hadoop.hbase.DoNotRetryIOException( 5128 "Action must be Put or Delete or Increment or Delete"); 5129 } 5130 } 5131 5132 private void checkRow(final Row action, final byte[] row) throws DoNotRetryIOException { 5133 if (!Bytes.equals(row, action.getRow())) { 5134 throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action's getRow must match"); 5135 } 5136 } 5137 5138 private boolean matches(final CompareOperator op, final int compareResult) { 5139 boolean matches = false; 5140 switch (op) { 5141 case LESS: 5142 matches = compareResult < 0; 5143 break; 5144 case LESS_OR_EQUAL: 5145 matches = compareResult <= 0; 5146 break; 5147 case EQUAL: 5148 matches = compareResult == 0; 5149 break; 5150 case NOT_EQUAL: 5151 matches = compareResult != 0; 5152 break; 5153 case GREATER_OR_EQUAL: 5154 matches = compareResult >= 0; 5155 break; 5156 case GREATER: 5157 matches = compareResult > 0; 5158 break; 5159 default: 5160 throw new RuntimeException("Unknown Compare op " + op.name()); 5161 } 5162 return matches; 5163 } 5164 5165 private OperationStatus mutate(Mutation mutation) throws IOException { 5166 return mutate(mutation, false); 5167 } 5168 5169 private OperationStatus mutate(Mutation mutation, boolean atomic) throws IOException { 5170 return mutate(mutation, atomic, HConstants.NO_NONCE, HConstants.NO_NONCE); 5171 } 5172 5173 private OperationStatus mutate(Mutation mutation, boolean atomic, long nonceGroup, long nonce) 5174 throws IOException { 5175 OperationStatus[] status = 5176 this.batchMutate(new Mutation[] { mutation }, atomic, nonceGroup, nonce); 5177 if (status[0].getOperationStatusCode().equals(OperationStatusCode.SANITY_CHECK_FAILURE)) { 5178 throw new FailedSanityCheckException(status[0].getExceptionMsg()); 5179 } else if (status[0].getOperationStatusCode().equals(OperationStatusCode.BAD_FAMILY)) { 5180 throw new NoSuchColumnFamilyException(status[0].getExceptionMsg()); 5181 } else if (status[0].getOperationStatusCode().equals(OperationStatusCode.STORE_TOO_BUSY)) { 5182 throw new RegionTooBusyException(status[0].getExceptionMsg()); 5183 } 5184 return status[0]; 5185 } 5186 5187 /** 5188 * Complete taking the snapshot on the region. Writes the region info and adds references to the 5189 * working snapshot directory. TODO for api consistency, consider adding another version with no 5190 * {@link ForeignExceptionSnare} arg. (In the future other cancellable HRegion methods could 5191 * eventually add a {@link ForeignExceptionSnare}, or we could do something fancier). 5192 * @param desc snapshot description object 5193 * @param exnSnare ForeignExceptionSnare that captures external exceptions in case we need to bail 5194 * out. This is allowed to be null and will just be ignored in that case. 5195 * @throws IOException if there is an external or internal error causing the snapshot to fail 5196 */ 5197 public void addRegionToSnapshot(SnapshotDescription desc, ForeignExceptionSnare exnSnare) 5198 throws IOException { 5199 Path rootDir = CommonFSUtils.getRootDir(conf); 5200 Path snapshotDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(desc, rootDir, conf); 5201 5202 SnapshotManifest manifest = 5203 SnapshotManifest.create(conf, getFilesystem(), snapshotDir, desc, exnSnare); 5204 manifest.addRegion(this); 5205 } 5206 5207 private void updateSequenceId(final Iterable<List<Cell>> cellItr, final long sequenceId) 5208 throws IOException { 5209 for (List<Cell> cells : cellItr) { 5210 if (cells == null) return; 5211 for (Cell cell : cells) { 5212 PrivateCellUtil.setSequenceId(cell, sequenceId); 5213 } 5214 } 5215 } 5216 5217 /** 5218 * Replace any cell timestamps set to {@link org.apache.hadoop.hbase.HConstants#LATEST_TIMESTAMP} 5219 * provided current timestamp. 5220 */ 5221 private static void updateCellTimestamps(final Iterable<List<Cell>> cellItr, final byte[] now) 5222 throws IOException { 5223 for (List<Cell> cells : cellItr) { 5224 if (cells == null) continue; 5225 // Optimization: 'foreach' loop is not used. See: 5226 // HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects 5227 assert cells instanceof RandomAccess; 5228 int listSize = cells.size(); 5229 for (int i = 0; i < listSize; i++) { 5230 PrivateCellUtil.updateLatestStamp(cells.get(i), now); 5231 } 5232 } 5233 } 5234 5235 /** 5236 * Possibly rewrite incoming cell tags. 5237 */ 5238 private void rewriteCellTags(Map<byte[], List<Cell>> familyMap, final Mutation m) { 5239 // Check if we have any work to do and early out otherwise 5240 // Update these checks as more logic is added here 5241 if (m.getTTL() == Long.MAX_VALUE) { 5242 return; 5243 } 5244 5245 // From this point we know we have some work to do 5246 for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) { 5247 List<Cell> cells = e.getValue(); 5248 assert cells instanceof RandomAccess; 5249 int listSize = cells.size(); 5250 for (int i = 0; i < listSize; i++) { 5251 Cell cell = cells.get(i); 5252 List<Tag> newTags = TagUtil.carryForwardTags(null, cell); 5253 newTags = TagUtil.carryForwardTTLTag(newTags, m.getTTL()); 5254 // Rewrite the cell with the updated set of tags 5255 cells.set(i, PrivateCellUtil.createCell(cell, newTags)); 5256 } 5257 } 5258 } 5259 5260 /** 5261 * Check if resources to support an update. 5262 * <p/> 5263 * We throw RegionTooBusyException if above memstore limit and expect client to retry using some 5264 * kind of backoff 5265 */ 5266 private void checkResources() throws RegionTooBusyException { 5267 // If catalog region, do not impose resource constraints or block updates. 5268 if (this.getRegionInfo().isMetaRegion()) { 5269 return; 5270 } 5271 5272 MemStoreSize mss = this.memStoreSizing.getMemStoreSize(); 5273 if (mss.getHeapSize() + mss.getOffHeapSize() > this.blockingMemStoreSize) { 5274 blockedRequestsCount.increment(); 5275 requestFlush(); 5276 // Don't print current limit because it will vary too much. The message is used as a key 5277 // over in RetriesExhaustedWithDetailsException processing. 5278 final String regionName = 5279 this.getRegionInfo() == null ? "unknown" : this.getRegionInfo().getEncodedName(); 5280 final String serverName = this.getRegionServerServices() == null 5281 ? "unknown" 5282 : (this.getRegionServerServices().getServerName() == null 5283 ? "unknown" 5284 : this.getRegionServerServices().getServerName().toString()); 5285 RegionTooBusyException rtbe = new RegionTooBusyException("Over memstore limit=" 5286 + org.apache.hadoop.hbase.procedure2.util.StringUtils.humanSize(this.blockingMemStoreSize) 5287 + ", regionName=" + regionName + ", server=" + serverName); 5288 LOG.warn("Region is too busy due to exceeding memstore size limit.", rtbe); 5289 throw rtbe; 5290 } 5291 } 5292 5293 /** 5294 * @throws IOException Throws exception if region is in read-only mode. 5295 */ 5296 private void checkReadOnly() throws IOException { 5297 if (isReadOnly()) { 5298 throw new DoNotRetryIOException("region is read only"); 5299 } 5300 } 5301 5302 private void checkReadsEnabled() throws IOException { 5303 if (!this.writestate.readsEnabled) { 5304 throw new IOException(getRegionInfo().getEncodedName() 5305 + ": The region's reads are disabled. Cannot serve the request"); 5306 } 5307 } 5308 5309 public void setReadsEnabled(boolean readsEnabled) { 5310 if (readsEnabled && !this.writestate.readsEnabled) { 5311 LOG.info("Enabling reads for {}", getRegionInfo().getEncodedName()); 5312 } 5313 this.writestate.setReadsEnabled(readsEnabled); 5314 } 5315 5316 /** 5317 * @param delta If we are doing delta changes -- e.g. increment/append -- then this flag will be 5318 * set; when set we will run operations that make sense in the increment/append 5319 * scenario but that do not make sense otherwise. 5320 */ 5321 private void applyToMemStore(HStore store, List<Cell> cells, boolean delta, 5322 MemStoreSizing memstoreAccounting) { 5323 // Any change in how we update Store/MemStore needs to also be done in other applyToMemStore!!!! 5324 boolean upsert = delta && store.getColumnFamilyDescriptor().getMaxVersions() == 1; 5325 if (upsert) { 5326 store.upsert(cells, getSmallestReadPoint(), memstoreAccounting); 5327 } else { 5328 store.add(cells, memstoreAccounting); 5329 } 5330 } 5331 5332 private void checkFamilies(Collection<byte[]> families, Durability durability) 5333 throws NoSuchColumnFamilyException, InvalidMutationDurabilityException { 5334 for (byte[] family : families) { 5335 checkFamily(family, durability); 5336 } 5337 } 5338 5339 private void checkFamily(final byte[] family, Durability durability) 5340 throws NoSuchColumnFamilyException, InvalidMutationDurabilityException { 5341 checkFamily(family); 5342 if ( 5343 durability.equals(Durability.SKIP_WAL) 5344 && htableDescriptor.getColumnFamily(family).getScope() != HConstants.REPLICATION_SCOPE_LOCAL 5345 ) { 5346 throw new InvalidMutationDurabilityException( 5347 "Mutation's durability is SKIP_WAL but table's column family " + Bytes.toString(family) 5348 + " need replication"); 5349 } 5350 } 5351 5352 private void checkFamily(final byte[] family) throws NoSuchColumnFamilyException { 5353 if (!this.htableDescriptor.hasColumnFamily(family)) { 5354 throw new NoSuchColumnFamilyException("Column family " + Bytes.toString(family) 5355 + " does not exist in region " + this + " in table " + this.htableDescriptor); 5356 } 5357 } 5358 5359 /** 5360 * Check the collection of families for valid timestamps 5361 * @param now current timestamp 5362 */ 5363 public void checkTimestamps(final Map<byte[], List<Cell>> familyMap, long now) 5364 throws FailedSanityCheckException { 5365 if (timestampSlop == HConstants.LATEST_TIMESTAMP) { 5366 return; 5367 } 5368 long maxTs = now + timestampSlop; 5369 for (List<Cell> kvs : familyMap.values()) { 5370 // Optimization: 'foreach' loop is not used. See: 5371 // HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects 5372 assert kvs instanceof RandomAccess; 5373 int listSize = kvs.size(); 5374 for (int i = 0; i < listSize; i++) { 5375 Cell cell = kvs.get(i); 5376 // see if the user-side TS is out of range. latest = server-side 5377 long ts = cell.getTimestamp(); 5378 if (ts != HConstants.LATEST_TIMESTAMP && ts > maxTs) { 5379 throw new FailedSanityCheckException( 5380 "Timestamp for KV out of range " + cell + " (too.new=" + timestampSlop + ")"); 5381 } 5382 } 5383 } 5384 } 5385 5386 /* 5387 * @return True if size is over the flush threshold 5388 */ 5389 private boolean isFlushSize(MemStoreSize size) { 5390 return size.getHeapSize() + size.getOffHeapSize() > getMemStoreFlushSize(); 5391 } 5392 5393 private void deleteRecoveredEdits(FileSystem fs, Iterable<Path> files) throws IOException { 5394 for (Path file : files) { 5395 if (!fs.delete(file, false)) { 5396 LOG.error("Failed delete of {}", file); 5397 } else { 5398 LOG.debug("Deleted recovered.edits file={}", file); 5399 } 5400 } 5401 } 5402 5403 /** 5404 * Read the edits put under this region by wal splitting process. Put the recovered edits back up 5405 * into this region. 5406 * <p> 5407 * We can ignore any wal message that has a sequence ID that's equal to or lower than minSeqId. 5408 * (Because we know such messages are already reflected in the HFiles.) 5409 * <p> 5410 * While this is running we are putting pressure on memory yet we are outside of our usual 5411 * accounting because we are not yet an onlined region (this stuff is being run as part of Region 5412 * initialization). This means that if we're up against global memory limits, we'll not be flagged 5413 * to flush because we are not online. We can't be flushed by usual mechanisms anyways; we're not 5414 * yet online so our relative sequenceids are not yet aligned with WAL sequenceids -- not till we 5415 * come up online, post processing of split edits. 5416 * <p> 5417 * But to help relieve memory pressure, at least manage our own heap size flushing if are in 5418 * excess of per-region limits. Flushing, though, we have to be careful and avoid using the 5419 * regionserver/wal sequenceid. Its running on a different line to whats going on in here in this 5420 * region context so if we crashed replaying these edits, but in the midst had a flush that used 5421 * the regionserver wal with a sequenceid in excess of whats going on in here in this region and 5422 * with its split editlogs, then we could miss edits the next time we go to recover. So, we have 5423 * to flush inline, using seqids that make sense in a this single region context only -- until we 5424 * online. 5425 * @param maxSeqIdInStores Any edit found in split editlogs needs to be in excess of the maxSeqId 5426 * for the store to be applied, else its skipped. 5427 * @return the sequence id of the last edit added to this region out of the recovered edits log or 5428 * <code>minSeqId</code> if nothing added from editlogs. 5429 */ 5430 long replayRecoveredEditsIfAny(Map<byte[], Long> maxSeqIdInStores, 5431 final CancelableProgressable reporter, final MonitoredTask status) throws IOException { 5432 long minSeqIdForTheRegion = -1; 5433 for (Long maxSeqIdInStore : maxSeqIdInStores.values()) { 5434 if (maxSeqIdInStore < minSeqIdForTheRegion || minSeqIdForTheRegion == -1) { 5435 minSeqIdForTheRegion = maxSeqIdInStore; 5436 } 5437 } 5438 long seqId = minSeqIdForTheRegion; 5439 String specialRecoveredEditsDirStr = conf.get(SPECIAL_RECOVERED_EDITS_DIR); 5440 if (org.apache.commons.lang3.StringUtils.isBlank(specialRecoveredEditsDirStr)) { 5441 FileSystem walFS = getWalFileSystem(); 5442 FileSystem rootFS = getFilesystem(); 5443 Path wrongRegionWALDir = CommonFSUtils.getWrongWALRegionDir(conf, getRegionInfo().getTable(), 5444 getRegionInfo().getEncodedName()); 5445 Path regionWALDir = getWALRegionDir(); 5446 Path regionDir = 5447 FSUtils.getRegionDirFromRootDir(CommonFSUtils.getRootDir(conf), getRegionInfo()); 5448 5449 // We made a mistake in HBASE-20734 so we need to do this dirty hack... 5450 NavigableSet<Path> filesUnderWrongRegionWALDir = 5451 WALSplitUtil.getSplitEditFilesSorted(walFS, wrongRegionWALDir); 5452 seqId = Math.max(seqId, replayRecoveredEditsForPaths(minSeqIdForTheRegion, walFS, 5453 filesUnderWrongRegionWALDir, reporter, regionDir)); 5454 // This is to ensure backwards compatability with HBASE-20723 where recovered edits can appear 5455 // under the root dir even if walDir is set. 5456 NavigableSet<Path> filesUnderRootDir = Collections.emptyNavigableSet(); 5457 if (!regionWALDir.equals(regionDir)) { 5458 filesUnderRootDir = WALSplitUtil.getSplitEditFilesSorted(rootFS, regionDir); 5459 seqId = Math.max(seqId, replayRecoveredEditsForPaths(minSeqIdForTheRegion, rootFS, 5460 filesUnderRootDir, reporter, regionDir)); 5461 } 5462 5463 NavigableSet<Path> files = WALSplitUtil.getSplitEditFilesSorted(walFS, regionWALDir); 5464 seqId = Math.max(seqId, 5465 replayRecoveredEditsForPaths(minSeqIdForTheRegion, walFS, files, reporter, regionWALDir)); 5466 if (seqId > minSeqIdForTheRegion) { 5467 // Then we added some edits to memory. Flush and cleanup split edit files. 5468 internalFlushcache(null, seqId, stores.values(), status, false, 5469 FlushLifeCycleTracker.DUMMY); 5470 } 5471 // Now delete the content of recovered edits. We're done w/ them. 5472 if (files.size() > 0 && this.conf.getBoolean("hbase.region.archive.recovered.edits", false)) { 5473 // For debugging data loss issues! 5474 // If this flag is set, make use of the hfile archiving by making recovered.edits a fake 5475 // column family. Have to fake out file type too by casting our recovered.edits as 5476 // storefiles 5477 String fakeFamilyName = WALSplitUtil.getRegionDirRecoveredEditsDir(regionWALDir).getName(); 5478 Set<HStoreFile> fakeStoreFiles = new HashSet<>(files.size()); 5479 for (Path file : files) { 5480 fakeStoreFiles.add(new HStoreFile(walFS, file, this.conf, null, null, true)); 5481 } 5482 getRegionWALFileSystem().archiveRecoveredEdits(fakeFamilyName, fakeStoreFiles); 5483 } else { 5484 deleteRecoveredEdits(walFS, Iterables.concat(files, filesUnderWrongRegionWALDir)); 5485 deleteRecoveredEdits(rootFS, filesUnderRootDir); 5486 } 5487 } else { 5488 Path recoveredEditsDir = new Path(specialRecoveredEditsDirStr); 5489 FileSystem fs = recoveredEditsDir.getFileSystem(conf); 5490 FileStatus[] files = fs.listStatus(recoveredEditsDir); 5491 LOG.debug("Found {} recovered edits file(s) under {}", files == null ? 0 : files.length, 5492 recoveredEditsDir); 5493 if (files != null) { 5494 for (FileStatus file : files) { 5495 // it is safe to trust the zero-length in this case because we've been through rename and 5496 // lease recovery in the above. 5497 if (isZeroLengthThenDelete(fs, file, file.getPath())) { 5498 continue; 5499 } 5500 seqId = 5501 Math.max(seqId, replayRecoveredEdits(file.getPath(), maxSeqIdInStores, reporter, fs)); 5502 } 5503 } 5504 if (seqId > minSeqIdForTheRegion) { 5505 // Then we added some edits to memory. Flush and cleanup split edit files. 5506 internalFlushcache(null, seqId, stores.values(), status, false, 5507 FlushLifeCycleTracker.DUMMY); 5508 } 5509 deleteRecoveredEdits(fs, 5510 Stream.of(files).map(FileStatus::getPath).collect(Collectors.toList())); 5511 } 5512 5513 return seqId; 5514 } 5515 5516 private long replayRecoveredEditsForPaths(long minSeqIdForTheRegion, FileSystem fs, 5517 final NavigableSet<Path> files, final CancelableProgressable reporter, final Path regionDir) 5518 throws IOException { 5519 long seqid = minSeqIdForTheRegion; 5520 if (LOG.isDebugEnabled()) { 5521 LOG.debug("Found " + (files == null ? 0 : files.size()) + " recovered edits file(s) under " 5522 + regionDir); 5523 } 5524 5525 if (files == null || files.isEmpty()) { 5526 return minSeqIdForTheRegion; 5527 } 5528 5529 for (Path edits : files) { 5530 if (edits == null || !fs.exists(edits)) { 5531 LOG.warn("Null or non-existent edits file: " + edits); 5532 continue; 5533 } 5534 if (isZeroLengthThenDelete(fs, fs.getFileStatus(edits), edits)) { 5535 continue; 5536 } 5537 5538 long maxSeqId; 5539 String fileName = edits.getName(); 5540 maxSeqId = Math.abs(Long.parseLong(fileName)); 5541 if (maxSeqId <= minSeqIdForTheRegion) { 5542 if (LOG.isDebugEnabled()) { 5543 String msg = "Maximum sequenceid for this wal is " + maxSeqId 5544 + " and minimum sequenceid for the region " + this + " is " + minSeqIdForTheRegion 5545 + ", skipped the whole file, path=" + edits; 5546 LOG.debug(msg); 5547 } 5548 continue; 5549 } 5550 5551 try { 5552 // replay the edits. Replay can return -1 if everything is skipped, only update 5553 // if seqId is greater 5554 seqid = Math.max(seqid, replayRecoveredEdits(edits, maxSeqIdInStores, reporter, fs)); 5555 } catch (IOException e) { 5556 handleException(fs, edits, e); 5557 } 5558 } 5559 return seqid; 5560 } 5561 5562 private void handleException(FileSystem fs, Path edits, IOException e) throws IOException { 5563 boolean skipErrors = conf.getBoolean(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS, 5564 conf.getBoolean("hbase.skip.errors", HConstants.DEFAULT_HREGION_EDITS_REPLAY_SKIP_ERRORS)); 5565 if (conf.get("hbase.skip.errors") != null) { 5566 LOG.warn("The property 'hbase.skip.errors' has been deprecated. Please use " 5567 + HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + " instead."); 5568 } 5569 if (skipErrors) { 5570 Path p = WALSplitUtil.moveAsideBadEditsFile(fs, edits); 5571 LOG.error(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + "=true so continuing. Renamed " 5572 + edits + " as " + p, e); 5573 } else { 5574 throw e; 5575 } 5576 } 5577 5578 /** 5579 * @param edits File of recovered edits. 5580 * @param maxSeqIdInStores Maximum sequenceid found in each store. Edits in wal must be larger 5581 * than this to be replayed for each store. 5582 * @return the sequence id of the last edit added to this region out of the recovered edits log or 5583 * <code>minSeqId</code> if nothing added from editlogs. 5584 */ 5585 private long replayRecoveredEdits(final Path edits, Map<byte[], Long> maxSeqIdInStores, 5586 final CancelableProgressable reporter, FileSystem fs) throws IOException { 5587 String msg = "Replaying edits from " + edits; 5588 LOG.info(msg); 5589 MonitoredTask status = TaskMonitor.get().createStatus(msg); 5590 5591 status.setStatus("Opening recovered edits"); 5592 try (WALStreamReader reader = WALFactory.createStreamReader(fs, edits, conf)) { 5593 long currentEditSeqId = -1; 5594 long currentReplaySeqId = -1; 5595 long firstSeqIdInLog = -1; 5596 long skippedEdits = 0; 5597 long editsCount = 0; 5598 long intervalEdits = 0; 5599 WAL.Entry entry; 5600 HStore store = null; 5601 boolean reported_once = false; 5602 ServerNonceManager ng = this.rsServices == null ? null : this.rsServices.getNonceManager(); 5603 5604 try { 5605 // How many edits seen before we check elapsed time 5606 int interval = this.conf.getInt("hbase.hstore.report.interval.edits", 2000); 5607 // How often to send a progress report (default 1/2 master timeout) 5608 int period = this.conf.getInt("hbase.hstore.report.period", 300000); 5609 long lastReport = EnvironmentEdgeManager.currentTime(); 5610 5611 if (coprocessorHost != null) { 5612 coprocessorHost.preReplayWALs(this.getRegionInfo(), edits); 5613 } 5614 5615 while ((entry = reader.next()) != null) { 5616 WALKey key = entry.getKey(); 5617 WALEdit val = entry.getEdit(); 5618 5619 if (ng != null) { // some test, or nonces disabled 5620 ng.reportOperationFromWal(key.getNonceGroup(), key.getNonce(), key.getWriteTime()); 5621 } 5622 5623 if (reporter != null) { 5624 intervalEdits += val.size(); 5625 if (intervalEdits >= interval) { 5626 // Number of edits interval reached 5627 intervalEdits = 0; 5628 long cur = EnvironmentEdgeManager.currentTime(); 5629 if (lastReport + period <= cur) { 5630 status.setStatus( 5631 "Replaying edits..." + " skipped=" + skippedEdits + " edits=" + editsCount); 5632 // Timeout reached 5633 if (!reporter.progress()) { 5634 msg = "Progressable reporter failed, stopping replay for region " + this; 5635 LOG.warn(msg); 5636 status.abort(msg); 5637 throw new IOException(msg); 5638 } 5639 reported_once = true; 5640 lastReport = cur; 5641 } 5642 } 5643 } 5644 5645 if (firstSeqIdInLog == -1) { 5646 firstSeqIdInLog = key.getSequenceId(); 5647 } 5648 if (currentEditSeqId > key.getSequenceId()) { 5649 // when this condition is true, it means we have a serious defect because we need to 5650 // maintain increasing SeqId for WAL edits per region 5651 LOG.error(getRegionInfo().getEncodedName() + " : " + "Found decreasing SeqId. PreId=" 5652 + currentEditSeqId + " key=" + key + "; edit=" + val); 5653 } else { 5654 currentEditSeqId = key.getSequenceId(); 5655 } 5656 currentReplaySeqId = 5657 (key.getOrigLogSeqNum() > 0) ? key.getOrigLogSeqNum() : currentEditSeqId; 5658 5659 boolean checkRowWithinBoundary = false; 5660 // Check this edit is for this region. 5661 if ( 5662 !Bytes.equals(key.getEncodedRegionName(), this.getRegionInfo().getEncodedNameAsBytes()) 5663 ) { 5664 checkRowWithinBoundary = true; 5665 } 5666 5667 boolean flush = false; 5668 MemStoreSizing memStoreSizing = new NonThreadSafeMemStoreSizing(); 5669 for (Cell cell : val.getCells()) { 5670 // Check this edit is for me. Also, guard against writing the special 5671 // METACOLUMN info such as HBASE::CACHEFLUSH entries 5672 if (WALEdit.isMetaEditFamily(cell)) { 5673 // if region names don't match, skipp replaying compaction marker 5674 if (!checkRowWithinBoundary) { 5675 // this is a special edit, we should handle it 5676 CompactionDescriptor compaction = WALEdit.getCompaction(cell); 5677 if (compaction != null) { 5678 // replay the compaction 5679 replayWALCompactionMarker(compaction, false, true, Long.MAX_VALUE); 5680 } 5681 } 5682 skippedEdits++; 5683 continue; 5684 } 5685 // Figure which store the edit is meant for. 5686 if ( 5687 store == null 5688 || !CellUtil.matchingFamily(cell, store.getColumnFamilyDescriptor().getName()) 5689 ) { 5690 store = getStore(cell); 5691 } 5692 if (store == null) { 5693 // This should never happen. Perhaps schema was changed between 5694 // crash and redeploy? 5695 LOG.warn("No family for cell {} in region {}", cell, this); 5696 skippedEdits++; 5697 continue; 5698 } 5699 if ( 5700 checkRowWithinBoundary && !rowIsInRange(this.getRegionInfo(), cell.getRowArray(), 5701 cell.getRowOffset(), cell.getRowLength()) 5702 ) { 5703 LOG.warn("Row of {} is not within region boundary for region {}", cell, this); 5704 skippedEdits++; 5705 continue; 5706 } 5707 // Now, figure if we should skip this edit. 5708 if ( 5709 key.getSequenceId() 5710 <= maxSeqIdInStores.get(store.getColumnFamilyDescriptor().getName()) 5711 ) { 5712 skippedEdits++; 5713 continue; 5714 } 5715 PrivateCellUtil.setSequenceId(cell, currentReplaySeqId); 5716 5717 restoreEdit(store, cell, memStoreSizing); 5718 editsCount++; 5719 } 5720 MemStoreSize mss = memStoreSizing.getMemStoreSize(); 5721 incMemStoreSize(mss); 5722 flush = isFlushSize(this.memStoreSizing.getMemStoreSize()); 5723 if (flush) { 5724 internalFlushcache(null, currentEditSeqId, stores.values(), status, false, 5725 FlushLifeCycleTracker.DUMMY); 5726 } 5727 } 5728 5729 if (coprocessorHost != null) { 5730 coprocessorHost.postReplayWALs(this.getRegionInfo(), edits); 5731 } 5732 } catch (EOFException eof) { 5733 if (!conf.getBoolean(RECOVERED_EDITS_IGNORE_EOF, false)) { 5734 Path p = WALSplitUtil.moveAsideBadEditsFile(walFS, edits); 5735 msg = "EnLongAddered EOF. Most likely due to Master failure during " 5736 + "wal splitting, so we have this data in another edit. Continuing, but renaming " 5737 + edits + " as " + p + " for region " + this; 5738 LOG.warn(msg, eof); 5739 status.abort(msg); 5740 } else { 5741 LOG.warn("EOF while replaying recover edits and config '{}' is true so " 5742 + "we will ignore it and continue", RECOVERED_EDITS_IGNORE_EOF, eof); 5743 } 5744 } catch (IOException ioe) { 5745 // If the IOE resulted from bad file format, 5746 // then this problem is idempotent and retrying won't help 5747 if (ioe.getCause() instanceof ParseException) { 5748 Path p = WALSplitUtil.moveAsideBadEditsFile(walFS, edits); 5749 msg = 5750 "File corruption enLongAddered! " + "Continuing, but renaming " + edits + " as " + p; 5751 LOG.warn(msg, ioe); 5752 status.setStatus(msg); 5753 } else { 5754 status.abort(StringUtils.stringifyException(ioe)); 5755 // other IO errors may be transient (bad network connection, 5756 // checksum exception on one datanode, etc). throw & retry 5757 throw ioe; 5758 } 5759 } 5760 if (reporter != null && !reported_once) { 5761 reporter.progress(); 5762 } 5763 msg = "Applied " + editsCount + ", skipped " + skippedEdits + ", firstSequenceIdInLog=" 5764 + firstSeqIdInLog + ", maxSequenceIdInLog=" + currentEditSeqId + ", path=" + edits; 5765 status.markComplete(msg); 5766 LOG.debug(msg); 5767 return currentEditSeqId; 5768 } finally { 5769 status.cleanup(); 5770 } 5771 } 5772 5773 /** 5774 * Call to complete a compaction. Its for the case where we find in the WAL a compaction that was 5775 * not finished. We could find one recovering a WAL after a regionserver crash. See HBASE-2331. 5776 */ 5777 void replayWALCompactionMarker(CompactionDescriptor compaction, boolean pickCompactionFiles, 5778 boolean removeFiles, long replaySeqId) throws IOException { 5779 try { 5780 checkTargetRegion(compaction.getEncodedRegionName().toByteArray(), 5781 "Compaction marker from WAL ", compaction); 5782 } catch (WrongRegionException wre) { 5783 if (RegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { 5784 // skip the compaction marker since it is not for this region 5785 return; 5786 } 5787 throw wre; 5788 } 5789 5790 synchronized (writestate) { 5791 if (replaySeqId < lastReplayedOpenRegionSeqId) { 5792 LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying compaction event :" 5793 + TextFormat.shortDebugString(compaction) + " because its sequence id " + replaySeqId 5794 + " is smaller than this regions " + "lastReplayedOpenRegionSeqId of " 5795 + lastReplayedOpenRegionSeqId); 5796 return; 5797 } 5798 if (replaySeqId < lastReplayedCompactionSeqId) { 5799 LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying compaction event :" 5800 + TextFormat.shortDebugString(compaction) + " because its sequence id " + replaySeqId 5801 + " is smaller than this regions " + "lastReplayedCompactionSeqId of " 5802 + lastReplayedCompactionSeqId); 5803 return; 5804 } else { 5805 lastReplayedCompactionSeqId = replaySeqId; 5806 } 5807 5808 if (LOG.isDebugEnabled()) { 5809 LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying compaction marker " 5810 + TextFormat.shortDebugString(compaction) + " with seqId=" + replaySeqId 5811 + " and lastReplayedOpenRegionSeqId=" + lastReplayedOpenRegionSeqId); 5812 } 5813 5814 startRegionOperation(Operation.REPLAY_EVENT); 5815 try { 5816 HStore store = this.getStore(compaction.getFamilyName().toByteArray()); 5817 if (store == null) { 5818 LOG.warn(getRegionInfo().getEncodedName() + " : " 5819 + "Found Compaction WAL edit for deleted family:" 5820 + Bytes.toString(compaction.getFamilyName().toByteArray())); 5821 return; 5822 } 5823 store.replayCompactionMarker(compaction, pickCompactionFiles, removeFiles); 5824 logRegionFiles(); 5825 } catch (FileNotFoundException ex) { 5826 LOG.warn(getRegionInfo().getEncodedName() + " : " 5827 + "At least one of the store files in compaction: " 5828 + TextFormat.shortDebugString(compaction) 5829 + " doesn't exist any more. Skip loading the file(s)", ex); 5830 } finally { 5831 closeRegionOperation(Operation.REPLAY_EVENT); 5832 } 5833 } 5834 } 5835 5836 /** 5837 * @deprecated Since 3.0.0, will be removed in 4.0.0. Only for keep compatibility for old region 5838 * replica implementation. 5839 */ 5840 @Deprecated 5841 void replayWALFlushMarker(FlushDescriptor flush, long replaySeqId) throws IOException { 5842 checkTargetRegion(flush.getEncodedRegionName().toByteArray(), "Flush marker from WAL ", flush); 5843 5844 if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { 5845 return; // if primary nothing to do 5846 } 5847 5848 if (LOG.isDebugEnabled()) { 5849 LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying flush marker " 5850 + TextFormat.shortDebugString(flush)); 5851 } 5852 5853 startRegionOperation(Operation.REPLAY_EVENT); // use region close lock to guard against close 5854 try { 5855 FlushAction action = flush.getAction(); 5856 switch (action) { 5857 case START_FLUSH: 5858 replayWALFlushStartMarker(flush); 5859 break; 5860 case COMMIT_FLUSH: 5861 replayWALFlushCommitMarker(flush); 5862 break; 5863 case ABORT_FLUSH: 5864 replayWALFlushAbortMarker(flush); 5865 break; 5866 case CANNOT_FLUSH: 5867 replayWALFlushCannotFlushMarker(flush, replaySeqId); 5868 break; 5869 default: 5870 LOG.warn(getRegionInfo().getEncodedName() + " : " 5871 + "Received a flush event with unknown action, ignoring. " 5872 + TextFormat.shortDebugString(flush)); 5873 break; 5874 } 5875 5876 logRegionFiles(); 5877 } finally { 5878 closeRegionOperation(Operation.REPLAY_EVENT); 5879 } 5880 } 5881 5882 private Collection<HStore> getStoresToFlush(FlushDescriptor flushDesc) { 5883 List<HStore> storesToFlush = new ArrayList<>(); 5884 for (StoreFlushDescriptor storeFlush : flushDesc.getStoreFlushesList()) { 5885 byte[] family = storeFlush.getFamilyName().toByteArray(); 5886 HStore store = getStore(family); 5887 if (store == null) { 5888 LOG.warn(getRegionInfo().getEncodedName() + " : " 5889 + "Received a flush start marker from primary, but the family is not found. Ignoring" 5890 + " StoreFlushDescriptor:" + TextFormat.shortDebugString(storeFlush)); 5891 continue; 5892 } 5893 storesToFlush.add(store); 5894 } 5895 return storesToFlush; 5896 } 5897 5898 /** 5899 * Replay the flush marker from primary region by creating a corresponding snapshot of the store 5900 * memstores, only if the memstores do not have a higher seqId from an earlier wal edit (because 5901 * the events may be coming out of order). 5902 * @deprecated Since 3.0.0, will be removed in 4.0.0. Only for keep compatibility for old region 5903 * replica implementation. 5904 */ 5905 @Deprecated 5906 PrepareFlushResult replayWALFlushStartMarker(FlushDescriptor flush) throws IOException { 5907 long flushSeqId = flush.getFlushSequenceNumber(); 5908 5909 Collection<HStore> storesToFlush = getStoresToFlush(flush); 5910 5911 MonitoredTask status = TaskMonitor.get().createStatus("Preparing flush " + this); 5912 5913 // we will use writestate as a coarse-grain lock for all the replay events 5914 // (flush, compaction, region open etc) 5915 synchronized (writestate) { 5916 try { 5917 if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) { 5918 LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying flush event :" 5919 + TextFormat.shortDebugString(flush) 5920 + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId " 5921 + " of " + lastReplayedOpenRegionSeqId); 5922 return null; 5923 } 5924 if (numMutationsWithoutWAL.sum() > 0) { 5925 numMutationsWithoutWAL.reset(); 5926 dataInMemoryWithoutWAL.reset(); 5927 } 5928 5929 if (!writestate.flushing) { 5930 // we do not have an active snapshot and corresponding this.prepareResult. This means 5931 // we can just snapshot our memstores and continue as normal. 5932 5933 // invoke prepareFlushCache. Send null as wal since we do not want the flush events in wal 5934 PrepareFlushResult prepareResult = internalPrepareFlushCache(null, flushSeqId, 5935 storesToFlush, status, false, FlushLifeCycleTracker.DUMMY); 5936 if (prepareResult.result == null) { 5937 // save the PrepareFlushResult so that we can use it later from commit flush 5938 this.writestate.flushing = true; 5939 this.prepareFlushResult = prepareResult; 5940 status.markComplete("Flush prepare successful"); 5941 if (LOG.isDebugEnabled()) { 5942 LOG.debug(getRegionInfo().getEncodedName() + " : " + " Prepared flush with seqId:" 5943 + flush.getFlushSequenceNumber()); 5944 } 5945 } else { 5946 // special case empty memstore. We will still save the flush result in this case, since 5947 // our memstore ie empty, but the primary is still flushing 5948 if ( 5949 prepareResult.getResult().getResult() 5950 == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY 5951 ) { 5952 this.writestate.flushing = true; 5953 this.prepareFlushResult = prepareResult; 5954 if (LOG.isDebugEnabled()) { 5955 LOG.debug(getRegionInfo().getEncodedName() + " : " 5956 + " Prepared empty flush with seqId:" + flush.getFlushSequenceNumber()); 5957 } 5958 } 5959 status.abort("Flush prepare failed with " + prepareResult.result); 5960 // nothing much to do. prepare flush failed because of some reason. 5961 } 5962 return prepareResult; 5963 } else { 5964 // we already have an active snapshot. 5965 if (flush.getFlushSequenceNumber() == this.prepareFlushResult.flushOpSeqId) { 5966 // They define the same flush. Log and continue. 5967 LOG.warn(getRegionInfo().getEncodedName() + " : " 5968 + "Received a flush prepare marker with the same seqId: " 5969 + +flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: " 5970 + prepareFlushResult.flushOpSeqId + ". Ignoring"); 5971 // ignore 5972 } else if (flush.getFlushSequenceNumber() < this.prepareFlushResult.flushOpSeqId) { 5973 // We received a flush with a smaller seqNum than what we have prepared. We can only 5974 // ignore this prepare flush request. 5975 LOG.warn(getRegionInfo().getEncodedName() + " : " 5976 + "Received a flush prepare marker with a smaller seqId: " 5977 + +flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: " 5978 + prepareFlushResult.flushOpSeqId + ". Ignoring"); 5979 // ignore 5980 } else { 5981 // We received a flush with a larger seqNum than what we have prepared 5982 LOG.warn(getRegionInfo().getEncodedName() + " : " 5983 + "Received a flush prepare marker with a larger seqId: " 5984 + +flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: " 5985 + prepareFlushResult.flushOpSeqId + ". Ignoring"); 5986 // We do not have multiple active snapshots in the memstore or a way to merge current 5987 // memstore snapshot with the contents and resnapshot for now. We cannot take 5988 // another snapshot and drop the previous one because that will cause temporary 5989 // data loss in the secondary. So we ignore this for now, deferring the resolution 5990 // to happen when we see the corresponding flush commit marker. If we have a memstore 5991 // snapshot with x, and later received another prepare snapshot with y (where x < y), 5992 // when we see flush commit for y, we will drop snapshot for x, and can also drop all 5993 // the memstore edits if everything in memstore is < y. This is the usual case for 5994 // RS crash + recovery where we might see consequtive prepare flush wal markers. 5995 // Otherwise, this will cause more memory to be used in secondary replica until a 5996 // further prapare + commit flush is seen and replayed. 5997 } 5998 } 5999 } finally { 6000 status.cleanup(); 6001 writestate.notifyAll(); 6002 } 6003 } 6004 return null; 6005 } 6006 6007 /** 6008 * @deprecated Since 3.0.0, will be removed in 4.0.0. Only for keep compatibility for old region 6009 * replica implementation. 6010 */ 6011 @Deprecated 6012 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY", 6013 justification = "Intentional; post memstore flush") 6014 void replayWALFlushCommitMarker(FlushDescriptor flush) throws IOException { 6015 MonitoredTask status = TaskMonitor.get().createStatus("Committing flush " + this); 6016 6017 // check whether we have the memstore snapshot with the corresponding seqId. Replay to 6018 // secondary region replicas are in order, except for when the region moves or then the 6019 // region server crashes. In those cases, we may receive replay requests out of order from 6020 // the original seqIds. 6021 synchronized (writestate) { 6022 try { 6023 if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) { 6024 LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying flush event :" 6025 + TextFormat.shortDebugString(flush) 6026 + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId " 6027 + " of " + lastReplayedOpenRegionSeqId); 6028 return; 6029 } 6030 6031 if (writestate.flushing) { 6032 PrepareFlushResult prepareFlushResult = this.prepareFlushResult; 6033 if (flush.getFlushSequenceNumber() == prepareFlushResult.flushOpSeqId) { 6034 if (LOG.isDebugEnabled()) { 6035 LOG.debug(getRegionInfo().getEncodedName() + " : " 6036 + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber() 6037 + " and a previous prepared snapshot was found"); 6038 } 6039 // This is the regular case where we received commit flush after prepare flush 6040 // corresponding to the same seqId. 6041 replayFlushInStores(flush, prepareFlushResult, true); 6042 6043 // Set down the memstore size by amount of flush. 6044 this.decrMemStoreSize(prepareFlushResult.totalFlushableSize.getMemStoreSize()); 6045 this.prepareFlushResult = null; 6046 writestate.flushing = false; 6047 } else if (flush.getFlushSequenceNumber() < prepareFlushResult.flushOpSeqId) { 6048 // This should not happen normally. However, lets be safe and guard against these cases 6049 // we received a flush commit with a smaller seqId than what we have prepared 6050 // we will pick the flush file up from this commit (if we have not seen it), but we 6051 // will not drop the memstore 6052 LOG.warn(getRegionInfo().getEncodedName() + " : " 6053 + "Received a flush commit marker with smaller seqId: " 6054 + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: " 6055 + prepareFlushResult.flushOpSeqId + ". Picking up new file, but not dropping" 6056 + " prepared memstore snapshot"); 6057 replayFlushInStores(flush, prepareFlushResult, false); 6058 6059 // snapshot is not dropped, so memstore sizes should not be decremented 6060 // we still have the prepared snapshot, flushing should still be true 6061 } else { 6062 // This should not happen normally. However, lets be safe and guard against these cases 6063 // we received a flush commit with a larger seqId than what we have prepared 6064 // we will pick the flush file for this. We will also obtain the updates lock and 6065 // look for contents of the memstore to see whether we have edits after this seqId. 6066 // If not, we will drop all the memstore edits and the snapshot as well. 6067 LOG.warn(getRegionInfo().getEncodedName() + " : " 6068 + "Received a flush commit marker with larger seqId: " 6069 + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: " 6070 + prepareFlushResult.flushOpSeqId + ". Picking up new file and dropping prepared" 6071 + " memstore snapshot"); 6072 6073 replayFlushInStores(flush, prepareFlushResult, true); 6074 6075 // Set down the memstore size by amount of flush. 6076 this.decrMemStoreSize(prepareFlushResult.totalFlushableSize.getMemStoreSize()); 6077 6078 // Inspect the memstore contents to see whether the memstore contains only edits 6079 // with seqId smaller than the flush seqId. If so, we can discard those edits. 6080 dropMemStoreContentsForSeqId(flush.getFlushSequenceNumber(), null); 6081 6082 this.prepareFlushResult = null; 6083 writestate.flushing = false; 6084 } 6085 // If we were waiting for observing a flush or region opening event for not showing 6086 // partial data after a secondary region crash, we can allow reads now. We can only make 6087 // sure that we are not showing partial data (for example skipping some previous edits) 6088 // until we observe a full flush start and flush commit. So if we were not able to find 6089 // a previous flush we will not enable reads now. 6090 this.setReadsEnabled(true); 6091 } else { 6092 LOG.warn( 6093 getRegionInfo().getEncodedName() + " : " + "Received a flush commit marker with seqId:" 6094 + flush.getFlushSequenceNumber() + ", but no previous prepared snapshot was found"); 6095 // There is no corresponding prepare snapshot from before. 6096 // We will pick up the new flushed file 6097 replayFlushInStores(flush, null, false); 6098 6099 // Inspect the memstore contents to see whether the memstore contains only edits 6100 // with seqId smaller than the flush seqId. If so, we can discard those edits. 6101 dropMemStoreContentsForSeqId(flush.getFlushSequenceNumber(), null); 6102 } 6103 6104 status.markComplete("Flush commit successful"); 6105 6106 // Update the last flushed sequence id for region. 6107 this.maxFlushedSeqId = flush.getFlushSequenceNumber(); 6108 6109 // advance the mvcc read point so that the new flushed file is visible. 6110 mvcc.advanceTo(flush.getFlushSequenceNumber()); 6111 6112 } catch (FileNotFoundException ex) { 6113 LOG.warn(getRegionInfo().getEncodedName() + " : " 6114 + "At least one of the store files in flush: " + TextFormat.shortDebugString(flush) 6115 + " doesn't exist any more. Skip loading the file(s)", ex); 6116 } finally { 6117 status.cleanup(); 6118 writestate.notifyAll(); 6119 } 6120 } 6121 6122 // C. Finally notify anyone waiting on memstore to clear: 6123 // e.g. checkResources(). 6124 synchronized (this) { 6125 notifyAll(); // FindBugs NN_NAKED_NOTIFY 6126 } 6127 } 6128 6129 /** 6130 * Replays the given flush descriptor by opening the flush files in stores and dropping the 6131 * memstore snapshots if requested. 6132 * @deprecated Since 3.0.0, will be removed in 4.0.0. Only for keep compatibility for old region 6133 * replica implementation. 6134 */ 6135 @Deprecated 6136 private void replayFlushInStores(FlushDescriptor flush, PrepareFlushResult prepareFlushResult, 6137 boolean dropMemstoreSnapshot) throws IOException { 6138 for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) { 6139 byte[] family = storeFlush.getFamilyName().toByteArray(); 6140 HStore store = getStore(family); 6141 if (store == null) { 6142 LOG.warn(getRegionInfo().getEncodedName() + " : " 6143 + "Received a flush commit marker from primary, but the family is not found." 6144 + "Ignoring StoreFlushDescriptor:" + storeFlush); 6145 continue; 6146 } 6147 List<String> flushFiles = storeFlush.getFlushOutputList(); 6148 StoreFlushContext ctx = null; 6149 long startTime = EnvironmentEdgeManager.currentTime(); 6150 if (prepareFlushResult == null || prepareFlushResult.storeFlushCtxs == null) { 6151 ctx = store.createFlushContext(flush.getFlushSequenceNumber(), FlushLifeCycleTracker.DUMMY); 6152 } else { 6153 ctx = prepareFlushResult.storeFlushCtxs.get(family); 6154 startTime = prepareFlushResult.startTime; 6155 } 6156 6157 if (ctx == null) { 6158 LOG.warn(getRegionInfo().getEncodedName() + " : " 6159 + "Unexpected: flush commit marker received from store " + Bytes.toString(family) 6160 + " but no associated flush context. Ignoring"); 6161 continue; 6162 } 6163 6164 ctx.replayFlush(flushFiles, dropMemstoreSnapshot); // replay the flush 6165 6166 // Record latest flush time 6167 this.lastStoreFlushTimeMap.put(store, startTime); 6168 } 6169 } 6170 6171 private long loadRecoveredHFilesIfAny(Collection<HStore> stores) throws IOException { 6172 Path regionDir = fs.getRegionDir(); 6173 long maxSeqId = -1; 6174 for (HStore store : stores) { 6175 String familyName = store.getColumnFamilyName(); 6176 FileStatus[] files = 6177 WALSplitUtil.getRecoveredHFiles(fs.getFileSystem(), regionDir, familyName); 6178 if (files != null && files.length != 0) { 6179 for (FileStatus file : files) { 6180 Path filePath = file.getPath(); 6181 // If file length is zero then delete it 6182 if (isZeroLengthThenDelete(fs.getFileSystem(), file, filePath)) { 6183 continue; 6184 } 6185 try { 6186 HStoreFile storefile = store.tryCommitRecoveredHFile(file.getPath()); 6187 maxSeqId = Math.max(maxSeqId, storefile.getReader().getSequenceID()); 6188 } catch (IOException e) { 6189 handleException(fs.getFileSystem(), filePath, e); 6190 continue; 6191 } 6192 } 6193 if (this.rsServices != null && store.needsCompaction()) { 6194 this.rsServices.getCompactionRequestor().requestCompaction(this, store, 6195 "load recovered hfiles request compaction", Store.PRIORITY_USER + 1, 6196 CompactionLifeCycleTracker.DUMMY, null); 6197 } 6198 } 6199 } 6200 return maxSeqId; 6201 } 6202 6203 /** 6204 * Be careful, this method will drop all data in the memstore of this region. Currently, this 6205 * method is used to drop memstore to prevent memory leak when replaying recovered.edits while 6206 * opening region. 6207 */ 6208 private MemStoreSize dropMemStoreContents() throws IOException { 6209 MemStoreSizing totalFreedSize = new NonThreadSafeMemStoreSizing(); 6210 this.updatesLock.writeLock().lock(); 6211 try { 6212 for (HStore s : stores.values()) { 6213 MemStoreSize memStoreSize = doDropStoreMemStoreContentsForSeqId(s, HConstants.NO_SEQNUM); 6214 LOG.info("Drop memstore for Store " + s.getColumnFamilyName() + " in region " 6215 + this.getRegionInfo().getRegionNameAsString() + " , dropped memstoresize: [" 6216 + memStoreSize + " }"); 6217 totalFreedSize.incMemStoreSize(memStoreSize); 6218 } 6219 return totalFreedSize.getMemStoreSize(); 6220 } finally { 6221 this.updatesLock.writeLock().unlock(); 6222 } 6223 } 6224 6225 /** 6226 * Drops the memstore contents after replaying a flush descriptor or region open event replay if 6227 * the memstore edits have seqNums smaller than the given seq id 6228 */ 6229 private MemStoreSize dropMemStoreContentsForSeqId(long seqId, HStore store) throws IOException { 6230 MemStoreSizing totalFreedSize = new NonThreadSafeMemStoreSizing(); 6231 this.updatesLock.writeLock().lock(); 6232 try { 6233 6234 long currentSeqId = mvcc.getReadPoint(); 6235 if (seqId >= currentSeqId) { 6236 // then we can drop the memstore contents since everything is below this seqId 6237 LOG.info(getRegionInfo().getEncodedName() + " : " 6238 + "Dropping memstore contents as well since replayed flush seqId: " + seqId 6239 + " is greater than current seqId:" + currentSeqId); 6240 6241 // Prepare flush (take a snapshot) and then abort (drop the snapshot) 6242 if (store == null) { 6243 for (HStore s : stores.values()) { 6244 totalFreedSize.incMemStoreSize(doDropStoreMemStoreContentsForSeqId(s, currentSeqId)); 6245 } 6246 } else { 6247 totalFreedSize.incMemStoreSize(doDropStoreMemStoreContentsForSeqId(store, currentSeqId)); 6248 } 6249 } else { 6250 LOG.info(getRegionInfo().getEncodedName() + " : " 6251 + "Not dropping memstore contents since replayed flush seqId: " + seqId 6252 + " is smaller than current seqId:" + currentSeqId); 6253 } 6254 } finally { 6255 this.updatesLock.writeLock().unlock(); 6256 } 6257 return totalFreedSize.getMemStoreSize(); 6258 } 6259 6260 private MemStoreSize doDropStoreMemStoreContentsForSeqId(HStore s, long currentSeqId) 6261 throws IOException { 6262 MemStoreSize flushableSize = s.getFlushableSize(); 6263 this.decrMemStoreSize(flushableSize); 6264 StoreFlushContext ctx = s.createFlushContext(currentSeqId, FlushLifeCycleTracker.DUMMY); 6265 ctx.prepare(); 6266 ctx.abort(); 6267 return flushableSize; 6268 } 6269 6270 private void replayWALFlushAbortMarker(FlushDescriptor flush) { 6271 // nothing to do for now. A flush abort will cause a RS abort which means that the region 6272 // will be opened somewhere else later. We will see the region open event soon, and replaying 6273 // that will drop the snapshot 6274 } 6275 6276 private void replayWALFlushCannotFlushMarker(FlushDescriptor flush, long replaySeqId) { 6277 synchronized (writestate) { 6278 if (this.lastReplayedOpenRegionSeqId > replaySeqId) { 6279 LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying flush event :" 6280 + TextFormat.shortDebugString(flush) + " because its sequence id " + replaySeqId 6281 + " is smaller than this regions " + "lastReplayedOpenRegionSeqId of " 6282 + lastReplayedOpenRegionSeqId); 6283 return; 6284 } 6285 6286 // If we were waiting for observing a flush or region opening event for not showing partial 6287 // data after a secondary region crash, we can allow reads now. This event means that the 6288 // primary was not able to flush because memstore is empty when we requested flush. By the 6289 // time we observe this, we are guaranteed to have up to date seqId with our previous 6290 // assignment. 6291 this.setReadsEnabled(true); 6292 } 6293 } 6294 6295 PrepareFlushResult getPrepareFlushResult() { 6296 return prepareFlushResult; 6297 } 6298 6299 /** 6300 * @deprecated Since 3.0.0, will be removed in 4.0.0. Only for keep compatibility for old region 6301 * replica implementation. 6302 */ 6303 @Deprecated 6304 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY", 6305 justification = "Intentional; cleared the memstore") 6306 void replayWALRegionEventMarker(RegionEventDescriptor regionEvent) throws IOException { 6307 checkTargetRegion(regionEvent.getEncodedRegionName().toByteArray(), 6308 "RegionEvent marker from WAL ", regionEvent); 6309 6310 startRegionOperation(Operation.REPLAY_EVENT); 6311 try { 6312 if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { 6313 return; // if primary nothing to do 6314 } 6315 6316 if (regionEvent.getEventType() == EventType.REGION_CLOSE) { 6317 // nothing to do on REGION_CLOSE for now. 6318 return; 6319 } 6320 if (regionEvent.getEventType() != EventType.REGION_OPEN) { 6321 LOG.warn(getRegionInfo().getEncodedName() + " : " 6322 + "Unknown region event received, ignoring :" + TextFormat.shortDebugString(regionEvent)); 6323 return; 6324 } 6325 6326 if (LOG.isDebugEnabled()) { 6327 LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying region open event marker " 6328 + TextFormat.shortDebugString(regionEvent)); 6329 } 6330 6331 // we will use writestate as a coarse-grain lock for all the replay events 6332 synchronized (writestate) { 6333 // Replication can deliver events out of order when primary region moves or the region 6334 // server crashes, since there is no coordination between replication of different wal files 6335 // belonging to different region servers. We have to safe guard against this case by using 6336 // region open event's seqid. Since this is the first event that the region puts (after 6337 // possibly flushing recovered.edits), after seeing this event, we can ignore every edit 6338 // smaller than this seqId 6339 if (this.lastReplayedOpenRegionSeqId <= regionEvent.getLogSequenceNumber()) { 6340 this.lastReplayedOpenRegionSeqId = regionEvent.getLogSequenceNumber(); 6341 } else { 6342 LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying region event :" 6343 + TextFormat.shortDebugString(regionEvent) 6344 + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId " 6345 + " of " + lastReplayedOpenRegionSeqId); 6346 return; 6347 } 6348 6349 // region open lists all the files that the region has at the time of the opening. Just pick 6350 // all the files and drop prepared flushes and empty memstores 6351 for (StoreDescriptor storeDescriptor : regionEvent.getStoresList()) { 6352 // stores of primary may be different now 6353 byte[] family = storeDescriptor.getFamilyName().toByteArray(); 6354 HStore store = getStore(family); 6355 if (store == null) { 6356 LOG.warn(getRegionInfo().getEncodedName() + " : " 6357 + "Received a region open marker from primary, but the family is not found. " 6358 + "Ignoring. StoreDescriptor:" + storeDescriptor); 6359 continue; 6360 } 6361 6362 long storeSeqId = store.getMaxSequenceId().orElse(0L); 6363 List<String> storeFiles = storeDescriptor.getStoreFileList(); 6364 try { 6365 store.refreshStoreFiles(storeFiles); // replace the files with the new ones 6366 } catch (FileNotFoundException ex) { 6367 LOG.warn(getRegionInfo().getEncodedName() + " : " + "At least one of the store files: " 6368 + storeFiles + " doesn't exist any more. Skip loading the file(s)", ex); 6369 continue; 6370 } 6371 if (store.getMaxSequenceId().orElse(0L) != storeSeqId) { 6372 // Record latest flush time if we picked up new files 6373 lastStoreFlushTimeMap.put(store, EnvironmentEdgeManager.currentTime()); 6374 } 6375 6376 if (writestate.flushing) { 6377 // only drop memstore snapshots if they are smaller than last flush for the store 6378 if (this.prepareFlushResult.flushOpSeqId <= regionEvent.getLogSequenceNumber()) { 6379 StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null 6380 ? null 6381 : this.prepareFlushResult.storeFlushCtxs.get(family); 6382 if (ctx != null) { 6383 MemStoreSize mss = store.getFlushableSize(); 6384 ctx.abort(); 6385 this.decrMemStoreSize(mss); 6386 this.prepareFlushResult.storeFlushCtxs.remove(family); 6387 } 6388 } 6389 } 6390 6391 // Drop the memstore contents if they are now smaller than the latest seen flushed file 6392 dropMemStoreContentsForSeqId(regionEvent.getLogSequenceNumber(), store); 6393 if (storeSeqId > this.maxFlushedSeqId) { 6394 this.maxFlushedSeqId = storeSeqId; 6395 } 6396 } 6397 6398 // if all stores ended up dropping their snapshots, we can safely drop the 6399 // prepareFlushResult 6400 dropPrepareFlushIfPossible(); 6401 6402 // advance the mvcc read point so that the new flushed file is visible. 6403 mvcc.await(); 6404 6405 // If we were waiting for observing a flush or region opening event for not showing partial 6406 // data after a secondary region crash, we can allow reads now. 6407 this.setReadsEnabled(true); 6408 6409 // C. Finally notify anyone waiting on memstore to clear: 6410 // e.g. checkResources(). 6411 synchronized (this) { 6412 notifyAll(); // FindBugs NN_NAKED_NOTIFY 6413 } 6414 } 6415 logRegionFiles(); 6416 } finally { 6417 closeRegionOperation(Operation.REPLAY_EVENT); 6418 } 6419 } 6420 6421 /** 6422 * @deprecated Since 3.0.0, will be removed in 4.0.0. Only for keep compatibility for old region 6423 * replica implementation. 6424 */ 6425 @Deprecated 6426 void replayWALBulkLoadEventMarker(WALProtos.BulkLoadDescriptor bulkLoadEvent) throws IOException { 6427 checkTargetRegion(bulkLoadEvent.getEncodedRegionName().toByteArray(), 6428 "BulkLoad marker from WAL ", bulkLoadEvent); 6429 6430 if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { 6431 return; // if primary nothing to do 6432 } 6433 6434 if (LOG.isDebugEnabled()) { 6435 LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying bulkload event marker " 6436 + TextFormat.shortDebugString(bulkLoadEvent)); 6437 } 6438 // check if multiple families involved 6439 boolean multipleFamilies = false; 6440 byte[] family = null; 6441 for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) { 6442 byte[] fam = storeDescriptor.getFamilyName().toByteArray(); 6443 if (family == null) { 6444 family = fam; 6445 } else if (!Bytes.equals(family, fam)) { 6446 multipleFamilies = true; 6447 break; 6448 } 6449 } 6450 6451 startBulkRegionOperation(multipleFamilies); 6452 try { 6453 // we will use writestate as a coarse-grain lock for all the replay events 6454 synchronized (writestate) { 6455 // Replication can deliver events out of order when primary region moves or the region 6456 // server crashes, since there is no coordination between replication of different wal files 6457 // belonging to different region servers. We have to safe guard against this case by using 6458 // region open event's seqid. Since this is the first event that the region puts (after 6459 // possibly flushing recovered.edits), after seeing this event, we can ignore every edit 6460 // smaller than this seqId 6461 if ( 6462 bulkLoadEvent.getBulkloadSeqNum() >= 0 6463 && this.lastReplayedOpenRegionSeqId >= bulkLoadEvent.getBulkloadSeqNum() 6464 ) { 6465 LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying bulkload event :" 6466 + TextFormat.shortDebugString(bulkLoadEvent) 6467 + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId" 6468 + " =" + lastReplayedOpenRegionSeqId); 6469 6470 return; 6471 } 6472 6473 for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) { 6474 // stores of primary may be different now 6475 family = storeDescriptor.getFamilyName().toByteArray(); 6476 HStore store = getStore(family); 6477 if (store == null) { 6478 LOG.warn(getRegionInfo().getEncodedName() + " : " 6479 + "Received a bulk load marker from primary, but the family is not found. " 6480 + "Ignoring. StoreDescriptor:" + storeDescriptor); 6481 continue; 6482 } 6483 6484 List<String> storeFiles = storeDescriptor.getStoreFileList(); 6485 for (String storeFile : storeFiles) { 6486 StoreFileInfo storeFileInfo = null; 6487 try { 6488 storeFileInfo = fs.getStoreFileInfo(Bytes.toString(family), storeFile); 6489 store.bulkLoadHFile(storeFileInfo); 6490 } catch (FileNotFoundException ex) { 6491 LOG.warn(getRegionInfo().getEncodedName() + " : " 6492 + ((storeFileInfo != null) 6493 ? storeFileInfo.toString() 6494 : (new Path(Bytes.toString(family), storeFile)).toString()) 6495 + " doesn't exist any more. Skip loading the file"); 6496 } 6497 } 6498 } 6499 } 6500 if (bulkLoadEvent.getBulkloadSeqNum() > 0) { 6501 mvcc.advanceTo(bulkLoadEvent.getBulkloadSeqNum()); 6502 } 6503 } finally { 6504 closeBulkRegionOperation(); 6505 } 6506 } 6507 6508 /** 6509 * Replay the batch mutate for secondary replica. 6510 * <p/> 6511 * We will directly apply the cells to the memstore. This is because: 6512 * <ol> 6513 * <li>All the cells are gotten from {@link WALEdit}, so we only have {@link Put} and 6514 * {@link Delete} here</li> 6515 * <li>The replay is single threaded, we do not need to acquire row lock, as the region is read 6516 * only so no one else can write it.</li> 6517 * <li>We do not need to write WAL.</li> 6518 * <li>We will advance MVCC in the caller directly.</li> 6519 * </ol> 6520 */ 6521 private void replayWALBatchMutate(Map<byte[], List<Cell>> family2Cells) throws IOException { 6522 startRegionOperation(Operation.REPLAY_BATCH_MUTATE); 6523 try { 6524 for (Map.Entry<byte[], List<Cell>> entry : family2Cells.entrySet()) { 6525 applyToMemStore(getStore(entry.getKey()), entry.getValue(), false, memStoreSizing); 6526 } 6527 } finally { 6528 closeRegionOperation(Operation.REPLAY_BATCH_MUTATE); 6529 } 6530 } 6531 6532 /** 6533 * Replay the meta edits, i.e, flush marker, compaction marker, bulk load marker, region event 6534 * marker, etc. 6535 * <p/> 6536 * For all events other than start flush, we will just call {@link #refreshStoreFiles()} as the 6537 * logic is straight-forward and robust. For start flush, we need to snapshot the memstore, so 6538 * later {@link #refreshStoreFiles()} call could drop the snapshot, otherwise we may run out of 6539 * memory. 6540 */ 6541 private void replayWALMetaEdit(Cell cell) throws IOException { 6542 startRegionOperation(Operation.REPLAY_EVENT); 6543 try { 6544 FlushDescriptor flushDesc = WALEdit.getFlushDescriptor(cell); 6545 if (flushDesc != null) { 6546 switch (flushDesc.getAction()) { 6547 case START_FLUSH: 6548 // for start flush, we need to take a snapshot of the current memstore 6549 synchronized (writestate) { 6550 if (!writestate.flushing) { 6551 this.writestate.flushing = true; 6552 } else { 6553 // usually this should not happen but let's make the code more robust, it is not a 6554 // big deal to just ignore it, the refreshStoreFiles call should have the ability to 6555 // clean up the inconsistent state. 6556 LOG.debug("NOT flushing {} as already flushing", getRegionInfo()); 6557 break; 6558 } 6559 } 6560 MonitoredTask status = 6561 TaskMonitor.get().createStatus("Preparing flush " + getRegionInfo()); 6562 Collection<HStore> storesToFlush = getStoresToFlush(flushDesc); 6563 try { 6564 PrepareFlushResult prepareResult = 6565 internalPrepareFlushCache(null, flushDesc.getFlushSequenceNumber(), storesToFlush, 6566 status, false, FlushLifeCycleTracker.DUMMY); 6567 if (prepareResult.result == null) { 6568 // save the PrepareFlushResult so that we can use it later from commit flush 6569 this.prepareFlushResult = prepareResult; 6570 status.markComplete("Flush prepare successful"); 6571 if (LOG.isDebugEnabled()) { 6572 LOG.debug("{} prepared flush with seqId: {}", getRegionInfo(), 6573 flushDesc.getFlushSequenceNumber()); 6574 } 6575 } else { 6576 // special case empty memstore. We will still save the flush result in this case, 6577 // since our memstore is empty, but the primary is still flushing 6578 if ( 6579 prepareResult.getResult().getResult() 6580 == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY 6581 ) { 6582 this.prepareFlushResult = prepareResult; 6583 if (LOG.isDebugEnabled()) { 6584 LOG.debug("{} prepared empty flush with seqId: {}", getRegionInfo(), 6585 flushDesc.getFlushSequenceNumber()); 6586 } 6587 } 6588 status.abort("Flush prepare failed with " + prepareResult.result); 6589 // nothing much to do. prepare flush failed because of some reason. 6590 } 6591 } finally { 6592 status.cleanup(); 6593 } 6594 break; 6595 case ABORT_FLUSH: 6596 // do nothing, an abort flush means the source region server will crash itself, after 6597 // the primary region online, it will send us an open region marker, then we can clean 6598 // up the memstore. 6599 synchronized (writestate) { 6600 writestate.flushing = false; 6601 } 6602 break; 6603 case COMMIT_FLUSH: 6604 case CANNOT_FLUSH: 6605 // just call refreshStoreFiles 6606 refreshStoreFiles(); 6607 logRegionFiles(); 6608 synchronized (writestate) { 6609 writestate.flushing = false; 6610 } 6611 break; 6612 default: 6613 LOG.warn("{} received a flush event with unknown action: {}", getRegionInfo(), 6614 TextFormat.shortDebugString(flushDesc)); 6615 } 6616 } else { 6617 // for all other region events, we will do a refreshStoreFiles 6618 refreshStoreFiles(); 6619 logRegionFiles(); 6620 } 6621 } finally { 6622 closeRegionOperation(Operation.REPLAY_EVENT); 6623 } 6624 } 6625 6626 /** 6627 * Replay remote wal entry sent by primary replica. 6628 * <p/> 6629 * Should only call this method on secondary replicas. 6630 */ 6631 void replayWALEntry(WALEntry entry, CellScanner cells) throws IOException { 6632 long timeout = -1L; 6633 Optional<RpcCall> call = RpcServer.getCurrentCall(); 6634 if (call.isPresent()) { 6635 long deadline = call.get().getDeadline(); 6636 if (deadline < Long.MAX_VALUE) { 6637 timeout = deadline - EnvironmentEdgeManager.currentTime(); 6638 if (timeout <= 0) { 6639 throw new TimeoutIOException("Timeout while replaying edits for " + getRegionInfo()); 6640 } 6641 } 6642 } 6643 if (timeout > 0) { 6644 try { 6645 if (!replayLock.tryLock(timeout, TimeUnit.MILLISECONDS)) { 6646 throw new TimeoutIOException( 6647 "Timeout while waiting for lock when replaying edits for " + getRegionInfo()); 6648 } 6649 } catch (InterruptedException e) { 6650 throw throwOnInterrupt(e); 6651 } 6652 } else { 6653 replayLock.lock(); 6654 } 6655 try { 6656 int count = entry.getAssociatedCellCount(); 6657 long sequenceId = entry.getKey().getLogSequenceNumber(); 6658 if (lastReplayedSequenceId >= sequenceId) { 6659 // we have already replayed this edit, skip 6660 // remember to advance the CellScanner, as we may have multiple WALEntries, we may still 6661 // need apply later WALEntries 6662 for (int i = 0; i < count; i++) { 6663 // Throw index out of bounds if our cell count is off 6664 if (!cells.advance()) { 6665 throw new ArrayIndexOutOfBoundsException("Expected=" + count + ", index=" + i); 6666 } 6667 } 6668 return; 6669 } 6670 Map<byte[], List<Cell>> family2Cells = new TreeMap<>(Bytes.BYTES_COMPARATOR); 6671 for (int i = 0; i < count; i++) { 6672 // Throw index out of bounds if our cell count is off 6673 if (!cells.advance()) { 6674 throw new ArrayIndexOutOfBoundsException("Expected=" + count + ", index=" + i); 6675 } 6676 Cell cell = cells.current(); 6677 if (WALEdit.isMetaEditFamily(cell)) { 6678 // If there is meta edit, i.e, we have done flush/compaction/open, then we need to apply 6679 // the previous cells first, and then replay the special meta edit. The meta edit is like 6680 // a barrier, We need to keep the order. For example, the flush marker will contain a 6681 // flush sequence number, which makes us possible to drop memstore content, but if we 6682 // apply some edits which have greater sequence id first, then we can not drop the 6683 // memstore content when replaying the flush marker, which is not good as we could run out 6684 // of memory. 6685 // And usually, a meta edit will have a special WALEntry for it, so this is just a safe 6686 // guard logic to make sure we do not break things in the worst case. 6687 if (!family2Cells.isEmpty()) { 6688 replayWALBatchMutate(family2Cells); 6689 family2Cells.clear(); 6690 } 6691 replayWALMetaEdit(cell); 6692 } else { 6693 family2Cells.computeIfAbsent(CellUtil.cloneFamily(cell), k -> new ArrayList<>()) 6694 .add(cell); 6695 } 6696 } 6697 // do not forget to apply the remaining cells 6698 if (!family2Cells.isEmpty()) { 6699 replayWALBatchMutate(family2Cells); 6700 } 6701 mvcc.advanceTo(sequenceId); 6702 lastReplayedSequenceId = sequenceId; 6703 } finally { 6704 replayLock.unlock(); 6705 } 6706 } 6707 6708 /** 6709 * If all stores ended up dropping their snapshots, we can safely drop the prepareFlushResult 6710 */ 6711 private void dropPrepareFlushIfPossible() { 6712 if (writestate.flushing) { 6713 boolean canDrop = true; 6714 if (prepareFlushResult.storeFlushCtxs != null) { 6715 for (Entry<byte[], StoreFlushContext> entry : prepareFlushResult.storeFlushCtxs 6716 .entrySet()) { 6717 HStore store = getStore(entry.getKey()); 6718 if (store == null) { 6719 continue; 6720 } 6721 if (store.getSnapshotSize().getDataSize() > 0) { 6722 canDrop = false; 6723 break; 6724 } 6725 } 6726 } 6727 6728 // this means that all the stores in the region has finished flushing, but the WAL marker 6729 // may not have been written or we did not receive it yet. 6730 if (canDrop) { 6731 writestate.flushing = false; 6732 this.prepareFlushResult = null; 6733 } 6734 } 6735 } 6736 6737 @Override 6738 public boolean refreshStoreFiles() throws IOException { 6739 return refreshStoreFiles(false); 6740 } 6741 6742 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY", 6743 justification = "Notify is about post replay. Intentional") 6744 protected boolean refreshStoreFiles(boolean force) throws IOException { 6745 if (!force && ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { 6746 return false; // if primary nothing to do 6747 } 6748 6749 if (LOG.isDebugEnabled()) { 6750 LOG.debug(getRegionInfo().getEncodedName() + " : " 6751 + "Refreshing store files to see whether we can free up memstore"); 6752 } 6753 6754 long totalFreedDataSize = 0; 6755 6756 long smallestSeqIdInStores = Long.MAX_VALUE; 6757 6758 startRegionOperation(); // obtain region close lock 6759 try { 6760 Map<HStore, Long> map = new HashMap<>(); 6761 synchronized (writestate) { 6762 for (HStore store : stores.values()) { 6763 // TODO: some stores might see new data from flush, while others do not which 6764 // MIGHT break atomic edits across column families. 6765 long maxSeqIdBefore = store.getMaxSequenceId().orElse(0L); 6766 6767 // refresh the store files. This is similar to observing a region open wal marker. 6768 store.refreshStoreFiles(); 6769 6770 long storeSeqId = store.getMaxSequenceId().orElse(0L); 6771 if (storeSeqId < smallestSeqIdInStores) { 6772 smallestSeqIdInStores = storeSeqId; 6773 } 6774 6775 // see whether we can drop the memstore or the snapshot 6776 if (storeSeqId > maxSeqIdBefore) { 6777 if (writestate.flushing) { 6778 // only drop memstore snapshots if they are smaller than last flush for the store 6779 if (this.prepareFlushResult.flushOpSeqId <= storeSeqId) { 6780 StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null 6781 ? null 6782 : this.prepareFlushResult.storeFlushCtxs 6783 .get(store.getColumnFamilyDescriptor().getName()); 6784 if (ctx != null) { 6785 MemStoreSize mss = store.getFlushableSize(); 6786 ctx.abort(); 6787 this.decrMemStoreSize(mss); 6788 this.prepareFlushResult.storeFlushCtxs 6789 .remove(store.getColumnFamilyDescriptor().getName()); 6790 totalFreedDataSize += mss.getDataSize(); 6791 } 6792 } 6793 } 6794 6795 map.put(store, storeSeqId); 6796 } 6797 } 6798 6799 // if all stores ended up dropping their snapshots, we can safely drop the 6800 // prepareFlushResult 6801 dropPrepareFlushIfPossible(); 6802 6803 // advance the mvcc read point so that the new flushed files are visible. 6804 // either greater than flush seq number or they were already picked up via flush. 6805 for (HStore s : stores.values()) { 6806 mvcc.advanceTo(s.getMaxMemStoreTS().orElse(0L)); 6807 } 6808 6809 // smallestSeqIdInStores is the seqId that we have a corresponding hfile for. We can safely 6810 // skip all edits that are to be replayed in the future with that has a smaller seqId 6811 // than this. We are updating lastReplayedOpenRegionSeqId so that we can skip all edits 6812 // that we have picked the flush files for 6813 if (this.lastReplayedOpenRegionSeqId < smallestSeqIdInStores) { 6814 this.lastReplayedOpenRegionSeqId = smallestSeqIdInStores; 6815 } 6816 } 6817 if (!map.isEmpty()) { 6818 for (Map.Entry<HStore, Long> entry : map.entrySet()) { 6819 // Drop the memstore contents if they are now smaller than the latest seen flushed file 6820 totalFreedDataSize += 6821 dropMemStoreContentsForSeqId(entry.getValue(), entry.getKey()).getDataSize(); 6822 } 6823 } 6824 // C. Finally notify anyone waiting on memstore to clear: 6825 // e.g. checkResources(). 6826 synchronized (this) { 6827 notifyAll(); // FindBugs NN_NAKED_NOTIFY 6828 } 6829 return totalFreedDataSize > 0; 6830 } finally { 6831 closeRegionOperation(); 6832 } 6833 } 6834 6835 private void logRegionFiles() { 6836 if (LOG.isTraceEnabled()) { 6837 LOG.trace(getRegionInfo().getEncodedName() + " : Store files for region: "); 6838 stores.values().stream().filter(s -> s.getStorefiles() != null) 6839 .flatMap(s -> s.getStorefiles().stream()) 6840 .forEachOrdered(sf -> LOG.trace(getRegionInfo().getEncodedName() + " : " + sf)); 6841 } 6842 } 6843 6844 /** 6845 * Checks whether the given regionName is either equal to our region, or that the regionName is 6846 * the primary region to our corresponding range for the secondary replica. 6847 */ 6848 private void checkTargetRegion(byte[] encodedRegionName, String exceptionMsg, Object payload) 6849 throws WrongRegionException { 6850 if (Bytes.equals(this.getRegionInfo().getEncodedNameAsBytes(), encodedRegionName)) { 6851 return; 6852 } 6853 6854 if ( 6855 !RegionReplicaUtil.isDefaultReplica(this.getRegionInfo()) 6856 && Bytes.equals(encodedRegionName, this.fs.getRegionInfoForFS().getEncodedNameAsBytes()) 6857 ) { 6858 return; 6859 } 6860 6861 throw new WrongRegionException( 6862 exceptionMsg + payload + " targetted for region " + Bytes.toStringBinary(encodedRegionName) 6863 + " does not match this region: " + this.getRegionInfo()); 6864 } 6865 6866 /** 6867 * Used by tests 6868 * @param s Store to add edit too. 6869 * @param cell Cell to add. 6870 */ 6871 protected void restoreEdit(HStore s, Cell cell, MemStoreSizing memstoreAccounting) { 6872 s.add(cell, memstoreAccounting); 6873 } 6874 6875 /** 6876 * make sure have been through lease recovery before get file status, so the file length can be 6877 * trusted. 6878 * @param p File to check. 6879 * @return True if file was zero-length (and if so, we'll delete it in here). 6880 */ 6881 private static boolean isZeroLengthThenDelete(final FileSystem fs, final FileStatus stat, 6882 final Path p) throws IOException { 6883 if (stat.getLen() > 0) { 6884 return false; 6885 } 6886 LOG.warn("File " + p + " is zero-length, deleting."); 6887 fs.delete(p, false); 6888 return true; 6889 } 6890 6891 protected HStore instantiateHStore(final ColumnFamilyDescriptor family, boolean warmup) 6892 throws IOException { 6893 if (family.isMobEnabled()) { 6894 if (HFile.getFormatVersion(this.conf) < HFile.MIN_FORMAT_VERSION_WITH_TAGS) { 6895 throw new IOException("A minimum HFile version of " + HFile.MIN_FORMAT_VERSION_WITH_TAGS 6896 + " is required for MOB feature. Consider setting " + HFile.FORMAT_VERSION_KEY 6897 + " accordingly."); 6898 } 6899 return new HMobStore(this, family, this.conf, warmup); 6900 } 6901 return new HStore(this, family, this.conf, warmup); 6902 } 6903 6904 @Override 6905 public HStore getStore(byte[] column) { 6906 return this.stores.get(column); 6907 } 6908 6909 /** 6910 * Return HStore instance. Does not do any copy: as the number of store is limited, we iterate on 6911 * the list. 6912 */ 6913 private HStore getStore(Cell cell) { 6914 return stores.entrySet().stream().filter(e -> CellUtil.matchingFamily(cell, e.getKey())) 6915 .map(e -> e.getValue()).findFirst().orElse(null); 6916 } 6917 6918 @Override 6919 public List<HStore> getStores() { 6920 return new ArrayList<>(stores.values()); 6921 } 6922 6923 @Override 6924 public List<String> getStoreFileList(byte[][] columns) throws IllegalArgumentException { 6925 List<String> storeFileNames = new ArrayList<>(); 6926 synchronized (closeLock) { 6927 for (byte[] column : columns) { 6928 HStore store = this.stores.get(column); 6929 if (store == null) { 6930 throw new IllegalArgumentException( 6931 "No column family : " + new String(column, StandardCharsets.UTF_8) + " available"); 6932 } 6933 Collection<HStoreFile> storeFiles = store.getStorefiles(); 6934 if (storeFiles == null) { 6935 continue; 6936 } 6937 for (HStoreFile storeFile : storeFiles) { 6938 storeFileNames.add(storeFile.getPath().toString()); 6939 } 6940 6941 logRegionFiles(); 6942 } 6943 } 6944 return storeFileNames; 6945 } 6946 6947 ////////////////////////////////////////////////////////////////////////////// 6948 // Support code 6949 ////////////////////////////////////////////////////////////////////////////// 6950 6951 /** Make sure this is a valid row for the HRegion */ 6952 void checkRow(byte[] row, String op) throws IOException { 6953 if (!rowIsInRange(getRegionInfo(), row)) { 6954 throw new WrongRegionException("Requested row out of range for " + op + " on HRegion " + this 6955 + ", startKey='" + Bytes.toStringBinary(getRegionInfo().getStartKey()) + "', getEndKey()='" 6956 + Bytes.toStringBinary(getRegionInfo().getEndKey()) + "', row='" + Bytes.toStringBinary(row) 6957 + "'"); 6958 } 6959 } 6960 6961 /** 6962 * Get an exclusive ( write lock ) lock on a given row. 6963 * @param row Which row to lock. 6964 * @return A locked RowLock. The lock is exclusive and already aqquired. 6965 */ 6966 public RowLock getRowLock(byte[] row) throws IOException { 6967 return getRowLock(row, false); 6968 } 6969 6970 @Override 6971 public RowLock getRowLock(byte[] row, boolean readLock) throws IOException { 6972 checkRow(row, "row lock"); 6973 return getRowLock(row, readLock, null); 6974 } 6975 6976 Span createRegionSpan(String name) { 6977 return TraceUtil.createSpan(name).setAttribute(REGION_NAMES_KEY, 6978 Collections.singletonList(getRegionInfo().getRegionNameAsString())); 6979 } 6980 6981 // will be override in tests 6982 protected RowLock getRowLockInternal(byte[] row, boolean readLock, RowLock prevRowLock) 6983 throws IOException { 6984 // create an object to use a a key in the row lock map 6985 HashedBytes rowKey = new HashedBytes(row); 6986 6987 RowLockContext rowLockContext = null; 6988 RowLockImpl result = null; 6989 6990 boolean success = false; 6991 try { 6992 // Keep trying until we have a lock or error out. 6993 // TODO: do we need to add a time component here? 6994 while (result == null) { 6995 rowLockContext = computeIfAbsent(lockedRows, rowKey, () -> new RowLockContext(rowKey)); 6996 // Now try an get the lock. 6997 // This can fail as 6998 if (readLock) { 6999 // For read lock, if the caller has locked the same row previously, it will not try 7000 // to acquire the same read lock. It simply returns the previous row lock. 7001 RowLockImpl prevRowLockImpl = (RowLockImpl) prevRowLock; 7002 if ( 7003 (prevRowLockImpl != null) 7004 && (prevRowLockImpl.getLock() == rowLockContext.readWriteLock.readLock()) 7005 ) { 7006 success = true; 7007 return prevRowLock; 7008 } 7009 result = rowLockContext.newReadLock(); 7010 } else { 7011 result = rowLockContext.newWriteLock(); 7012 } 7013 } 7014 7015 int timeout = rowLockWaitDuration; 7016 boolean reachDeadlineFirst = false; 7017 Optional<RpcCall> call = RpcServer.getCurrentCall(); 7018 if (call.isPresent()) { 7019 long deadline = call.get().getDeadline(); 7020 if (deadline < Long.MAX_VALUE) { 7021 int timeToDeadline = (int) (deadline - EnvironmentEdgeManager.currentTime()); 7022 if (timeToDeadline <= this.rowLockWaitDuration) { 7023 reachDeadlineFirst = true; 7024 timeout = timeToDeadline; 7025 } 7026 } 7027 } 7028 7029 if (timeout <= 0 || !result.getLock().tryLock(timeout, TimeUnit.MILLISECONDS)) { 7030 String message = "Timed out waiting for lock for row: " + rowKey + " in region " 7031 + getRegionInfo().getEncodedName(); 7032 if (reachDeadlineFirst) { 7033 throw new TimeoutIOException(message); 7034 } else { 7035 // If timeToDeadline is larger than rowLockWaitDuration, we can not drop the request. 7036 throw new IOException(message); 7037 } 7038 } 7039 rowLockContext.setThreadName(Thread.currentThread().getName()); 7040 success = true; 7041 return result; 7042 } catch (InterruptedException ie) { 7043 if (LOG.isDebugEnabled()) { 7044 LOG.debug("Thread interrupted waiting for lock on row: {}, in region {}", rowKey, 7045 getRegionInfo().getRegionNameAsString()); 7046 } 7047 throw throwOnInterrupt(ie); 7048 } catch (Error error) { 7049 // The maximum lock count for read lock is 64K (hardcoded), when this maximum count 7050 // is reached, it will throw out an Error. This Error needs to be caught so it can 7051 // go ahead to process the minibatch with lock acquired. 7052 LOG.warn("Error to get row lock for {}, in region {}, cause: {}", Bytes.toStringBinary(row), 7053 getRegionInfo().getRegionNameAsString(), error); 7054 IOException ioe = new IOException(error); 7055 throw ioe; 7056 } finally { 7057 // Clean up the counts just in case this was the thing keeping the context alive. 7058 if (!success && rowLockContext != null) { 7059 rowLockContext.cleanUp(); 7060 } 7061 } 7062 } 7063 7064 private RowLock getRowLock(byte[] row, boolean readLock, final RowLock prevRowLock) 7065 throws IOException { 7066 return TraceUtil.trace(() -> getRowLockInternal(row, readLock, prevRowLock), 7067 () -> createRegionSpan("Region.getRowLock").setAttribute(ROW_LOCK_READ_LOCK_KEY, readLock)); 7068 } 7069 7070 private void releaseRowLocks(List<RowLock> rowLocks) { 7071 if (rowLocks != null) { 7072 for (RowLock rowLock : rowLocks) { 7073 rowLock.release(); 7074 } 7075 rowLocks.clear(); 7076 } 7077 } 7078 7079 public int getReadLockCount() { 7080 return lock.getReadLockCount(); 7081 } 7082 7083 public ConcurrentHashMap<HashedBytes, RowLockContext> getLockedRows() { 7084 return lockedRows; 7085 } 7086 7087 class RowLockContext { 7088 private final HashedBytes row; 7089 final ReadWriteLock readWriteLock = new ReentrantReadWriteLock(true); 7090 final AtomicBoolean usable = new AtomicBoolean(true); 7091 final AtomicInteger count = new AtomicInteger(0); 7092 final Object lock = new Object(); 7093 private String threadName; 7094 7095 RowLockContext(HashedBytes row) { 7096 this.row = row; 7097 } 7098 7099 RowLockImpl newWriteLock() { 7100 Lock l = readWriteLock.writeLock(); 7101 return getRowLock(l); 7102 } 7103 7104 RowLockImpl newReadLock() { 7105 Lock l = readWriteLock.readLock(); 7106 return getRowLock(l); 7107 } 7108 7109 private RowLockImpl getRowLock(Lock l) { 7110 count.incrementAndGet(); 7111 synchronized (lock) { 7112 if (usable.get()) { 7113 return new RowLockImpl(this, l); 7114 } else { 7115 return null; 7116 } 7117 } 7118 } 7119 7120 void cleanUp() { 7121 long c = count.decrementAndGet(); 7122 if (c <= 0) { 7123 synchronized (lock) { 7124 if (count.get() <= 0 && usable.get()) { // Don't attempt to remove row if already removed 7125 usable.set(false); 7126 RowLockContext removed = lockedRows.remove(row); 7127 assert removed == this : "we should never remove a different context"; 7128 } 7129 } 7130 } 7131 } 7132 7133 public void setThreadName(String threadName) { 7134 this.threadName = threadName; 7135 } 7136 7137 @Override 7138 public String toString() { 7139 return "RowLockContext{" + "row=" + row + ", readWriteLock=" + readWriteLock + ", count=" 7140 + count + ", threadName=" + threadName + '}'; 7141 } 7142 } 7143 7144 /** 7145 * Class used to represent a lock on a row. 7146 */ 7147 public static class RowLockImpl implements RowLock { 7148 private final RowLockContext context; 7149 private final Lock lock; 7150 7151 public RowLockImpl(RowLockContext context, Lock lock) { 7152 this.context = context; 7153 this.lock = lock; 7154 } 7155 7156 public Lock getLock() { 7157 return lock; 7158 } 7159 7160 public RowLockContext getContext() { 7161 return context; 7162 } 7163 7164 @Override 7165 public void release() { 7166 lock.unlock(); 7167 context.cleanUp(); 7168 } 7169 7170 @Override 7171 public String toString() { 7172 return "RowLockImpl{" + "context=" + context + ", lock=" + lock + '}'; 7173 } 7174 } 7175 7176 /** 7177 * Determines whether multiple column families are present Precondition: familyPaths is not null 7178 * @param familyPaths List of (column family, hfilePath) 7179 */ 7180 private static boolean hasMultipleColumnFamilies(Collection<Pair<byte[], String>> familyPaths) { 7181 boolean multipleFamilies = false; 7182 byte[] family = null; 7183 for (Pair<byte[], String> pair : familyPaths) { 7184 byte[] fam = pair.getFirst(); 7185 if (family == null) { 7186 family = fam; 7187 } else if (!Bytes.equals(family, fam)) { 7188 multipleFamilies = true; 7189 break; 7190 } 7191 } 7192 return multipleFamilies; 7193 } 7194 7195 /** 7196 * Attempts to atomically load a group of hfiles. This is critical for loading rows with multiple 7197 * column families atomically. 7198 * @param familyPaths List of Pair<byte[] column family, String hfilePath> 7199 * @param bulkLoadListener Internal hooks enabling massaging/preparation of a file about to be 7200 * bulk loaded 7201 * @return Map from family to List of store file paths if successful, null if failed recoverably 7202 * @throws IOException if failed unrecoverably. 7203 */ 7204 public Map<byte[], List<Path>> bulkLoadHFiles(Collection<Pair<byte[], String>> familyPaths, 7205 boolean assignSeqId, BulkLoadListener bulkLoadListener) throws IOException { 7206 return bulkLoadHFiles(familyPaths, assignSeqId, bulkLoadListener, false, null, true); 7207 } 7208 7209 /** 7210 * Listener class to enable callers of bulkLoadHFile() to perform any necessary pre/post 7211 * processing of a given bulkload call 7212 */ 7213 public interface BulkLoadListener { 7214 /** 7215 * Called before an HFile is actually loaded 7216 * @param family family being loaded to 7217 * @param srcPath path of HFile 7218 * @return final path to be used for actual loading 7219 */ 7220 String prepareBulkLoad(byte[] family, String srcPath, boolean copyFile, String customStaging) 7221 throws IOException; 7222 7223 /** 7224 * Called after a successful HFile load 7225 * @param family family being loaded to 7226 * @param srcPath path of HFile 7227 */ 7228 void doneBulkLoad(byte[] family, String srcPath) throws IOException; 7229 7230 /** 7231 * Called after a failed HFile load 7232 * @param family family being loaded to 7233 * @param srcPath path of HFile 7234 */ 7235 void failedBulkLoad(byte[] family, String srcPath) throws IOException; 7236 } 7237 7238 /** 7239 * Attempts to atomically load a group of hfiles. This is critical for loading rows with multiple 7240 * column families atomically. 7241 * @param familyPaths List of Pair<byte[] column family, String hfilePath> 7242 * @param bulkLoadListener Internal hooks enabling massaging/preparation of a file about to be 7243 * bulk loaded 7244 * @param copyFile always copy hfiles if true 7245 * @param clusterIds ids from clusters that had already handled the given bulkload event. 7246 * @return Map from family to List of store file paths if successful, null if failed recoverably 7247 * @throws IOException if failed unrecoverably. 7248 */ 7249 public Map<byte[], List<Path>> bulkLoadHFiles(Collection<Pair<byte[], String>> familyPaths, 7250 boolean assignSeqId, BulkLoadListener bulkLoadListener, boolean copyFile, 7251 List<String> clusterIds, boolean replicate) throws IOException { 7252 long seqId = -1; 7253 Map<byte[], List<Path>> storeFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR); 7254 Map<String, Long> storeFilesSizes = new HashMap<>(); 7255 Preconditions.checkNotNull(familyPaths); 7256 // we need writeLock for multi-family bulk load 7257 startBulkRegionOperation(hasMultipleColumnFamilies(familyPaths)); 7258 boolean isSuccessful = false; 7259 try { 7260 this.writeRequestsCount.increment(); 7261 7262 // There possibly was a split that happened between when the split keys 7263 // were gathered and before the HRegion's write lock was taken. We need 7264 // to validate the HFile region before attempting to bulk load all of them 7265 IOException ioException = null; 7266 List<Pair<byte[], String>> failures = new ArrayList<>(); 7267 for (Pair<byte[], String> p : familyPaths) { 7268 byte[] familyName = p.getFirst(); 7269 String path = p.getSecond(); 7270 7271 HStore store = getStore(familyName); 7272 if (store == null) { 7273 ioException = new org.apache.hadoop.hbase.DoNotRetryIOException( 7274 "No such column family " + Bytes.toStringBinary(familyName)); 7275 } else { 7276 try { 7277 store.assertBulkLoadHFileOk(new Path(path)); 7278 } catch (WrongRegionException wre) { 7279 // recoverable (file doesn't fit in region) 7280 failures.add(p); 7281 } catch (IOException ioe) { 7282 // unrecoverable (hdfs problem) 7283 ioException = ioe; 7284 } 7285 } 7286 7287 // validation failed because of some sort of IO problem. 7288 if (ioException != null) { 7289 LOG.error("There was IO error when checking if the bulk load is ok in region {}.", this, 7290 ioException); 7291 throw ioException; 7292 } 7293 } 7294 // validation failed, bail out before doing anything permanent. 7295 if (failures.size() != 0) { 7296 StringBuilder list = new StringBuilder(); 7297 for (Pair<byte[], String> p : failures) { 7298 list.append("\n").append(Bytes.toString(p.getFirst())).append(" : ") 7299 .append(p.getSecond()); 7300 } 7301 // problem when validating 7302 LOG.warn("There was a recoverable bulk load failure likely due to a split. These (family," 7303 + " HFile) pairs were not loaded: {}, in region {}", list.toString(), this); 7304 return null; 7305 } 7306 7307 // We need to assign a sequential ID that's in between two memstores in order to preserve 7308 // the guarantee that all the edits lower than the highest sequential ID from all the 7309 // HFiles are flushed on disk. See HBASE-10958. The sequence id returned when we flush is 7310 // guaranteed to be one beyond the file made when we flushed (or if nothing to flush, it is 7311 // a sequence id that we can be sure is beyond the last hfile written). 7312 if (assignSeqId) { 7313 FlushResult fs = flushcache(true, false, FlushLifeCycleTracker.DUMMY); 7314 if (fs.isFlushSucceeded()) { 7315 seqId = ((FlushResultImpl) fs).flushSequenceId; 7316 } else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) { 7317 seqId = ((FlushResultImpl) fs).flushSequenceId; 7318 } else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH) { 7319 // CANNOT_FLUSH may mean that a flush is already on-going 7320 // we need to wait for that flush to complete 7321 waitForFlushes(); 7322 } else { 7323 throw new IOException("Could not bulk load with an assigned sequential ID because the " 7324 + "flush didn't run. Reason for not flushing: " + ((FlushResultImpl) fs).failureReason); 7325 } 7326 } 7327 7328 Map<byte[], List<Pair<Path, Path>>> familyWithFinalPath = 7329 new TreeMap<>(Bytes.BYTES_COMPARATOR); 7330 for (Pair<byte[], String> p : familyPaths) { 7331 byte[] familyName = p.getFirst(); 7332 String path = p.getSecond(); 7333 HStore store = getStore(familyName); 7334 if (!familyWithFinalPath.containsKey(familyName)) { 7335 familyWithFinalPath.put(familyName, new ArrayList<>()); 7336 } 7337 List<Pair<Path, Path>> lst = familyWithFinalPath.get(familyName); 7338 String finalPath = path; 7339 try { 7340 boolean reqTmp = store.storeEngine.requireWritingToTmpDirFirst(); 7341 if (bulkLoadListener != null) { 7342 finalPath = bulkLoadListener.prepareBulkLoad(familyName, path, copyFile, 7343 reqTmp ? null : fs.getRegionDir().toString()); 7344 } 7345 Pair<Path, Path> pair = null; 7346 if (reqTmp || !StoreFileInfo.isHFile(finalPath)) { 7347 pair = store.preBulkLoadHFile(finalPath, seqId); 7348 } else { 7349 Path livePath = new Path(finalPath); 7350 pair = new Pair<>(livePath, livePath); 7351 } 7352 lst.add(pair); 7353 } catch (IOException ioe) { 7354 // A failure here can cause an atomicity violation that we currently 7355 // cannot recover from since it is likely a failed HDFS operation. 7356 7357 LOG.error("There was a partial failure due to IO when attempting to" + " load " 7358 + Bytes.toString(p.getFirst()) + " : " + p.getSecond(), ioe); 7359 if (bulkLoadListener != null) { 7360 try { 7361 bulkLoadListener.failedBulkLoad(familyName, finalPath); 7362 } catch (Exception ex) { 7363 LOG.error("Error while calling failedBulkLoad for family " 7364 + Bytes.toString(familyName) + " with path " + path, ex); 7365 } 7366 } 7367 throw ioe; 7368 } 7369 } 7370 7371 if (this.getCoprocessorHost() != null) { 7372 for (Map.Entry<byte[], List<Pair<Path, Path>>> entry : familyWithFinalPath.entrySet()) { 7373 this.getCoprocessorHost().preCommitStoreFile(entry.getKey(), entry.getValue()); 7374 } 7375 } 7376 for (Map.Entry<byte[], List<Pair<Path, Path>>> entry : familyWithFinalPath.entrySet()) { 7377 byte[] familyName = entry.getKey(); 7378 for (Pair<Path, Path> p : entry.getValue()) { 7379 String path = p.getFirst().toString(); 7380 Path commitedStoreFile = p.getSecond(); 7381 HStore store = getStore(familyName); 7382 try { 7383 store.bulkLoadHFile(familyName, path, commitedStoreFile); 7384 // Note the size of the store file 7385 try { 7386 FileSystem fs = commitedStoreFile.getFileSystem(baseConf); 7387 storeFilesSizes.put(commitedStoreFile.getName(), 7388 fs.getFileStatus(commitedStoreFile).getLen()); 7389 } catch (IOException e) { 7390 LOG.warn("Failed to find the size of hfile " + commitedStoreFile, e); 7391 storeFilesSizes.put(commitedStoreFile.getName(), 0L); 7392 } 7393 7394 if (storeFiles.containsKey(familyName)) { 7395 storeFiles.get(familyName).add(commitedStoreFile); 7396 } else { 7397 List<Path> storeFileNames = new ArrayList<>(); 7398 storeFileNames.add(commitedStoreFile); 7399 storeFiles.put(familyName, storeFileNames); 7400 } 7401 if (bulkLoadListener != null) { 7402 bulkLoadListener.doneBulkLoad(familyName, path); 7403 } 7404 } catch (IOException ioe) { 7405 // A failure here can cause an atomicity violation that we currently 7406 // cannot recover from since it is likely a failed HDFS operation. 7407 7408 // TODO Need a better story for reverting partial failures due to HDFS. 7409 LOG.error("There was a partial failure due to IO when attempting to" + " load " 7410 + Bytes.toString(familyName) + " : " + p.getSecond(), ioe); 7411 if (bulkLoadListener != null) { 7412 try { 7413 bulkLoadListener.failedBulkLoad(familyName, path); 7414 } catch (Exception ex) { 7415 LOG.error("Error while calling failedBulkLoad for family " 7416 + Bytes.toString(familyName) + " with path " + path, ex); 7417 } 7418 } 7419 throw ioe; 7420 } 7421 } 7422 } 7423 7424 isSuccessful = true; 7425 if (conf.getBoolean(COMPACTION_AFTER_BULKLOAD_ENABLE, false)) { 7426 // request compaction 7427 familyWithFinalPath.keySet().forEach(family -> { 7428 HStore store = getStore(family); 7429 try { 7430 if (this.rsServices != null && store.needsCompaction()) { 7431 this.rsServices.getCompactionRequestor().requestSystemCompaction(this, store, 7432 "bulkload hfiles request compaction", true); 7433 LOG.info("Request compaction for region {} family {} after bulk load", 7434 this.getRegionInfo().getEncodedName(), store.getColumnFamilyName()); 7435 } 7436 } catch (IOException e) { 7437 LOG.error("bulkload hfiles request compaction error ", e); 7438 } 7439 }); 7440 } 7441 } finally { 7442 if (wal != null && !storeFiles.isEmpty()) { 7443 // Write a bulk load event for hfiles that are loaded 7444 try { 7445 WALProtos.BulkLoadDescriptor loadDescriptor = 7446 ProtobufUtil.toBulkLoadDescriptor(this.getRegionInfo().getTable(), 7447 UnsafeByteOperations.unsafeWrap(this.getRegionInfo().getEncodedNameAsBytes()), 7448 storeFiles, storeFilesSizes, seqId, clusterIds, replicate); 7449 WALUtil.writeBulkLoadMarkerAndSync(this.wal, this.getReplicationScope(), getRegionInfo(), 7450 loadDescriptor, mvcc, regionReplicationSink.orElse(null)); 7451 } catch (IOException ioe) { 7452 if (this.rsServices != null) { 7453 // Have to abort region server because some hfiles has been loaded but we can't write 7454 // the event into WAL 7455 isSuccessful = false; 7456 this.rsServices.abort("Failed to write bulk load event into WAL.", ioe); 7457 } 7458 } 7459 } 7460 7461 closeBulkRegionOperation(); 7462 } 7463 return isSuccessful ? storeFiles : null; 7464 } 7465 7466 @Override 7467 public boolean equals(Object o) { 7468 return o instanceof HRegion && Bytes.equals(getRegionInfo().getRegionName(), 7469 ((HRegion) o).getRegionInfo().getRegionName()); 7470 } 7471 7472 @Override 7473 public int hashCode() { 7474 return Bytes.hashCode(getRegionInfo().getRegionName()); 7475 } 7476 7477 @Override 7478 public String toString() { 7479 return getRegionInfo().getRegionNameAsString(); 7480 } 7481 7482 // Utility methods 7483 /** 7484 * A utility method to create new instances of HRegion based on the {@link HConstants#REGION_IMPL} 7485 * configuration property. 7486 * @param tableDir qualified path of directory where region should be located, usually the table 7487 * directory. 7488 * @param wal The WAL is the outbound log for any updates to the HRegion The wal file is a 7489 * logfile from the previous execution that's custom-computed for this HRegion. 7490 * The HRegionServer computes and sorts the appropriate wal info for this 7491 * HRegion. If there is a previous file (implying that the HRegion has been 7492 * written-to before), then read it from the supplied path. 7493 * @param fs is the filesystem. 7494 * @param conf is global configuration settings. 7495 * @param regionInfo - RegionInfo that describes the region is new), then read them from the 7496 * supplied path. 7497 * @param htd the table descriptor 7498 * @return the new instance 7499 */ 7500 public static HRegion newHRegion(Path tableDir, WAL wal, FileSystem fs, Configuration conf, 7501 RegionInfo regionInfo, final TableDescriptor htd, RegionServerServices rsServices) { 7502 try { 7503 @SuppressWarnings("unchecked") 7504 Class<? extends HRegion> regionClass = 7505 (Class<? extends HRegion>) conf.getClass(HConstants.REGION_IMPL, HRegion.class); 7506 7507 Constructor<? extends HRegion> c = 7508 regionClass.getConstructor(Path.class, WAL.class, FileSystem.class, Configuration.class, 7509 RegionInfo.class, TableDescriptor.class, RegionServerServices.class); 7510 7511 return c.newInstance(tableDir, wal, fs, conf, regionInfo, htd, rsServices); 7512 } catch (Throwable e) { 7513 // todo: what should I throw here? 7514 throw new IllegalStateException("Could not instantiate a region instance.", e); 7515 } 7516 } 7517 7518 /** 7519 * Convenience method creating new HRegions. Used by createTable. 7520 * @param info Info for region to create. 7521 * @param rootDir Root directory for HBase instance 7522 * @param wal shared WAL 7523 * @param initialize - true to initialize the region 7524 * @return new HRegion 7525 */ 7526 public static HRegion createHRegion(final RegionInfo info, final Path rootDir, 7527 final Configuration conf, final TableDescriptor hTableDescriptor, final WAL wal, 7528 final boolean initialize) throws IOException { 7529 return createHRegion(info, rootDir, conf, hTableDescriptor, wal, initialize, null); 7530 } 7531 7532 /** 7533 * Convenience method creating new HRegions. Used by createTable. 7534 * @param info Info for region to create. 7535 * @param rootDir Root directory for HBase instance 7536 * @param wal shared WAL 7537 * @param initialize - true to initialize the region 7538 * @param rsRpcServices An interface we can request flushes against. 7539 * @return new HRegion 7540 */ 7541 public static HRegion createHRegion(final RegionInfo info, final Path rootDir, 7542 final Configuration conf, final TableDescriptor hTableDescriptor, final WAL wal, 7543 final boolean initialize, RegionServerServices rsRpcServices) throws IOException { 7544 LOG.info("creating " + info + ", tableDescriptor=" 7545 + (hTableDescriptor == null ? "null" : hTableDescriptor) + ", regionDir=" + rootDir); 7546 createRegionDir(conf, info, rootDir); 7547 FileSystem fs = rootDir.getFileSystem(conf); 7548 Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable()); 7549 HRegion region = 7550 HRegion.newHRegion(tableDir, wal, fs, conf, info, hTableDescriptor, rsRpcServices); 7551 if (initialize) { 7552 region.initialize(null); 7553 } 7554 return region; 7555 } 7556 7557 /** 7558 * Create a region under the given table directory. 7559 */ 7560 public static HRegion createHRegion(Configuration conf, RegionInfo regionInfo, FileSystem fs, 7561 Path tableDir, TableDescriptor tableDesc) throws IOException { 7562 LOG.info("Creating {}, tableDescriptor={}, under table dir {}", regionInfo, tableDesc, 7563 tableDir); 7564 HRegionFileSystem.createRegionOnFileSystem(conf, fs, tableDir, regionInfo); 7565 HRegion region = HRegion.newHRegion(tableDir, null, fs, conf, regionInfo, tableDesc, null); 7566 return region; 7567 } 7568 7569 /** 7570 * Create the region directory in the filesystem. 7571 */ 7572 public static HRegionFileSystem createRegionDir(Configuration configuration, RegionInfo ri, 7573 Path rootDir) throws IOException { 7574 FileSystem fs = rootDir.getFileSystem(configuration); 7575 Path tableDir = CommonFSUtils.getTableDir(rootDir, ri.getTable()); 7576 // If directory already exists, will log warning and keep going. Will try to create 7577 // .regioninfo. If one exists, will overwrite. 7578 return HRegionFileSystem.createRegionOnFileSystem(configuration, fs, tableDir, ri); 7579 } 7580 7581 public static HRegion createHRegion(final RegionInfo info, final Path rootDir, 7582 final Configuration conf, final TableDescriptor hTableDescriptor, final WAL wal) 7583 throws IOException { 7584 return createHRegion(info, rootDir, conf, hTableDescriptor, wal, true); 7585 } 7586 7587 /** 7588 * Open a Region. 7589 * @param info Info for region to be opened. 7590 * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) passing 7591 * the result of the call to HRegion#getMinSequenceId() to ensure the wal id is 7592 * properly kept up. HRegionStore does this every time it opens a new region. 7593 * @return new HRegion 7594 */ 7595 public static HRegion openHRegion(final RegionInfo info, final TableDescriptor htd, final WAL wal, 7596 final Configuration conf) throws IOException { 7597 return openHRegion(info, htd, wal, conf, null, null); 7598 } 7599 7600 /** 7601 * Open a Region. 7602 * @param info Info for region to be opened 7603 * @param htd the table descriptor 7604 * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) 7605 * passing the result of the call to HRegion#getMinSequenceId() to ensure the 7606 * wal id is properly kept up. HRegionStore does this every time it opens a new 7607 * region. 7608 * @param conf The Configuration object to use. 7609 * @param rsServices An interface we can request flushes against. 7610 * @param reporter An interface we can report progress against. 7611 * @return new HRegion 7612 */ 7613 public static HRegion openHRegion(final RegionInfo info, final TableDescriptor htd, final WAL wal, 7614 final Configuration conf, final RegionServerServices rsServices, 7615 final CancelableProgressable reporter) throws IOException { 7616 return openHRegion(CommonFSUtils.getRootDir(conf), info, htd, wal, conf, rsServices, reporter); 7617 } 7618 7619 /** 7620 * Open a Region. 7621 * @param rootDir Root directory for HBase instance 7622 * @param info Info for region to be opened. 7623 * @param htd the table descriptor 7624 * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) passing 7625 * the result of the call to HRegion#getMinSequenceId() to ensure the wal id is 7626 * properly kept up. HRegionStore does this every time it opens a new region. 7627 * @param conf The Configuration object to use. 7628 * @return new HRegion 7629 */ 7630 public static HRegion openHRegion(Path rootDir, final RegionInfo info, final TableDescriptor htd, 7631 final WAL wal, final Configuration conf) throws IOException { 7632 return openHRegion(rootDir, info, htd, wal, conf, null, null); 7633 } 7634 7635 /** 7636 * Open a Region. 7637 * @param rootDir Root directory for HBase instance 7638 * @param info Info for region to be opened. 7639 * @param htd the table descriptor 7640 * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) 7641 * passing the result of the call to HRegion#getMinSequenceId() to ensure the 7642 * wal id is properly kept up. HRegionStore does this every time it opens a new 7643 * region. 7644 * @param conf The Configuration object to use. 7645 * @param rsServices An interface we can request flushes against. 7646 * @param reporter An interface we can report progress against. 7647 * @return new HRegion 7648 */ 7649 public static HRegion openHRegion(final Path rootDir, final RegionInfo info, 7650 final TableDescriptor htd, final WAL wal, final Configuration conf, 7651 final RegionServerServices rsServices, final CancelableProgressable reporter) 7652 throws IOException { 7653 FileSystem fs = null; 7654 if (rsServices != null) { 7655 fs = rsServices.getFileSystem(); 7656 } 7657 if (fs == null) { 7658 fs = rootDir.getFileSystem(conf); 7659 } 7660 return openHRegion(conf, fs, rootDir, info, htd, wal, rsServices, reporter); 7661 } 7662 7663 /** 7664 * Open a Region. 7665 * @param conf The Configuration object to use. 7666 * @param fs Filesystem to use 7667 * @param rootDir Root directory for HBase instance 7668 * @param info Info for region to be opened. 7669 * @param htd the table descriptor 7670 * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) passing 7671 * the result of the call to HRegion#getMinSequenceId() to ensure the wal id is 7672 * properly kept up. HRegionStore does this every time it opens a new region. 7673 * @return new HRegion 7674 */ 7675 public static HRegion openHRegion(final Configuration conf, final FileSystem fs, 7676 final Path rootDir, final RegionInfo info, final TableDescriptor htd, final WAL wal) 7677 throws IOException { 7678 return openHRegion(conf, fs, rootDir, info, htd, wal, null, null); 7679 } 7680 7681 /** 7682 * Open a Region. 7683 * @param conf The Configuration object to use. 7684 * @param fs Filesystem to use 7685 * @param rootDir Root directory for HBase instance 7686 * @param info Info for region to be opened. 7687 * @param htd the table descriptor 7688 * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) 7689 * passing the result of the call to HRegion#getMinSequenceId() to ensure the 7690 * wal id is properly kept up. HRegionStore does this every time it opens a new 7691 * region. 7692 * @param rsServices An interface we can request flushes against. 7693 * @param reporter An interface we can report progress against. 7694 * @return new HRegion 7695 */ 7696 public static HRegion openHRegion(final Configuration conf, final FileSystem fs, 7697 final Path rootDir, final RegionInfo info, final TableDescriptor htd, final WAL wal, 7698 final RegionServerServices rsServices, final CancelableProgressable reporter) 7699 throws IOException { 7700 Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable()); 7701 return openHRegionFromTableDir(conf, fs, tableDir, info, htd, wal, rsServices, reporter); 7702 } 7703 7704 /** 7705 * Open a Region. 7706 * @param conf The Configuration object to use. 7707 * @param fs Filesystem to use 7708 * @param info Info for region to be opened. 7709 * @param htd the table descriptor 7710 * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) 7711 * passing the result of the call to HRegion#getMinSequenceId() to ensure the 7712 * wal id is properly kept up. HRegionStore does this every time it opens a new 7713 * region. 7714 * @param rsServices An interface we can request flushes against. 7715 * @param reporter An interface we can report progress against. 7716 * @return new HRegion 7717 * @throws NullPointerException if {@code info} is {@code null} 7718 */ 7719 public static HRegion openHRegionFromTableDir(final Configuration conf, final FileSystem fs, 7720 final Path tableDir, final RegionInfo info, final TableDescriptor htd, final WAL wal, 7721 final RegionServerServices rsServices, final CancelableProgressable reporter) 7722 throws IOException { 7723 Objects.requireNonNull(info, "RegionInfo cannot be null"); 7724 LOG.debug("Opening region: {}", info); 7725 HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, rsServices); 7726 return r.openHRegion(reporter); 7727 } 7728 7729 public NavigableMap<byte[], Integer> getReplicationScope() { 7730 return this.replicationScope; 7731 } 7732 7733 /** 7734 * Useful when reopening a closed region (normally for unit tests) 7735 * @param other original object 7736 * @param reporter An interface we can report progress against. 7737 * @return new HRegion 7738 */ 7739 public static HRegion openHRegion(final HRegion other, final CancelableProgressable reporter) 7740 throws IOException { 7741 HRegionFileSystem regionFs = other.getRegionFileSystem(); 7742 HRegion r = newHRegion(regionFs.getTableDir(), other.getWAL(), regionFs.getFileSystem(), 7743 other.baseConf, other.getRegionInfo(), other.getTableDescriptor(), null); 7744 return r.openHRegion(reporter); 7745 } 7746 7747 public static Region openHRegion(final Region other, final CancelableProgressable reporter) 7748 throws IOException { 7749 return openHRegion((HRegion) other, reporter); 7750 } 7751 7752 /** 7753 * Open HRegion. 7754 * <p/> 7755 * Calls initialize and sets sequenceId. 7756 * @return Returns <code>this</code> 7757 */ 7758 private HRegion openHRegion(final CancelableProgressable reporter) throws IOException { 7759 try { 7760 CompoundConfiguration cConfig = 7761 new CompoundConfiguration().add(conf).addBytesMap(htableDescriptor.getValues()); 7762 // Refuse to open the region if we are missing local compression support 7763 TableDescriptorChecker.checkCompression(cConfig, htableDescriptor); 7764 // Refuse to open the region if encryption configuration is incorrect or 7765 // codec support is missing 7766 LOG.debug("checking encryption for " + this.getRegionInfo().getEncodedName()); 7767 TableDescriptorChecker.checkEncryption(cConfig, htableDescriptor); 7768 // Refuse to open the region if a required class cannot be loaded 7769 LOG.debug("checking classloading for " + this.getRegionInfo().getEncodedName()); 7770 TableDescriptorChecker.checkClassLoading(cConfig, htableDescriptor); 7771 this.openSeqNum = initialize(reporter); 7772 this.mvcc.advanceTo(openSeqNum); 7773 // The openSeqNum must be increased every time when a region is assigned, as we rely on it to 7774 // determine whether a region has been successfully reopened. So here we always write open 7775 // marker, even if the table is read only. 7776 if ( 7777 wal != null && getRegionServerServices() != null 7778 && RegionReplicaUtil.isDefaultReplica(getRegionInfo()) 7779 ) { 7780 writeRegionOpenMarker(wal, openSeqNum); 7781 } 7782 } catch (Throwable t) { 7783 // By coprocessor path wrong region will open failed, 7784 // MetricsRegionWrapperImpl is already init and not close, 7785 // add region close when open failed 7786 try { 7787 // It is not required to write sequence id file when region open is failed. 7788 // Passing true to skip the sequence id file write. 7789 this.close(true); 7790 } catch (Throwable e) { 7791 LOG.warn("Open region: {} failed. Try close region but got exception ", 7792 this.getRegionInfo(), e); 7793 } 7794 throw t; 7795 } 7796 return this; 7797 } 7798 7799 /** 7800 * Open a Region on a read-only file-system (like hdfs snapshots) 7801 * @param conf The Configuration object to use. 7802 * @param fs Filesystem to use 7803 * @param info Info for region to be opened. 7804 * @param htd the table descriptor 7805 * @return new HRegion 7806 * @throws NullPointerException if {@code info} is {@code null} 7807 */ 7808 public static HRegion openReadOnlyFileSystemHRegion(final Configuration conf, final FileSystem fs, 7809 final Path tableDir, RegionInfo info, final TableDescriptor htd) throws IOException { 7810 Objects.requireNonNull(info, "RegionInfo cannot be null"); 7811 if (LOG.isDebugEnabled()) { 7812 LOG.debug("Opening region (readOnly filesystem): " + info); 7813 } 7814 if (info.getReplicaId() <= 0) { 7815 info = RegionReplicaUtil.getRegionInfoForReplica(info, 1); 7816 } 7817 HRegion r = HRegion.newHRegion(tableDir, null, fs, conf, info, htd, null); 7818 r.writestate.setReadOnly(true); 7819 return r.openHRegion(null); 7820 } 7821 7822 public static HRegion warmupHRegion(final RegionInfo info, final TableDescriptor htd, 7823 final WAL wal, final Configuration conf, final RegionServerServices rsServices, 7824 final CancelableProgressable reporter) throws IOException { 7825 7826 Objects.requireNonNull(info, "RegionInfo cannot be null"); 7827 LOG.debug("Warmup {}", info); 7828 Path rootDir = CommonFSUtils.getRootDir(conf); 7829 Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable()); 7830 FileSystem fs = null; 7831 if (rsServices != null) { 7832 fs = rsServices.getFileSystem(); 7833 } 7834 if (fs == null) { 7835 fs = rootDir.getFileSystem(conf); 7836 } 7837 HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, null); 7838 r.initializeWarmup(reporter); 7839 r.close(); 7840 return r; 7841 } 7842 7843 /** 7844 * Computes the Path of the HRegion 7845 * @param tabledir qualified path for table 7846 * @param name ENCODED region name 7847 * @return Path of HRegion directory 7848 * @deprecated For tests only; to be removed. 7849 */ 7850 @Deprecated 7851 public static Path getRegionDir(final Path tabledir, final String name) { 7852 return new Path(tabledir, name); 7853 } 7854 7855 /** 7856 * Determines if the specified row is within the row range specified by the specified RegionInfo 7857 * @param info RegionInfo that specifies the row range 7858 * @param row row to be checked 7859 * @return true if the row is within the range specified by the RegionInfo 7860 */ 7861 public static boolean rowIsInRange(RegionInfo info, final byte[] row) { 7862 return ((info.getStartKey().length == 0) || (Bytes.compareTo(info.getStartKey(), row) <= 0)) 7863 && ((info.getEndKey().length == 0) || (Bytes.compareTo(info.getEndKey(), row) > 0)); 7864 } 7865 7866 public static boolean rowIsInRange(RegionInfo info, final byte[] row, final int offset, 7867 final short length) { 7868 return ((info.getStartKey().length == 0) 7869 || (Bytes.compareTo(info.getStartKey(), 0, info.getStartKey().length, row, offset, length) 7870 <= 0)) 7871 && ((info.getEndKey().length == 0) 7872 || (Bytes.compareTo(info.getEndKey(), 0, info.getEndKey().length, row, offset, length) 7873 > 0)); 7874 } 7875 7876 @Override 7877 public Result get(final Get get) throws IOException { 7878 prepareGet(get); 7879 List<Cell> results = get(get, true); 7880 boolean stale = this.getRegionInfo().getReplicaId() != 0; 7881 return Result.create(results, get.isCheckExistenceOnly() ? !results.isEmpty() : null, stale); 7882 } 7883 7884 void prepareGet(final Get get) throws IOException { 7885 checkRow(get.getRow(), "Get"); 7886 // Verify families are all valid 7887 if (get.hasFamilies()) { 7888 for (byte[] family : get.familySet()) { 7889 checkFamily(family); 7890 } 7891 } else { // Adding all families to scanner 7892 for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) { 7893 get.addFamily(family); 7894 } 7895 } 7896 } 7897 7898 @Override 7899 public List<Cell> get(Get get, boolean withCoprocessor) throws IOException { 7900 return get(get, withCoprocessor, HConstants.NO_NONCE, HConstants.NO_NONCE); 7901 } 7902 7903 private List<Cell> get(Get get, boolean withCoprocessor, long nonceGroup, long nonce) 7904 throws IOException { 7905 return TraceUtil.trace(() -> getInternal(get, withCoprocessor, nonceGroup, nonce), 7906 () -> createRegionSpan("Region.get")); 7907 } 7908 7909 private List<Cell> getInternal(Get get, boolean withCoprocessor, long nonceGroup, long nonce) 7910 throws IOException { 7911 List<Cell> results = new ArrayList<>(); 7912 7913 // pre-get CP hook 7914 if (withCoprocessor && (coprocessorHost != null)) { 7915 if (coprocessorHost.preGet(get, results)) { 7916 metricsUpdateForGet(); 7917 return results; 7918 } 7919 } 7920 Scan scan = new Scan(get); 7921 if (scan.getLoadColumnFamiliesOnDemandValue() == null) { 7922 scan.setLoadColumnFamiliesOnDemand(isLoadingCfsOnDemandDefault()); 7923 } 7924 try (RegionScanner scanner = getScanner(scan, null, nonceGroup, nonce)) { 7925 List<Cell> tmp = new ArrayList<>(); 7926 scanner.next(tmp); 7927 // Copy EC to heap, then close the scanner. 7928 // This can be an EXPENSIVE call. It may make an extra copy from offheap to onheap buffers. 7929 // See more details in HBASE-26036. 7930 for (Cell cell : tmp) { 7931 results.add(CellUtil.cloneIfNecessary(cell)); 7932 } 7933 } 7934 7935 // post-get CP hook 7936 if (withCoprocessor && (coprocessorHost != null)) { 7937 coprocessorHost.postGet(get, results); 7938 } 7939 7940 metricsUpdateForGet(); 7941 7942 return results; 7943 } 7944 7945 void metricsUpdateForGet() { 7946 if (this.metricsRegion != null) { 7947 this.metricsRegion.updateGet(); 7948 } 7949 if (this.rsServices != null && this.rsServices.getMetrics() != null) { 7950 rsServices.getMetrics().updateReadQueryMeter(this, 1); 7951 } 7952 7953 } 7954 7955 @Override 7956 public Result mutateRow(RowMutations rm) throws IOException { 7957 return mutateRow(rm, HConstants.NO_NONCE, HConstants.NO_NONCE); 7958 } 7959 7960 public Result mutateRow(RowMutations rm, long nonceGroup, long nonce) throws IOException { 7961 final List<Mutation> m = rm.getMutations(); 7962 OperationStatus[] statuses = batchMutate(m.toArray(new Mutation[0]), true, nonceGroup, nonce); 7963 7964 List<Result> results = new ArrayList<>(); 7965 for (OperationStatus status : statuses) { 7966 if (status.getResult() != null) { 7967 results.add(status.getResult()); 7968 } 7969 } 7970 7971 if (results.isEmpty()) { 7972 return null; 7973 } 7974 7975 // Merge the results of the Increment/Append operations 7976 List<Cell> cells = new ArrayList<>(); 7977 for (Result result : results) { 7978 if (result.rawCells() != null) { 7979 cells.addAll(Arrays.asList(result.rawCells())); 7980 } 7981 } 7982 return Result.create(cells); 7983 } 7984 7985 /** 7986 * Perform atomic (all or none) mutations within the region. 7987 * @param mutations The list of mutations to perform. <code>mutations</code> can contain 7988 * operations for multiple rows. Caller has to ensure that all rows are 7989 * contained in this region. 7990 * @param rowsToLock Rows to lock 7991 * @param nonceGroup Optional nonce group of the operation (client Id) 7992 * @param nonce Optional nonce of the operation (unique random id to ensure "more 7993 * idempotence") If multiple rows are locked care should be taken that 7994 * <code>rowsToLock</code> is sorted in order to avoid deadlocks. 7995 */ 7996 @Override 7997 public void mutateRowsWithLocks(Collection<Mutation> mutations, Collection<byte[]> rowsToLock, 7998 long nonceGroup, long nonce) throws IOException { 7999 batchMutate(new MutationBatchOperation(this, mutations.toArray(new Mutation[mutations.size()]), 8000 true, nonceGroup, nonce) { 8001 @Override 8002 public MiniBatchOperationInProgress<Mutation> 8003 lockRowsAndBuildMiniBatch(List<RowLock> acquiredRowLocks) throws IOException { 8004 RowLock prevRowLock = null; 8005 for (byte[] row : rowsToLock) { 8006 try { 8007 RowLock rowLock = region.getRowLock(row, false, prevRowLock); // write lock 8008 if (rowLock != prevRowLock) { 8009 acquiredRowLocks.add(rowLock); 8010 prevRowLock = rowLock; 8011 } 8012 } catch (IOException ioe) { 8013 LOG.warn("Failed getting lock, row={}, in region {}", Bytes.toStringBinary(row), this, 8014 ioe); 8015 throw ioe; 8016 } 8017 } 8018 return createMiniBatch(size(), size()); 8019 } 8020 }); 8021 } 8022 8023 /** Returns statistics about the current load of the region */ 8024 public ClientProtos.RegionLoadStats getLoadStatistics() { 8025 if (!regionStatsEnabled) { 8026 return null; 8027 } 8028 ClientProtos.RegionLoadStats.Builder stats = ClientProtos.RegionLoadStats.newBuilder(); 8029 stats.setMemStoreLoad((int) (Math.min(100, 8030 (this.memStoreSizing.getMemStoreSize().getHeapSize() * 100) / this.memstoreFlushSize))); 8031 if (rsServices.getHeapMemoryManager() != null) { 8032 // the HeapMemoryManager uses -0.0 to signal a problem asking the JVM, 8033 // so we could just do the calculation below and we'll get a 0. 8034 // treating it as a special case analogous to no HMM instead so that it can be 8035 // programatically treated different from using <1% of heap. 8036 final float occupancy = rsServices.getHeapMemoryManager().getHeapOccupancyPercent(); 8037 if (occupancy != HeapMemoryManager.HEAP_OCCUPANCY_ERROR_VALUE) { 8038 stats.setHeapOccupancy((int) (occupancy * 100)); 8039 } 8040 } 8041 stats.setCompactionPressure((int) (rsServices.getCompactionPressure() * 100 > 100 8042 ? 100 8043 : rsServices.getCompactionPressure() * 100)); 8044 return stats.build(); 8045 } 8046 8047 @Override 8048 public Result append(Append append) throws IOException { 8049 return append(append, HConstants.NO_NONCE, HConstants.NO_NONCE); 8050 } 8051 8052 public Result append(Append append, long nonceGroup, long nonce) throws IOException { 8053 return TraceUtil.trace(() -> { 8054 checkReadOnly(); 8055 checkResources(); 8056 startRegionOperation(Operation.APPEND); 8057 try { 8058 // All edits for the given row (across all column families) must happen atomically. 8059 return mutate(append, true, nonceGroup, nonce).getResult(); 8060 } finally { 8061 closeRegionOperation(Operation.APPEND); 8062 } 8063 }, () -> createRegionSpan("Region.append")); 8064 } 8065 8066 @Override 8067 public Result increment(Increment increment) throws IOException { 8068 return increment(increment, HConstants.NO_NONCE, HConstants.NO_NONCE); 8069 } 8070 8071 public Result increment(Increment increment, long nonceGroup, long nonce) throws IOException { 8072 return TraceUtil.trace(() -> { 8073 checkReadOnly(); 8074 checkResources(); 8075 startRegionOperation(Operation.INCREMENT); 8076 try { 8077 // All edits for the given row (across all column families) must happen atomically. 8078 return mutate(increment, true, nonceGroup, nonce).getResult(); 8079 } finally { 8080 closeRegionOperation(Operation.INCREMENT); 8081 } 8082 }, () -> createRegionSpan("Region.increment")); 8083 } 8084 8085 private WALKeyImpl createWALKeyForWALAppend(boolean isReplay, BatchOperation<?> batchOp, long now, 8086 long nonceGroup, long nonce) { 8087 WALKeyImpl walKey = isReplay 8088 ? new WALKeyImpl(this.getRegionInfo().getEncodedNameAsBytes(), 8089 this.htableDescriptor.getTableName(), SequenceId.NO_SEQUENCE_ID, now, 8090 batchOp.getClusterIds(), nonceGroup, nonce, mvcc) 8091 : new WALKeyImpl(this.getRegionInfo().getEncodedNameAsBytes(), 8092 this.htableDescriptor.getTableName(), SequenceId.NO_SEQUENCE_ID, now, 8093 batchOp.getClusterIds(), nonceGroup, nonce, mvcc, this.getReplicationScope()); 8094 if (isReplay) { 8095 walKey.setOrigLogSeqNum(batchOp.getOrigLogSeqNum()); 8096 } 8097 return walKey; 8098 } 8099 8100 /** Returns writeEntry associated with this append */ 8101 private WriteEntry doWALAppend(WALEdit walEdit, BatchOperation<?> batchOp, 8102 MiniBatchOperationInProgress<Mutation> miniBatchOp, long now, NonceKey nonceKey) 8103 throws IOException { 8104 Preconditions.checkArgument(walEdit != null && !walEdit.isEmpty(), "WALEdit is null or empty!"); 8105 Preconditions.checkArgument( 8106 !walEdit.isReplay() || batchOp.getOrigLogSeqNum() != SequenceId.NO_SEQUENCE_ID, 8107 "Invalid replay sequence Id for replay WALEdit!"); 8108 8109 WALKeyImpl walKey = createWALKeyForWALAppend(walEdit.isReplay(), batchOp, now, 8110 nonceKey.getNonceGroup(), nonceKey.getNonce()); 8111 // don't call the coproc hook for writes to the WAL caused by 8112 // system lifecycle events like flushes or compactions 8113 if (this.coprocessorHost != null && !walEdit.isMetaEdit()) { 8114 this.coprocessorHost.preWALAppend(walKey, walEdit); 8115 } 8116 try { 8117 long txid = this.wal.appendData(this.getRegionInfo(), walKey, walEdit); 8118 WriteEntry writeEntry = walKey.getWriteEntry(); 8119 // Call sync on our edit. 8120 if (txid != 0) { 8121 sync(txid, batchOp.durability); 8122 } 8123 /** 8124 * If above {@link HRegion#sync} throws Exception, the RegionServer should be aborted and 8125 * following {@link BatchOperation#writeMiniBatchOperationsToMemStore} will not be executed, 8126 * so there is no need to replicate to secondary replica, for this reason here we attach the 8127 * region replication action after the {@link HRegion#sync} is successful. 8128 */ 8129 this.attachRegionReplicationInWALAppend(batchOp, miniBatchOp, walKey, walEdit, writeEntry); 8130 return writeEntry; 8131 } catch (IOException ioe) { 8132 if (walKey.getWriteEntry() != null) { 8133 mvcc.complete(walKey.getWriteEntry()); 8134 } 8135 8136 /** 8137 * If {@link WAL#sync} get a timeout exception, the only correct way is to abort the region 8138 * server, as the design of {@link WAL#sync}, is to succeed or die, there is no 'failure'. It 8139 * is usually not a big deal is because we set a very large default value(5 minutes) for 8140 * {@link AbstractFSWAL#WAL_SYNC_TIMEOUT_MS}, usually the WAL system will abort the region 8141 * server if it can not finish the sync within 5 minutes. 8142 */ 8143 if (ioe instanceof WALSyncTimeoutIOException) { 8144 if (rsServices != null) { 8145 rsServices.abort("WAL sync timeout,forcing server shutdown", ioe); 8146 } 8147 } 8148 throw ioe; 8149 } 8150 } 8151 8152 /** 8153 * Attach {@link RegionReplicationSink#add} to the mvcc writeEntry for replicating to region 8154 * replica. 8155 */ 8156 private void attachRegionReplicationInWALAppend(BatchOperation<?> batchOp, 8157 MiniBatchOperationInProgress<Mutation> miniBatchOp, WALKeyImpl walKey, WALEdit walEdit, 8158 WriteEntry writeEntry) { 8159 if (!regionReplicationSink.isPresent()) { 8160 return; 8161 } 8162 /** 8163 * If {@link HRegion#regionReplicationSink} is present,only {@link MutationBatchOperation} is 8164 * used and {@link NonceKey} is all the same for {@link Mutation}s in 8165 * {@link MutationBatchOperation},so for HBASE-26993 case 1,if 8166 * {@link MiniBatchOperationInProgress#getWalEditForReplicateSkipWAL} is not null and we could 8167 * enter {@link HRegion#doWALAppend},that means partial {@link Mutation}s are 8168 * {@link Durability#SKIP_WAL}, we use 8169 * {@link MiniBatchOperationInProgress#getWalEditForReplicateSkipWAL} to replicate to region 8170 * replica,but if {@link MiniBatchOperationInProgress#getWalEditForReplicateSkipWAL} is 8171 * null,that means there is no {@link Mutation} is {@link Durability#SKIP_WAL},so we just use 8172 * walEdit to replicate. 8173 */ 8174 assert batchOp instanceof MutationBatchOperation; 8175 WALEdit walEditToUse = miniBatchOp.getWalEditForReplicateIfExistsSkipWAL(); 8176 if (walEditToUse == null) { 8177 walEditToUse = walEdit; 8178 } 8179 doAttachReplicateRegionReplicaAction(walKey, walEditToUse, writeEntry); 8180 } 8181 8182 /** 8183 * Attach {@link RegionReplicationSink#add} to the mvcc writeEntry for replicating to region 8184 * replica. 8185 */ 8186 private void doAttachReplicateRegionReplicaAction(WALKeyImpl walKey, WALEdit walEdit, 8187 WriteEntry writeEntry) { 8188 if (walEdit == null || walEdit.isEmpty()) { 8189 return; 8190 } 8191 final ServerCall<?> rpcCall = RpcServer.getCurrentServerCallWithCellScanner().orElse(null); 8192 regionReplicationSink.ifPresent(sink -> writeEntry.attachCompletionAction(() -> { 8193 sink.add(walKey, walEdit, rpcCall); 8194 })); 8195 } 8196 8197 public static final long FIXED_OVERHEAD = ClassSize.estimateBase(HRegion.class, false); 8198 8199 // woefully out of date - currently missing: 8200 // 1 x HashMap - coprocessorServiceHandlers 8201 // 6 x LongAdder - numMutationsWithoutWAL, dataInMemoryWithoutWAL, 8202 // checkAndMutateChecksPassed, checkAndMutateChecksFailed, readRequestsCount, 8203 // writeRequestsCount, cpRequestsCount 8204 // 1 x HRegion$WriteState - writestate 8205 // 1 x RegionCoprocessorHost - coprocessorHost 8206 // 1 x RegionSplitPolicy - splitPolicy 8207 // 1 x MetricsRegion - metricsRegion 8208 // 1 x MetricsRegionWrapperImpl - metricsRegionWrapper 8209 // 1 x ReadPointCalculationLock - smallestReadPointCalcLock 8210 public static final long DEEP_OVERHEAD = FIXED_OVERHEAD + ClassSize.OBJECT + // closeLock 8211 (2 * ClassSize.ATOMIC_BOOLEAN) + // closed, closing 8212 (3 * ClassSize.ATOMIC_LONG) + // numPutsWithoutWAL, dataInMemoryWithoutWAL, 8213 // compactionsFailed 8214 (3 * ClassSize.CONCURRENT_HASHMAP) + // lockedRows, scannerReadPoints, regionLockHolders 8215 WriteState.HEAP_SIZE + // writestate 8216 ClassSize.CONCURRENT_SKIPLISTMAP + ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + // stores 8217 (2 * ClassSize.REENTRANT_LOCK) + // lock, updatesLock 8218 MultiVersionConcurrencyControl.FIXED_SIZE // mvcc 8219 + 2 * ClassSize.TREEMAP // maxSeqIdInStores, replicationScopes 8220 + 2 * ClassSize.ATOMIC_INTEGER // majorInProgress, minorInProgress 8221 + ClassSize.STORE_SERVICES // store services 8222 + StoreHotnessProtector.FIXED_SIZE; 8223 8224 @Override 8225 public long heapSize() { 8226 // this does not take into account row locks, recent flushes, mvcc entries, and more 8227 return DEEP_OVERHEAD + stores.values().stream().mapToLong(HStore::heapSize).sum(); 8228 } 8229 8230 /** 8231 * Registers a new protocol buffer {@link Service} subclass as a coprocessor endpoint to be 8232 * available for handling {@link #execService(RpcController, CoprocessorServiceCall)} calls. 8233 * <p/> 8234 * Only a single instance may be registered per region for a given {@link Service} subclass (the 8235 * instances are keyed on {@link ServiceDescriptor#getFullName()}.. After the first registration, 8236 * subsequent calls with the same service name will fail with a return value of {@code false}. 8237 * @param instance the {@code Service} subclass instance to expose as a coprocessor endpoint 8238 * @return {@code true} if the registration was successful, {@code false} otherwise 8239 */ 8240 public boolean registerService(Service instance) { 8241 // No stacking of instances is allowed for a single service name 8242 ServiceDescriptor serviceDesc = instance.getDescriptorForType(); 8243 String serviceName = CoprocessorRpcUtils.getServiceName(serviceDesc); 8244 if (coprocessorServiceHandlers.containsKey(serviceName)) { 8245 LOG.error("Coprocessor service {} already registered, rejecting request from {} in region {}", 8246 serviceName, instance, this); 8247 return false; 8248 } 8249 8250 coprocessorServiceHandlers.put(serviceName, instance); 8251 if (LOG.isDebugEnabled()) { 8252 LOG.debug("Registered coprocessor service: region=" 8253 + Bytes.toStringBinary(getRegionInfo().getRegionName()) + " service=" + serviceName); 8254 } 8255 return true; 8256 } 8257 8258 /** 8259 * Executes a single protocol buffer coprocessor endpoint {@link Service} method using the 8260 * registered protocol handlers. {@link Service} implementations must be registered via the 8261 * {@link #registerService(Service)} method before they are available. 8262 * @param controller an {@code RpcContoller} implementation to pass to the invoked service 8263 * @param call a {@code CoprocessorServiceCall} instance identifying the service, method, 8264 * and parameters for the method invocation 8265 * @return a protocol buffer {@code Message} instance containing the method's result 8266 * @throws IOException if no registered service handler is found or an error occurs during the 8267 * invocation 8268 * @see #registerService(Service) 8269 */ 8270 public Message execService(RpcController controller, CoprocessorServiceCall call) 8271 throws IOException { 8272 String serviceName = call.getServiceName(); 8273 Service service = coprocessorServiceHandlers.get(serviceName); 8274 if (service == null) { 8275 throw new UnknownProtocolException(null, "No registered coprocessor service found for " 8276 + serviceName + " in region " + Bytes.toStringBinary(getRegionInfo().getRegionName())); 8277 } 8278 ServiceDescriptor serviceDesc = service.getDescriptorForType(); 8279 8280 cpRequestsCount.increment(); 8281 String methodName = call.getMethodName(); 8282 MethodDescriptor methodDesc = CoprocessorRpcUtils.getMethodDescriptor(methodName, serviceDesc); 8283 8284 Message.Builder builder = service.getRequestPrototype(methodDesc).newBuilderForType(); 8285 8286 ProtobufUtil.mergeFrom(builder, call.getRequest().toByteArray()); 8287 Message request = CoprocessorRpcUtils.getRequest(service, methodDesc, call.getRequest()); 8288 8289 if (coprocessorHost != null) { 8290 request = coprocessorHost.preEndpointInvocation(service, methodName, request); 8291 } 8292 8293 final Message.Builder responseBuilder = 8294 service.getResponsePrototype(methodDesc).newBuilderForType(); 8295 service.callMethod(methodDesc, controller, request, new RpcCallback<Message>() { 8296 @Override 8297 public void run(Message message) { 8298 if (message != null) { 8299 responseBuilder.mergeFrom(message); 8300 } 8301 } 8302 }); 8303 8304 if (coprocessorHost != null) { 8305 coprocessorHost.postEndpointInvocation(service, methodName, request, responseBuilder); 8306 } 8307 IOException exception = 8308 org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils.getControllerException(controller); 8309 if (exception != null) { 8310 throw exception; 8311 } 8312 8313 return responseBuilder.build(); 8314 } 8315 8316 public Optional<byte[]> checkSplit() { 8317 return checkSplit(false); 8318 } 8319 8320 /** 8321 * Return the split point. An empty result indicates the region isn't splittable. 8322 */ 8323 public Optional<byte[]> checkSplit(boolean force) { 8324 // Can't split META 8325 if (this.getRegionInfo().isMetaRegion()) { 8326 return Optional.empty(); 8327 } 8328 8329 // Can't split a region that is closing. 8330 if (this.isClosing()) { 8331 return Optional.empty(); 8332 } 8333 8334 if (!force && !splitPolicy.shouldSplit()) { 8335 return Optional.empty(); 8336 } 8337 8338 byte[] ret = splitPolicy.getSplitPoint(); 8339 if (ret != null && ret.length > 0) { 8340 ret = splitRestriction.getRestrictedSplitPoint(ret); 8341 } 8342 8343 if (ret != null) { 8344 try { 8345 checkRow(ret, "calculated split"); 8346 } catch (IOException e) { 8347 LOG.error("Ignoring invalid split for region {}", this, e); 8348 return Optional.empty(); 8349 } 8350 return Optional.of(ret); 8351 } else { 8352 return Optional.empty(); 8353 } 8354 } 8355 8356 /** Returns The priority that this region should have in the compaction queue */ 8357 public int getCompactPriority() { 8358 if (checkSplit().isPresent() && conf.getBoolean(SPLIT_IGNORE_BLOCKING_ENABLED_KEY, false)) { 8359 // if a region should split, split it before compact 8360 return Store.PRIORITY_USER; 8361 } 8362 return stores.values().stream().mapToInt(HStore::getCompactPriority).min() 8363 .orElse(Store.NO_PRIORITY); 8364 } 8365 8366 /** Returns the coprocessor host */ 8367 public RegionCoprocessorHost getCoprocessorHost() { 8368 return coprocessorHost; 8369 } 8370 8371 /** @param coprocessorHost the new coprocessor host */ 8372 public void setCoprocessorHost(final RegionCoprocessorHost coprocessorHost) { 8373 this.coprocessorHost = coprocessorHost; 8374 } 8375 8376 @Override 8377 public void startRegionOperation() throws IOException { 8378 startRegionOperation(Operation.ANY); 8379 } 8380 8381 @Override 8382 public void startRegionOperation(Operation op) throws IOException { 8383 boolean isInterruptableOp = false; 8384 switch (op) { 8385 case GET: // interruptible read operations 8386 case SCAN: 8387 isInterruptableOp = true; 8388 checkReadsEnabled(); 8389 break; 8390 case INCREMENT: // interruptible write operations 8391 case APPEND: 8392 case PUT: 8393 case DELETE: 8394 case BATCH_MUTATE: 8395 case CHECK_AND_MUTATE: 8396 isInterruptableOp = true; 8397 break; 8398 default: // all others 8399 break; 8400 } 8401 if ( 8402 op == Operation.MERGE_REGION || op == Operation.SPLIT_REGION || op == Operation.COMPACT_REGION 8403 || op == Operation.COMPACT_SWITCH 8404 ) { 8405 // split, merge or compact region doesn't need to check the closing/closed state or lock the 8406 // region 8407 return; 8408 } 8409 if (this.closing.get()) { 8410 throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closing"); 8411 } 8412 lock(lock.readLock()); 8413 // Update regionLockHolders ONLY for any startRegionOperation call that is invoked from 8414 // an RPC handler 8415 Thread thisThread = Thread.currentThread(); 8416 if (isInterruptableOp) { 8417 regionLockHolders.put(thisThread, true); 8418 } 8419 if (this.closed.get()) { 8420 lock.readLock().unlock(); 8421 throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closed"); 8422 } 8423 // The unit for snapshot is a region. So, all stores for this region must be 8424 // prepared for snapshot operation before proceeding. 8425 if (op == Operation.SNAPSHOT) { 8426 stores.values().forEach(HStore::preSnapshotOperation); 8427 } 8428 try { 8429 if (coprocessorHost != null) { 8430 coprocessorHost.postStartRegionOperation(op); 8431 } 8432 } catch (Exception e) { 8433 if (isInterruptableOp) { 8434 // would be harmless to remove what we didn't add but we know by 'isInterruptableOp' 8435 // if we added this thread to regionLockHolders 8436 regionLockHolders.remove(thisThread); 8437 } 8438 lock.readLock().unlock(); 8439 throw new IOException(e); 8440 } 8441 } 8442 8443 @Override 8444 public void closeRegionOperation() throws IOException { 8445 closeRegionOperation(Operation.ANY); 8446 } 8447 8448 @Override 8449 public void closeRegionOperation(Operation operation) throws IOException { 8450 if (operation == Operation.SNAPSHOT) { 8451 stores.values().forEach(HStore::postSnapshotOperation); 8452 } 8453 Thread thisThread = Thread.currentThread(); 8454 regionLockHolders.remove(thisThread); 8455 lock.readLock().unlock(); 8456 if (coprocessorHost != null) { 8457 coprocessorHost.postCloseRegionOperation(operation); 8458 } 8459 } 8460 8461 /** 8462 * This method needs to be called before any public call that reads or modifies stores in bulk. It 8463 * has to be called just before a try. #closeBulkRegionOperation needs to be called in the try's 8464 * finally block Acquires a writelock and checks if the region is closing or closed. 8465 * @throws NotServingRegionException when the region is closing or closed 8466 * @throws RegionTooBusyException if failed to get the lock in time 8467 * @throws InterruptedIOException if interrupted while waiting for a lock 8468 */ 8469 private void startBulkRegionOperation(boolean writeLockNeeded) throws IOException { 8470 if (this.closing.get()) { 8471 throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closing"); 8472 } 8473 if (writeLockNeeded) lock(lock.writeLock()); 8474 else lock(lock.readLock()); 8475 if (this.closed.get()) { 8476 if (writeLockNeeded) lock.writeLock().unlock(); 8477 else lock.readLock().unlock(); 8478 throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closed"); 8479 } 8480 regionLockHolders.put(Thread.currentThread(), true); 8481 } 8482 8483 /** 8484 * Closes the lock. This needs to be called in the finally block corresponding to the try block of 8485 * #startRegionOperation 8486 */ 8487 private void closeBulkRegionOperation() { 8488 regionLockHolders.remove(Thread.currentThread()); 8489 if (lock.writeLock().isHeldByCurrentThread()) lock.writeLock().unlock(); 8490 else lock.readLock().unlock(); 8491 } 8492 8493 /** 8494 * Update LongAdders for number of puts without wal and the size of possible data loss. These 8495 * information are exposed by the region server metrics. 8496 */ 8497 private void recordMutationWithoutWal(final Map<byte[], List<Cell>> familyMap) { 8498 numMutationsWithoutWAL.increment(); 8499 if (numMutationsWithoutWAL.sum() <= 1) { 8500 LOG.info("writing data to region " + this 8501 + " with WAL disabled. Data may be lost in the event of a crash."); 8502 } 8503 8504 long mutationSize = 0; 8505 for (List<Cell> cells : familyMap.values()) { 8506 // Optimization: 'foreach' loop is not used. See: 8507 // HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects 8508 assert cells instanceof RandomAccess; 8509 int listSize = cells.size(); 8510 for (int i = 0; i < listSize; i++) { 8511 Cell cell = cells.get(i); 8512 mutationSize += cell.getSerializedSize(); 8513 } 8514 } 8515 8516 dataInMemoryWithoutWAL.add(mutationSize); 8517 } 8518 8519 private void lock(final Lock lock) throws IOException { 8520 lock(lock, 1); 8521 } 8522 8523 /** 8524 * Try to acquire a lock. Throw RegionTooBusyException if failed to get the lock in time. Throw 8525 * InterruptedIOException if interrupted while waiting for the lock. 8526 */ 8527 private void lock(final Lock lock, final int multiplier) throws IOException { 8528 try { 8529 final long waitTime = Math.min(maxBusyWaitDuration, 8530 busyWaitDuration * Math.min(multiplier, maxBusyWaitMultiplier)); 8531 if (!lock.tryLock(waitTime, TimeUnit.MILLISECONDS)) { 8532 // Don't print millis. Message is used as a key over in 8533 // RetriesExhaustedWithDetailsException processing. 8534 final String regionName = 8535 this.getRegionInfo() == null ? "unknown" : this.getRegionInfo().getRegionNameAsString(); 8536 final String serverName = this.getRegionServerServices() == null 8537 ? "unknown" 8538 : (this.getRegionServerServices().getServerName() == null 8539 ? "unknown" 8540 : this.getRegionServerServices().getServerName().toString()); 8541 RegionTooBusyException rtbe = new RegionTooBusyException( 8542 "Failed to obtain lock; regionName=" + regionName + ", server=" + serverName); 8543 LOG.warn("Region is too busy to allow lock acquisition.", rtbe); 8544 throw rtbe; 8545 } 8546 } catch (InterruptedException ie) { 8547 if (LOG.isDebugEnabled()) { 8548 LOG.debug("Interrupted while waiting for a lock in region {}", this); 8549 } 8550 throw throwOnInterrupt(ie); 8551 } 8552 } 8553 8554 /** 8555 * Calls sync with the given transaction ID 8556 * @param txid should sync up to which transaction 8557 * @throws IOException If anything goes wrong with DFS 8558 */ 8559 private void sync(long txid, Durability durability) throws IOException { 8560 if (this.getRegionInfo().isMetaRegion()) { 8561 this.wal.sync(txid); 8562 } else { 8563 switch (durability) { 8564 case USE_DEFAULT: 8565 // do what table defaults to 8566 if (shouldSyncWAL()) { 8567 this.wal.sync(txid); 8568 } 8569 break; 8570 case SKIP_WAL: 8571 // nothing do to 8572 break; 8573 case ASYNC_WAL: 8574 // nothing do to 8575 break; 8576 case SYNC_WAL: 8577 this.wal.sync(txid, false); 8578 break; 8579 case FSYNC_WAL: 8580 this.wal.sync(txid, true); 8581 break; 8582 default: 8583 throw new RuntimeException("Unknown durability " + durability); 8584 } 8585 } 8586 } 8587 8588 /** 8589 * Check whether we should sync the wal from the table's durability settings 8590 */ 8591 private boolean shouldSyncWAL() { 8592 return regionDurability.ordinal() > Durability.ASYNC_WAL.ordinal(); 8593 } 8594 8595 /** Returns the latest sequence number that was read from storage when this region was opened */ 8596 public long getOpenSeqNum() { 8597 return this.openSeqNum; 8598 } 8599 8600 @Override 8601 public Map<byte[], Long> getMaxStoreSeqId() { 8602 return this.maxSeqIdInStores; 8603 } 8604 8605 public long getOldestSeqIdOfStore(byte[] familyName) { 8606 return wal.getEarliestMemStoreSeqNum(getRegionInfo().getEncodedNameAsBytes(), familyName); 8607 } 8608 8609 @Override 8610 public CompactionState getCompactionState() { 8611 boolean hasMajor = majorInProgress.get() > 0, hasMinor = minorInProgress.get() > 0; 8612 return (hasMajor 8613 ? (hasMinor ? CompactionState.MAJOR_AND_MINOR : CompactionState.MAJOR) 8614 : (hasMinor ? CompactionState.MINOR : CompactionState.NONE)); 8615 } 8616 8617 public void reportCompactionRequestStart(boolean isMajor) { 8618 (isMajor ? majorInProgress : minorInProgress).incrementAndGet(); 8619 } 8620 8621 public void reportCompactionRequestEnd(boolean isMajor, int numFiles, long filesSizeCompacted) { 8622 int newValue = (isMajor ? majorInProgress : minorInProgress).decrementAndGet(); 8623 8624 // metrics 8625 compactionsFinished.increment(); 8626 compactionNumFilesCompacted.add(numFiles); 8627 compactionNumBytesCompacted.add(filesSizeCompacted); 8628 8629 assert newValue >= 0; 8630 } 8631 8632 public void reportCompactionRequestFailure() { 8633 compactionsFailed.increment(); 8634 } 8635 8636 public void incrementCompactionsQueuedCount() { 8637 compactionsQueued.increment(); 8638 } 8639 8640 public void decrementCompactionsQueuedCount() { 8641 compactionsQueued.decrement(); 8642 } 8643 8644 public void incrementFlushesQueuedCount() { 8645 flushesQueued.increment(); 8646 } 8647 8648 protected void decrementFlushesQueuedCount() { 8649 flushesQueued.decrement(); 8650 } 8651 8652 /** 8653 * If a handler thread is eligible for interrupt, make it ineligible. Should be paired with 8654 * {{@link #enableInterrupts()}. 8655 */ 8656 void disableInterrupts() { 8657 regionLockHolders.computeIfPresent(Thread.currentThread(), (t, b) -> false); 8658 } 8659 8660 /** 8661 * If a handler thread was made ineligible for interrupt via {{@link #disableInterrupts()}, make 8662 * it eligible again. No-op if interrupts are already enabled. 8663 */ 8664 void enableInterrupts() { 8665 regionLockHolders.computeIfPresent(Thread.currentThread(), (t, b) -> true); 8666 } 8667 8668 /** 8669 * Interrupt any region options that have acquired the region lock via 8670 * {@link #startRegionOperation(org.apache.hadoop.hbase.regionserver.Region.Operation)}, or 8671 * {@link #startBulkRegionOperation(boolean)}. 8672 */ 8673 private void interruptRegionOperations() { 8674 for (Map.Entry<Thread, Boolean> entry : regionLockHolders.entrySet()) { 8675 // An entry in this map will have a boolean value indicating if it is currently 8676 // eligible for interrupt; if so, we should interrupt it. 8677 if (entry.getValue().booleanValue()) { 8678 entry.getKey().interrupt(); 8679 } 8680 } 8681 } 8682 8683 /** 8684 * Check thread interrupt status and throw an exception if interrupted. 8685 * @throws NotServingRegionException if region is closing 8686 * @throws InterruptedIOException if interrupted but region is not closing 8687 */ 8688 // Package scope for tests 8689 void checkInterrupt() throws NotServingRegionException, InterruptedIOException { 8690 if (Thread.interrupted()) { 8691 if (this.closing.get()) { 8692 throw new NotServingRegionException( 8693 getRegionInfo().getRegionNameAsString() + " is closing"); 8694 } 8695 throw new InterruptedIOException(); 8696 } 8697 } 8698 8699 /** 8700 * Throw the correct exception upon interrupt 8701 * @param t cause 8702 */ 8703 // Package scope for tests 8704 IOException throwOnInterrupt(Throwable t) { 8705 if (this.closing.get()) { 8706 return (NotServingRegionException) new NotServingRegionException( 8707 getRegionInfo().getRegionNameAsString() + " is closing").initCause(t); 8708 } 8709 return (InterruptedIOException) new InterruptedIOException().initCause(t); 8710 } 8711 8712 /** 8713 * {@inheritDoc} 8714 */ 8715 @Override 8716 public void onConfigurationChange(Configuration conf) { 8717 this.storeHotnessProtector.update(conf); 8718 // update coprocessorHost if the configuration has changed. 8719 if ( 8720 CoprocessorConfigurationUtil.checkConfigurationChange(getReadOnlyConfiguration(), conf, 8721 CoprocessorHost.REGION_COPROCESSOR_CONF_KEY, 8722 CoprocessorHost.USER_REGION_COPROCESSOR_CONF_KEY) 8723 ) { 8724 LOG.info("Update the system coprocessors because the configuration has changed"); 8725 decorateRegionConfiguration(conf); 8726 this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf); 8727 } 8728 } 8729 8730 /** 8731 * {@inheritDoc} 8732 */ 8733 @Override 8734 public void registerChildren(ConfigurationManager manager) { 8735 configurationManager = manager; 8736 stores.values().forEach(manager::registerObserver); 8737 } 8738 8739 /** 8740 * {@inheritDoc} 8741 */ 8742 @Override 8743 public void deregisterChildren(ConfigurationManager manager) { 8744 stores.values().forEach(configurationManager::deregisterObserver); 8745 } 8746 8747 @Override 8748 public CellComparator getCellComparator() { 8749 return cellComparator; 8750 } 8751 8752 public long getMemStoreFlushSize() { 8753 return this.memstoreFlushSize; 8754 } 8755 8756 //// method for debugging tests 8757 void throwException(String title, String regionName) { 8758 StringBuilder buf = new StringBuilder(); 8759 buf.append(title + ", "); 8760 buf.append(getRegionInfo().toString()); 8761 buf.append(getRegionInfo().isMetaRegion() ? " meta region " : " "); 8762 buf.append("stores: "); 8763 for (HStore s : stores.values()) { 8764 buf.append(s.getColumnFamilyDescriptor().getNameAsString()); 8765 buf.append(" size: "); 8766 buf.append(s.getMemStoreSize().getDataSize()); 8767 buf.append(" "); 8768 } 8769 buf.append("end-of-stores"); 8770 buf.append(", memstore size "); 8771 buf.append(getMemStoreDataSize()); 8772 if (getRegionInfo().getRegionNameAsString().startsWith(regionName)) { 8773 throw new RuntimeException(buf.toString()); 8774 } 8775 } 8776 8777 @Override 8778 public void requestCompaction(String why, int priority, boolean major, 8779 CompactionLifeCycleTracker tracker) throws IOException { 8780 if (major) { 8781 stores.values().forEach(HStore::triggerMajorCompaction); 8782 } 8783 rsServices.getCompactionRequestor().requestCompaction(this, why, priority, tracker, 8784 RpcServer.getRequestUser().orElse(null)); 8785 } 8786 8787 @Override 8788 public void requestCompaction(byte[] family, String why, int priority, boolean major, 8789 CompactionLifeCycleTracker tracker) throws IOException { 8790 HStore store = stores.get(family); 8791 if (store == null) { 8792 throw new NoSuchColumnFamilyException("column family " + Bytes.toString(family) 8793 + " does not exist in region " + getRegionInfo().getRegionNameAsString()); 8794 } 8795 if (major) { 8796 store.triggerMajorCompaction(); 8797 } 8798 rsServices.getCompactionRequestor().requestCompaction(this, store, why, priority, tracker, 8799 RpcServer.getRequestUser().orElse(null)); 8800 } 8801 8802 private void requestFlushIfNeeded() throws RegionTooBusyException { 8803 if (isFlushSize(this.memStoreSizing.getMemStoreSize())) { 8804 requestFlush(); 8805 } 8806 } 8807 8808 private void requestFlush() { 8809 if (this.rsServices == null) { 8810 return; 8811 } 8812 requestFlush0(FlushLifeCycleTracker.DUMMY); 8813 } 8814 8815 private void requestFlush0(FlushLifeCycleTracker tracker) { 8816 boolean shouldFlush = false; 8817 synchronized (writestate) { 8818 if (!this.writestate.isFlushRequested()) { 8819 shouldFlush = true; 8820 writestate.flushRequested = true; 8821 } 8822 } 8823 if (shouldFlush) { 8824 // Make request outside of synchronize block; HBASE-818. 8825 this.rsServices.getFlushRequester().requestFlush(this, tracker); 8826 if (LOG.isDebugEnabled()) { 8827 LOG.debug("Flush requested on " + this.getRegionInfo().getEncodedName()); 8828 } 8829 } else { 8830 tracker.notExecuted("Flush already requested on " + this); 8831 } 8832 } 8833 8834 @Override 8835 public void requestFlush(FlushLifeCycleTracker tracker) throws IOException { 8836 requestFlush0(tracker); 8837 } 8838 8839 /** 8840 * This method modifies the region's configuration in order to inject replication-related features 8841 * @param conf region configurations 8842 */ 8843 private static void decorateRegionConfiguration(Configuration conf) { 8844 if (ReplicationUtils.isReplicationForBulkLoadDataEnabled(conf)) { 8845 String plugins = conf.get(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY, ""); 8846 String replicationCoprocessorClass = ReplicationObserver.class.getCanonicalName(); 8847 if (!plugins.contains(replicationCoprocessorClass)) { 8848 conf.set(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY, 8849 (plugins.equals("") ? "" : (plugins + ",")) + replicationCoprocessorClass); 8850 } 8851 } 8852 } 8853 8854 public Optional<RegionReplicationSink> getRegionReplicationSink() { 8855 return regionReplicationSink; 8856 } 8857 8858 public void addReadRequestsCount(long readRequestsCount) { 8859 this.readRequestsCount.add(readRequestsCount); 8860 } 8861 8862 public void addWriteRequestsCount(long writeRequestsCount) { 8863 this.writeRequestsCount.add(writeRequestsCount); 8864 } 8865 8866 @RestrictedApi(explanation = "Should only be called in tests", link = "", 8867 allowedOnPath = ".*/src/test/.*") 8868 boolean isReadsEnabled() { 8869 return this.writestate.readsEnabled; 8870 } 8871}