/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.io.hfile;

import static org.apache.hadoop.hbase.trace.HBaseSemanticAttributes.BLOCK_CACHE_KEY_KEY;

import io.opentelemetry.api.common.Attributes;
import io.opentelemetry.api.trace.Span;
import java.io.DataInput;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Optional;
import java.util.function.IntConsumer;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.ByteBufferKeyOnlyKeyValue;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.ExtendedCell;
import org.apache.hadoop.hbase.HBaseInterfaceAudience;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.PrivateCellUtil;
import org.apache.hadoop.hbase.SizeCachedByteBufferKeyValue;
import org.apache.hadoop.hbase.SizeCachedKeyValue;
import org.apache.hadoop.hbase.SizeCachedNoTagsByteBufferKeyValue;
import org.apache.hadoop.hbase.SizeCachedNoTagsKeyValue;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoder;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.encoding.HFileBlockDecodingContext;
import org.apache.hadoop.hbase.monitoring.ThreadLocalServerSideScanMetrics;
import org.apache.hadoop.hbase.nio.ByteBuff;
import org.apache.hadoop.hbase.regionserver.KeyValueScanner;
import org.apache.hadoop.hbase.util.ByteBufferUtils;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.IdLock;
import org.apache.hadoop.hbase.util.ObjectIntPair;
import org.apache.hadoop.io.WritableUtils;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Implementation of {@link HFile.Reader} that can handle all hfile versions.
 */
@InterfaceAudience.Private
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "URF_UNREAD_PUBLIC_OR_PROTECTED_FIELD")
public abstract class HFileReaderImpl implements HFile.Reader, Configurable {
  // This class is HFileReaderV3 + HFileReaderV2 + AbstractHFileReader all squashed together into
  // one file. Ditto for all the HFileReader.ScannerV? implementations. I was running up against
  // the MaxInlineLevel limit because too many tiers were involved in reading from an hfile. It
  // was also hard to navigate the source code with so many classes participating in a read.
  private static final Logger LOG = LoggerFactory.getLogger(HFileReaderImpl.class);

  /** Data block index reader keeping the root data index in memory */
  protected HFileBlockIndex.CellBasedKeyBlockIndexReader dataBlockIndexReader;

  /** Meta block index reader -- always single level */
  protected HFileBlockIndex.ByteArrayKeyBlockIndexReader metaBlockIndexReader;

  protected FixedFileTrailer trailer;

  private final boolean primaryReplicaReader;

  /**
   * What kind of data block encoding should be used while reading, writing, and handling cache.
   */
  protected HFileDataBlockEncoder dataBlockEncoder = NoOpDataBlockEncoder.INSTANCE;

  /** Block cache configuration. */
  protected final CacheConfig cacheConf;

  protected ReaderContext context;

  protected final HFileInfo fileInfo;

  /** Path of file */
  protected final Path path;

  /** File name to be used for block names */
  protected final String name;

  private Configuration conf;

  protected HFileContext hfileContext;

  /** Filesystem-level block reader. */
  protected HFileBlock.FSReader fsBlockReader;

  /**
   * A "sparse lock" implementation that allows locking on a particular block, identified by its
   * offset. The purpose of this is to avoid two clients loading the same block and instead have
   * all but one client wait to get the block from the cache.
   */
  private IdLock offsetLock = new IdLock();
  /** Minimum minor version supported by this HFile format */
  static final int MIN_MINOR_VERSION = 0;

  /** Maximum minor version supported by this HFile format */
  // We went to version 2 when we moved to pb'ing fileinfo and the trailer on
  // the file. This version can read Writables version 1.
  static final int MAX_MINOR_VERSION = 3;

  /** Minor versions starting with this number have faked index key */
  static final int MINOR_VERSION_WITH_FAKED_KEY = 3;

  /**
   * Opens an HFile.
   * @param context   Reader context info
   * @param fileInfo  HFile info
   * @param cacheConf Cache configuration.
   * @param conf      Configuration
   */
  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "URF_UNREAD_PUBLIC_OR_PROTECTED_FIELD")
  public HFileReaderImpl(ReaderContext context, HFileInfo fileInfo, CacheConfig cacheConf,
    Configuration conf) throws IOException {
    this.cacheConf = cacheConf;
    this.context = context;
    this.path = context.getFilePath();
    this.name = path.getName();
    this.conf = conf;
    this.primaryReplicaReader = context.isPrimaryReplicaReader();
    this.fileInfo = fileInfo;
    this.trailer = fileInfo.getTrailer();
    this.hfileContext = fileInfo.getHFileContext();
    this.fsBlockReader =
      new HFileBlock.FSReaderImpl(context, hfileContext, cacheConf.getByteBuffAllocator(), conf);
    this.dataBlockEncoder = HFileDataBlockEncoderImpl.createFromFileInfo(fileInfo);
    fsBlockReader.setDataBlockEncoder(dataBlockEncoder, conf);
    dataBlockIndexReader = fileInfo.getDataBlockIndexReader();
    metaBlockIndexReader = fileInfo.getMetaBlockIndexReader();
  }

  @SuppressWarnings("serial")
  public static class BlockIndexNotLoadedException extends IllegalStateException {
    public BlockIndexNotLoadedException(Path path) {
      // Add a message in case anyone relies on it as opposed to class name.
      super(path + " block index not loaded");
    }
  }

  public CacheConfig getCacheConf() {
    return cacheConf;
  }

  private Optional<String> toStringFirstKey() {
    return getFirstKey().map(CellUtil::getCellKeyAsString);
  }

  private Optional<String> toStringLastKey() {
    return getLastKey().map(CellUtil::getCellKeyAsString);
  }

  @Override
  public String toString() {
    return "reader=" + path.toString()
      + (!isFileInfoLoaded()
        ? ""
        : ", compression=" + trailer.getCompressionCodec().getName() + ", cacheConf=" + cacheConf
          + ", firstKey=" + toStringFirstKey() + ", lastKey=" + toStringLastKey())
      + ", avgKeyLen=" + fileInfo.getAvgKeyLen() + ", avgValueLen=" + fileInfo.getAvgValueLen()
      + ", entries=" + trailer.getEntryCount() + ", length=" + context.getFileSize();
  }

  @Override
  public long length() {
    return context.getFileSize();
  }

  /**
   * @return the first key in the file. May be null if file has no entries. Note that this is not
   *         the first row key, but rather the byte form of the first KeyValue.
   */
  @Override
  public Optional<ExtendedCell> getFirstKey() {
    if (dataBlockIndexReader == null) {
      throw new BlockIndexNotLoadedException(path);
    }
    return dataBlockIndexReader.isEmpty()
      ? Optional.empty()
      : Optional.of(dataBlockIndexReader.getRootBlockKey(0));
  }

  /**
   * TODO left from {@link HFile} version 1: move this to StoreFile after Ryan's patch goes in to
   * eliminate {@link KeyValue} here.
   * @return the first row key, or null if the file is empty.
   */
  @Override
  public Optional<byte[]> getFirstRowKey() {
    // We have to copy the row part to form the row key alone
    return getFirstKey().map(CellUtil::cloneRow);
  }

  /**
   * TODO left from {@link HFile} version 1: move this to StoreFile after Ryan's patch goes in to
   * eliminate {@link KeyValue} here.
   * @return the last row key, or null if the file is empty.
   */
  @Override
  public Optional<byte[]> getLastRowKey() {
    // We have to copy the row part to form the row key alone
    return getLastKey().map(CellUtil::cloneRow);
  }

  /** Returns number of KV entries in this HFile */
  @Override
  public long getEntries() {
    return trailer.getEntryCount();
  }

  /** Returns comparator */
  @Override
  public CellComparator getComparator() {
    return this.hfileContext.getCellComparator();
  }

  public Compression.Algorithm getCompressionAlgorithm() {
    return trailer.getCompressionCodec();
  }

  /**
   * @return the total heap size of data and meta block indexes in bytes. Does not take into account
   *         non-root blocks of a multilevel data index.
   */
  @Override
  public long indexSize() {
    return (dataBlockIndexReader != null ? dataBlockIndexReader.heapSize() : 0)
      + ((metaBlockIndexReader != null) ? metaBlockIndexReader.heapSize() : 0);
  }

  @Override
  public String getName() {
    return name;
  }

  @Override
  public void setDataBlockEncoder(HFileDataBlockEncoder dataBlockEncoder) {
    this.dataBlockEncoder = dataBlockEncoder;
    this.fsBlockReader.setDataBlockEncoder(dataBlockEncoder, conf);
  }

  @Override
  public void setDataBlockIndexReader(HFileBlockIndex.CellBasedKeyBlockIndexReader reader) {
    this.dataBlockIndexReader = reader;
  }

  @Override
  public HFileBlockIndex.CellBasedKeyBlockIndexReader getDataBlockIndexReader() {
    return dataBlockIndexReader;
  }

  @Override
  public void setMetaBlockIndexReader(HFileBlockIndex.ByteArrayKeyBlockIndexReader reader) {
    this.metaBlockIndexReader = reader;
  }

  @Override
  public HFileBlockIndex.ByteArrayKeyBlockIndexReader getMetaBlockIndexReader() {
    return metaBlockIndexReader;
  }

  @Override
  public FixedFileTrailer getTrailer() {
    return trailer;
  }

  @Override
  public ReaderContext getContext() {
    return this.context;
  }

  @Override
  public HFileInfo getHFileInfo() {
    return this.fileInfo;
  }

  @Override
  public boolean isPrimaryReplicaReader() {
    return primaryReplicaReader;
  }

  /**
   * An exception thrown when an operation requiring a scanner to be seeked is invoked on a scanner
   * that is not seeked.
   */
  @SuppressWarnings("serial")
  public static class NotSeekedException extends IllegalStateException {
    public NotSeekedException(Path path) {
      super(path + " not seeked to a key/value");
    }
  }

  public static class HFileScannerImpl implements HFileScanner {
    private ByteBuff blockBuffer;
    protected final boolean cacheBlocks;
    protected final boolean pread;
    protected final boolean isCompaction;
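    // Cursor state for the cell the scanner is currently positioned on: lengths of its key,
    // value, tags and serialized mvcc/memstoreTS fields, plus the cached row length. All of
    // these are decoded from the block buffer by readKeyValueLen().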
    private int currKeyLen;
    private int currValueLen;
    private int currMemstoreTSLen;
    private long currMemstoreTS;
    protected final HFile.Reader reader;
    private int currTagsLen;
    private short rowLen;
    // buffer backed keyonlyKV
    private ByteBufferKeyOnlyKeyValue bufBackedKeyOnlyKv = new ByteBufferKeyOnlyKeyValue();
    // A pair for reusing in blockSeek() so that we don't generate lots of garbage objects
    final ObjectIntPair<ByteBuffer> pair = new ObjectIntPair<>();

    /**
     * The next indexed key is to keep track of the indexed key of the next data block. If the
     * nextIndexedKey is HConstants.NO_NEXT_INDEXED_KEY, it means that the current data block is the
     * last data block. If the nextIndexedKey is null, it means the nextIndexedKey has not been
     * loaded yet.
     */
    protected ExtendedCell nextIndexedKey;

    // Current block being used. NOTICE: DON'T release curBlock separately except in shipped() or
    // close() methods. Because the shipped() or close() will do the release finally, even if any
    // exception occurs the curBlock will be released by the close() method (see
    // RegionScannerImpl#handleException). Please call releaseIfNotCurBlock() to release the
    // unreferenced block.
    protected HFileBlock curBlock;
    // Whether we returned a result for curBlock's size in recordBlockSize().
    // Gets reset whenever curBlock is changed.
    private boolean providedCurrentBlockSize = false;

    public HFileBlock getCurBlock() {
      return curBlock;
    }

    // Previous blocks that were used in the course of the read
    protected final ArrayList<HFileBlock> prevBlocks = new ArrayList<>();

    public HFileScannerImpl(final HFile.Reader reader, final boolean cacheBlocks,
      final boolean pread, final boolean isCompaction) {
      this.reader = reader;
      this.cacheBlocks = cacheBlocks;
      this.pread = pread;
      this.isCompaction = isCompaction;
    }

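    // Makes the given block the current block. The previous current block, if it was backed by
    // shared memory, is stashed in prevBlocks so it can be released later by returnBlocks().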
    void updateCurrBlockRef(HFileBlock block) {
      if (block != null && curBlock != null && block.getOffset() == curBlock.getOffset()) {
        return;
      }
      if (this.curBlock != null && this.curBlock.isSharedMem()) {
        prevBlocks.add(this.curBlock);
      }
      this.curBlock = block;
      this.providedCurrentBlockSize = false;
    }

    void reset() {
      // We don't have to keep ref to heap block
      if (this.curBlock != null && this.curBlock.isSharedMem()) {
        this.prevBlocks.add(this.curBlock);
      }
      this.curBlock = null;
    }

    private void returnBlocks(boolean returnAll) {
      this.prevBlocks.forEach(HFileBlock::release);
      this.prevBlocks.clear();
      if (returnAll && this.curBlock != null) {
        this.curBlock.release();
        this.curBlock = null;
      }
    }

    @Override
    public boolean isSeeked() {
      return blockBuffer != null;
    }

    @Override
    public String toString() {
      return "HFileScanner for reader " + String.valueOf(getReader());
    }

    protected void assertSeeked() {
      if (!isSeeked()) {
        throw new NotSeekedException(reader.getPath());
      }
    }

    @Override
    public HFile.Reader getReader() {
      return reader;
    }

    // From non-encoded HFiles, we always read back KeyValue or one of its descendants. (Note:
    // when the HFile block is in a DBB, it will be an OffheapKV.) So all parts of the Cell are in
    // a contiguous array/buffer. How many bytes we should wrap to make the KV is what this method
    // returns.
    private int getKVBufSize() {
      int kvBufSize = KEY_VALUE_LEN_SIZE + currKeyLen + currValueLen;
      if (currTagsLen > 0) {
        kvBufSize += Bytes.SIZEOF_SHORT + currTagsLen;
      }
      return kvBufSize;
    }

    @Override
    public void close() {
      if (!pread) {
        // For seek + pread, the stream socket should be closed when the scanner is closed.
        // HBASE-9393
        reader.unbufferStream();
      }
      this.returnBlocks(true);
    }

    @Override
    public void recordBlockSize(IntConsumer blockSizeConsumer) {
      if (!providedCurrentBlockSize && curBlock != null) {
        providedCurrentBlockSize = true;
        blockSizeConsumer.accept(curBlock.getUncompressedSizeWithoutHeader());
      }
    }

    // Returns the #bytes in the HFile for the current cell. Used to skip that many bytes in the
    // current HFile block's buffer so as to position at the next cell.
    private int getCurCellSerializedSize() {
      int curCellSize = KEY_VALUE_LEN_SIZE + currKeyLen + currValueLen + currMemstoreTSLen;
      if (this.reader.getFileContext().isIncludesTags()) {
        curCellSize += Bytes.SIZEOF_SHORT + currTagsLen;
      }
      return curCellSize;
    }

    protected void readKeyValueLen() {
      // This is a hot method. We go out of our way to make this method short so it can be
      // inlined and is not too big to compile. We also manage position in ByteBuffer ourselves
      // because it is faster than going via range-checked ByteBuffer methods or going through a
      // byte buffer array a byte at a time.
      // Get a long at a time rather than read two individual ints. In micro-benchmarking, even
      // with the extra bit-fiddling, this is order-of-magnitude faster than getting two ints.
      // Trying to imitate what was done before; need to profile whether this is better, or
      // whether the earlier way of doing mark and reset is better.
      // But ensure that you read a long instead of two ints.
      long ll = blockBuffer.getLongAfterPosition(0);
      // Read top half as an int of key length and bottom int as value length
      this.currKeyLen = (int) (ll >> Integer.SIZE);
      this.currValueLen = (int) (Bytes.MASK_FOR_LOWER_INT_IN_LONG ^ ll);
      checkKeyValueLen();
      this.rowLen = blockBuffer.getShortAfterPosition(Bytes.SIZEOF_LONG);
      // Move position past the key and value lengths and then beyond the key and value
      int p = (Bytes.SIZEOF_LONG + currKeyLen + currValueLen);
      if (reader.getFileContext().isIncludesTags()) {
        // Tags length is a short.
        this.currTagsLen = blockBuffer.getShortAfterPosition(p);
        checkTagsLen();
        p += (Bytes.SIZEOF_SHORT + currTagsLen);
      }
      readMvccVersion(p);
    }

    private final void checkTagsLen() {
      if (checkLen(this.currTagsLen)) {
        throw new IllegalStateException(
          "Invalid currTagsLen " + this.currTagsLen + ". Block offset: " + curBlock.getOffset()
            + ", block length: " + this.blockBuffer.limit() + ", position: "
            + this.blockBuffer.position() + " (without header)." + " path=" + reader.getPath());
      }
    }

    /**
     * Read mvcc. Does checks to see if we even need to read the mvcc at all.
     */
    protected void readMvccVersion(final int offsetFromPos) {
      // See if we even need to decode mvcc.
      if (!this.reader.getHFileInfo().shouldIncludeMemStoreTS()) {
        return;
      }
      if (!this.reader.getHFileInfo().isDecodeMemstoreTS()) {
        currMemstoreTS = 0;
        currMemstoreTSLen = 1;
        return;
      }
      _readMvccVersion(offsetFromPos);
    }

    /**
     * Actually do the mvcc read. Does no checks.
     */
    private void _readMvccVersion(int offsetFromPos) {
      // This is Bytes#bytesToVint inlined so we can save a few instructions in this hot method;
      // i.e. previously, if it was a one-byte vint, we'd redo the vint call just to find its size.
      // Also the method is kept small so it can be inlined.
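      // Hadoop vint layout: a value in [-112, 127] fits entirely in the first byte; otherwise
      // the first byte only encodes sign and payload length (1-8 bytes), and the payload follows
      // big-endian. The code below reassembles the payload an int, a short, then a byte at a
      // time to minimize buffer reads.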
      byte firstByte = blockBuffer.getByteAfterPosition(offsetFromPos);
      int len = WritableUtils.decodeVIntSize(firstByte);
      if (len == 1) {
        this.currMemstoreTS = firstByte;
      } else {
        int remaining = len - 1;
        long i = 0;
        offsetFromPos++;
        if (remaining >= Bytes.SIZEOF_INT) {
          // The int read has to be converted to an unsigned long, hence the & op
          i = (blockBuffer.getIntAfterPosition(offsetFromPos) & 0x00000000ffffffffL);
          remaining -= Bytes.SIZEOF_INT;
          offsetFromPos += Bytes.SIZEOF_INT;
        }
        if (remaining >= Bytes.SIZEOF_SHORT) {
          short s = blockBuffer.getShortAfterPosition(offsetFromPos);
          i = i << 16;
          i = i | (s & 0xFFFF);
          remaining -= Bytes.SIZEOF_SHORT;
          offsetFromPos += Bytes.SIZEOF_SHORT;
        }
        for (int idx = 0; idx < remaining; idx++) {
          byte b = blockBuffer.getByteAfterPosition(offsetFromPos + idx);
          i = i << 8;
          i = i | (b & 0xFF);
        }
        currMemstoreTS = (WritableUtils.isNegativeVInt(firstByte) ? ~i : i);
      }
      this.currMemstoreTSLen = len;
    }

    /**
     * Within a loaded block, seek looking for the last key that is smaller than (or equal to?) the
     * key we are interested in. A note on seekBefore: if seekBefore is true AND the first key in
     * the block equals the given key, then an exception is thrown. The caller has to check for
     * that case and load the previous block as appropriate.
     * @param key        the key to find
     * @param seekBefore find the key before the given key in case of an exact match
     * @return 0 in case of an exact key match, 1 in case of an inexact match, -2 in case of an
     *         inexact match where, furthermore, the input key is less than the first key of the
     *         current block (e.g. using a faked index key)
     */
    protected int blockSeek(Cell key, boolean seekBefore) {
      int klen, vlen, tlen = 0;
      int lastKeyValueSize = -1;
      int offsetFromPos;
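      // Walk the cells in this block linearly: decode each cell's key/value (and optionally tags
      // and mvcc) lengths, compare its key against the target, and either stop on a match or an
      // overshoot, or skip forward by the cell's serialized size and continue.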
      do {
        offsetFromPos = 0;
        // Better to ensure that we use the BB Utils here
        long ll = blockBuffer.getLongAfterPosition(offsetFromPos);
        klen = (int) (ll >> Integer.SIZE);
        vlen = (int) (Bytes.MASK_FOR_LOWER_INT_IN_LONG ^ ll);
        if (checkKeyLen(klen) || checkLen(vlen)) {
          throw new IllegalStateException(
            "Invalid klen " + klen + " or vlen " + vlen + ". Block offset: " + curBlock.getOffset()
              + ", block length: " + blockBuffer.limit() + ", position: " + blockBuffer.position()
              + " (without header)." + " path=" + reader.getPath());
        }
        offsetFromPos += Bytes.SIZEOF_LONG;
        this.rowLen = blockBuffer.getShortAfterPosition(offsetFromPos);
        blockBuffer.asSubByteBuffer(blockBuffer.position() + offsetFromPos, klen, pair);
        bufBackedKeyOnlyKv.setKey(pair.getFirst(), pair.getSecond(), klen, rowLen);
        int comp =
          PrivateCellUtil.compareKeyIgnoresMvcc(reader.getComparator(), key, bufBackedKeyOnlyKv);
        offsetFromPos += klen + vlen;
        if (this.reader.getFileContext().isIncludesTags()) {
          // Read short as unsigned, high byte first
          tlen = ((blockBuffer.getByteAfterPosition(offsetFromPos) & 0xff) << 8)
            ^ (blockBuffer.getByteAfterPosition(offsetFromPos + 1) & 0xff);
          if (checkLen(tlen)) {
            throw new IllegalStateException("Invalid tlen " + tlen + ". Block offset: "
              + curBlock.getOffset() + ", block length: " + blockBuffer.limit() + ", position: "
              + blockBuffer.position() + " (without header)." + " path=" + reader.getPath());
          }
          // add the two bytes read for the tags.
          offsetFromPos += tlen + (Bytes.SIZEOF_SHORT);
        }
        if (this.reader.getHFileInfo().shouldIncludeMemStoreTS()) {
          // Directly read the mvcc based on current position
          readMvccVersion(offsetFromPos);
        }
        if (comp == 0) {
          if (seekBefore) {
            if (lastKeyValueSize < 0) {
              throw new IllegalStateException("blockSeek with seekBefore "
                + "at the first key of the block: key=" + CellUtil.getCellKeyAsString(key)
                + ", blockOffset=" + curBlock.getOffset() + ", onDiskSize="
                + curBlock.getOnDiskSizeWithHeader() + ", path=" + reader.getPath());
            }
            blockBuffer.moveBack(lastKeyValueSize);
            readKeyValueLen();
            return 1; // non exact match.
          }
          currKeyLen = klen;
          currValueLen = vlen;
          currTagsLen = tlen;
          return 0; // indicate exact match
        } else if (comp < 0) {
          if (lastKeyValueSize > 0) {
            blockBuffer.moveBack(lastKeyValueSize);
          }
          readKeyValueLen();
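          // The target key sorts before the very first cell of this block; this happens when the
          // seek used a faked (shortened) index key, so signal it with INDEX_KEY_MAGIC (-2).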
          if (lastKeyValueSize == -1 && blockBuffer.position() == 0) {
            return HConstants.INDEX_KEY_MAGIC;
          }
          return 1;
        }
        // The size of this key/value tuple, including key/value length fields.
        lastKeyValueSize = klen + vlen + currMemstoreTSLen + KEY_VALUE_LEN_SIZE;
        // include tag length also if tags included with KV
        if (reader.getFileContext().isIncludesTags()) {
          lastKeyValueSize += tlen + Bytes.SIZEOF_SHORT;
        }
        blockBuffer.skip(lastKeyValueSize);
      } while (blockBuffer.hasRemaining());

      // Seek to the last key we successfully read. This will happen if this is
      // the last key/value pair in the file, in which case the following call
      // to next() has to return false.
      blockBuffer.moveBack(lastKeyValueSize);
      readKeyValueLen();
      return 1; // didn't exactly find it.
    }

    @Override
    public ExtendedCell getNextIndexedKey() {
      return nextIndexedKey;
    }

    @Override
    public int seekTo(ExtendedCell key) throws IOException {
      return seekTo(key, true);
    }

    @Override
    public int reseekTo(ExtendedCell key) throws IOException {
      int compared;
      if (isSeeked()) {
        compared = compareKey(reader.getComparator(), key);
        if (compared < 1) {
          // If the required key is less than or equal to current key, then
          // don't do anything.
          return compared;
        } else {
          // The comparison with no_next_index_key has to be checked
          if (
            this.nextIndexedKey != null && (this.nextIndexedKey
                == KeyValueScanner.NO_NEXT_INDEXED_KEY
              || PrivateCellUtil.compareKeyIgnoresMvcc(reader.getComparator(), key, nextIndexedKey)
                  < 0)
          ) {
            // The reader shall continue to scan the current data block instead of querying the
            // block index as long as it knows the target key is strictly smaller than the next
            // indexed key or the current data block is the last data block.
            return loadBlockAndSeekToKey(this.curBlock, nextIndexedKey, false, key, false);
          }
        }
      }
      // Don't rewind on a reseek operation, because reseek implies that we are
      // always going forward in the file.
      return seekTo(key, false);
    }

    /**
     * An internal API function. Seek to the given key, optionally rewinding to the first key of the
     * block before doing the seek.
     * @param key    - a cell representing the key that we need to fetch
     * @param rewind whether to rewind to the first key of the block before doing the seek. If this
     *               is false, we are assuming we never go back, otherwise the result is undefined.
     * @return -1 if the key is earlier than the first key of the file, 0 if we are at the given
     *         key, 1 if we are past the given key, -2 if the key is earlier than the first key of
     *         the file while using a faked index key
     */
    public int seekTo(ExtendedCell key, boolean rewind) throws IOException {
      HFileBlockIndex.BlockIndexReader indexReader = reader.getDataBlockIndexReader();
      BlockWithScanInfo blockWithScanInfo = indexReader.loadDataBlockWithScanInfo(key, curBlock,
        cacheBlocks, pread, isCompaction, getEffectiveDataBlockEncoding(), reader);
      if (blockWithScanInfo == null || blockWithScanInfo.getHFileBlock() == null) {
        // This happens if the key e.g. falls before the beginning of the file.
        return -1;
      }
      return loadBlockAndSeekToKey(blockWithScanInfo.getHFileBlock(),
        blockWithScanInfo.getNextIndexedKey(), rewind, key, false);
    }

    @Override
    public boolean seekBefore(ExtendedCell key) throws IOException {
      HFileBlock seekToBlock = reader.getDataBlockIndexReader().seekToDataBlock(key, curBlock,
        cacheBlocks, pread, isCompaction, reader.getEffectiveEncodingInCache(isCompaction), reader);
      if (seekToBlock == null) {
        return false;
      }
      ExtendedCell firstKey = getFirstKeyCellInBlock(seekToBlock);
      if (PrivateCellUtil.compareKeyIgnoresMvcc(reader.getComparator(), firstKey, key) >= 0) {
        long previousBlockOffset = seekToBlock.getPrevBlockOffset();
        // The key we are interested in
        if (previousBlockOffset == -1) {
          // we have a 'problem', the key we want is the first of the file.
          releaseIfNotCurBlock(seekToBlock);
          return false;
        }

        // The first key in the current block 'seekToBlock' is greater than the given
        // seekBefore key. We will go ahead by reading the next block that satisfies the
        // given key. Return the current block before reading the next one.
        releaseIfNotCurBlock(seekToBlock);
        // It is important that we compute and pass onDiskSize to the block
        // reader so that it does not have to read the header separately to
        // figure out the size. Currently, we do not have a way to do this
        // correctly in the general case however.
        // TODO: See https://issues.apache.org/jira/browse/HBASE-14576
        int prevBlockSize = -1;
        seekToBlock = reader.readBlock(previousBlockOffset, prevBlockSize, cacheBlocks, pread,
          isCompaction, true, BlockType.DATA, getEffectiveDataBlockEncoding());
        // TODO shortcut: seek forward in this block to the last key of the
        // block.
      }
      loadBlockAndSeekToKey(seekToBlock, firstKey, true, key, true);
      return true;
    }

    /**
     * The curBlock will be released by the shipped() or close() method, so we only need to
     * consider releasing a block which was read from the HFile before and is not referenced by
     * curBlock.
     */
    protected void releaseIfNotCurBlock(HFileBlock block) {
      if (curBlock != block) {
        block.release();
      }
    }

    /**
     * Scans blocks in the "scanned" section of the {@link HFile} until the next data block is
     * found.
     * @return the next block, or null if there are no more data blocks
     */
    @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NP_NULL_ON_SOME_PATH",
        justification = "Yeah, unnecessary null check; could do w/ clean up")
    protected HFileBlock readNextDataBlock() throws IOException {
      long lastDataBlockOffset = reader.getTrailer().getLastDataBlockOffset();
      if (curBlock == null) {
        return null;
      }
      HFileBlock block = this.curBlock;
      do {
        if (block.getOffset() >= lastDataBlockOffset) {
          releaseIfNotCurBlock(block);
          return null;
        }
        if (block.getOffset() < 0) {
          releaseIfNotCurBlock(block);
          throw new IOException("Invalid block offset: " + block + ", path=" + reader.getPath());
        }
        // We are reading the next block without block type validation, because
        // it might turn out to be a non-data block.
        block = reader.readBlock(block.getOffset() + block.getOnDiskSizeWithHeader(),
          block.getNextBlockOnDiskSize(), cacheBlocks, pread, isCompaction, true, null,
          getEffectiveDataBlockEncoding());
        if (block != null && !block.getBlockType().isData()) {
          // We will only return the block if it is a data block; release any
          // non-data block we read on the way.
          block.release();
        }
      } while (!block.getBlockType().isData());
      return block;
    }

    public DataBlockEncoding getEffectiveDataBlockEncoding() {
      return this.reader.getEffectiveEncodingInCache(isCompaction);
    }

    @Override
    public ExtendedCell getCell() {
      if (!isSeeked()) {
        return null;
      }

      ExtendedCell ret;
      int cellBufSize = getKVBufSize();
      long seqId = 0L;
      if (this.reader.getHFileInfo().shouldIncludeMemStoreTS()) {
        seqId = currMemstoreTS;
      }
      if (blockBuffer.hasArray()) {
        // TODO : reduce the varieties of KV here. Check if based on a boolean
        // we can handle the 'no tags' case.
        if (currTagsLen > 0) {
          ret = new SizeCachedKeyValue(blockBuffer.array(),
            blockBuffer.arrayOffset() + blockBuffer.position(), cellBufSize, seqId, currKeyLen,
            rowLen);
        } else {
          ret = new SizeCachedNoTagsKeyValue(blockBuffer.array(),
            blockBuffer.arrayOffset() + blockBuffer.position(), cellBufSize, seqId, currKeyLen,
            rowLen);
        }
      } else {
        ByteBuffer buf = blockBuffer.asSubByteBuffer(cellBufSize);
        if (buf.isDirect()) {
          ret = currTagsLen > 0
            ? new SizeCachedByteBufferKeyValue(buf, buf.position(), cellBufSize, seqId, currKeyLen,
              rowLen)
            : new SizeCachedNoTagsByteBufferKeyValue(buf, buf.position(), cellBufSize, seqId,
              currKeyLen, rowLen);
        } else {
          if (currTagsLen > 0) {
            ret = new SizeCachedKeyValue(buf.array(), buf.arrayOffset() + buf.position(),
              cellBufSize, seqId, currKeyLen, rowLen);
          } else {
            ret = new SizeCachedNoTagsKeyValue(buf.array(), buf.arrayOffset() + buf.position(),
              cellBufSize, seqId, currKeyLen, rowLen);
          }
        }
      }
      return ret;
    }

    @Override
    public ExtendedCell getKey() {
      assertSeeked();
      // Create a new object so that this getKey is cached as firstKey, lastKey
      ObjectIntPair<ByteBuffer> keyPair = new ObjectIntPair<>();
      blockBuffer.asSubByteBuffer(blockBuffer.position() + KEY_VALUE_LEN_SIZE, currKeyLen, keyPair);
      ByteBuffer keyBuf = keyPair.getFirst();
      if (keyBuf.hasArray()) {
        return new KeyValue.KeyOnlyKeyValue(keyBuf.array(),
          keyBuf.arrayOffset() + keyPair.getSecond(), currKeyLen);
      } else {
        // Better to do a copy here instead of holding on to this BB so that
        // we can release the blocks referring to this key. This key is specifically used
        // in HalfStoreFileReader to get the firstkey and lastkey by creating a new scanner
        // every time. So holding onto the BB (in case of DBB) is not advised here.
        byte[] key = new byte[currKeyLen];
        ByteBufferUtils.copyFromBufferToArray(key, keyBuf, keyPair.getSecond(), 0, currKeyLen);
        return new KeyValue.KeyOnlyKeyValue(key, 0, currKeyLen);
      }
    }

    @Override
    public ByteBuffer getValue() {
      assertSeeked();
      // OK to create a new Pair; not used in the hot path
      ObjectIntPair<ByteBuffer> valuePair = new ObjectIntPair<>();
      this.blockBuffer.asSubByteBuffer(blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen,
        currValueLen, valuePair);
      ByteBuffer valBuf = valuePair.getFirst().duplicate();
      valBuf.position(valuePair.getSecond());
      valBuf.limit(currValueLen + valuePair.getSecond());
      return valBuf.slice();
    }

    protected void setNonSeekedState() {
      reset();
      blockBuffer = null;
      currKeyLen = 0;
      currValueLen = 0;
      currMemstoreTS = 0;
      currMemstoreTSLen = 0;
      currTagsLen = 0;
    }

    /**
     * Set the position on current backing blockBuffer.
     */
    private void positionThisBlockBuffer() {
      try {
        blockBuffer.skip(getCurCellSerializedSize());
      } catch (IllegalArgumentException e) {
        LOG.error("Current pos = " + blockBuffer.position() + "; currKeyLen = " + currKeyLen
          + "; currValLen = " + currValueLen + "; block limit = " + blockBuffer.limit()
          + "; currBlock currBlockOffset = " + this.curBlock.getOffset() + "; path="
          + reader.getPath());
        throw e;
      }
    }

    /**
     * Set ourselves up for the next 'next' invocation; set up the next block.
     * @return true if there is more to read, false if at the end.
     */
    private boolean positionForNextBlock() throws IOException {
      // Methods are small so they get inlined because they are 'hot'.
      long lastDataBlockOffset = reader.getTrailer().getLastDataBlockOffset();
      if (this.curBlock.getOffset() >= lastDataBlockOffset) {
        setNonSeekedState();
        return false;
      }
      return isNextBlock();
    }

    private boolean isNextBlock() throws IOException {
      // Methods are small so they get inlined because they are 'hot'.
      HFileBlock nextBlock = readNextDataBlock();
      if (nextBlock == null) {
        setNonSeekedState();
        return false;
      }
      updateCurrentBlock(nextBlock);
      return true;
    }

    private final boolean _next() throws IOException {
      // Small method so can be inlined. It is a hot one.
      if (blockBuffer.remaining() <= 0) {
        return positionForNextBlock();
      }

      // We are still in the same block.
      readKeyValueLen();
      return true;
    }

    /**
     * Go to the next key/value in the block section. Loads the next block if necessary. If
     * successful, {@link #getKey()} and {@link #getValue()} can be called.
     * @return true if successfully navigated to the next key/value
     */
    @Override
    public boolean next() throws IOException {
      // This is a hot method so extreme measures taken to ensure it is small and inlineable.
      // Checked by setting: -XX:+UnlockDiagnosticVMOptions -XX:+PrintInlining -XX:+PrintCompilation
      assertSeeked();
      positionThisBlockBuffer();
      return _next();
    }

    /**
     * Positions this scanner at the start of the file.
     * @return false if empty file; i.e. a call to next would return false and the current key and
     *         value are undefined.
     */
    @Override
    public boolean seekTo() throws IOException {
      if (reader == null) {
        return false;
      }

      if (reader.getTrailer().getEntryCount() == 0) {
        // No data blocks.
        return false;
      }

      long firstDataBlockOffset = reader.getTrailer().getFirstDataBlockOffset();
      if (curBlock != null && curBlock.getOffset() == firstDataBlockOffset) {
        return processFirstDataBlock();
      }

      readAndUpdateNewBlock(firstDataBlockOffset);
      return true;
    }

    protected boolean processFirstDataBlock() throws IOException {
      blockBuffer.rewind();
      readKeyValueLen();
      return true;
    }

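    // Reads the block at the given offset, makes it the current block, and positions the scanner
    // at the block's first cell.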
    protected void readAndUpdateNewBlock(long firstDataBlockOffset) throws IOException {
      HFileBlock newBlock = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
        isCompaction, true, BlockType.DATA, getEffectiveDataBlockEncoding());
      if (newBlock.getOffset() < 0) {
        releaseIfNotCurBlock(newBlock);
        throw new IOException(
          "Invalid offset=" + newBlock.getOffset() + ", path=" + reader.getPath());
      }
      updateCurrentBlock(newBlock);
    }

    protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, ExtendedCell nextIndexedKey,
      boolean rewind, ExtendedCell key, boolean seekBefore) throws IOException {
      if (this.curBlock == null || this.curBlock.getOffset() != seekToBlock.getOffset()) {
        updateCurrentBlock(seekToBlock);
      } else if (rewind) {
        blockBuffer.rewind();
      }
      // Update the nextIndexedKey
      this.nextIndexedKey = nextIndexedKey;
      return blockSeek(key, seekBefore);
    }

    /** Returns True if v &lt;= 0 or v &gt; current block buffer limit. */
    protected final boolean checkKeyLen(final int v) {
      return v <= 0 || v > this.blockBuffer.limit();
    }

    /** Returns True if v &lt; 0 or v &gt; current block buffer limit. */
    protected final boolean checkLen(final int v) {
      return v < 0 || v > this.blockBuffer.limit();
    }

    /**
     * Check key and value lengths are wholesome.
     */
    protected final void checkKeyValueLen() {
      if (checkKeyLen(this.currKeyLen) || checkLen(this.currValueLen)) {
        throw new IllegalStateException("Invalid currKeyLen " + this.currKeyLen
          + " or currValueLen " + this.currValueLen + ". Block offset: " + this.curBlock.getOffset()
          + ", block length: " + this.blockBuffer.limit() + ", position: "
          + this.blockBuffer.position() + " (without header)." + ", path=" + reader.getPath());
      }
    }

    /**
     * Updates the current block to be the given {@link HFileBlock}. Seeks to the first key/value
     * pair.
     * @param newBlock the block read by {@link HFileReaderImpl#readBlock}; it's a totally new block
     *                 with a newly allocated {@link ByteBuff}, so if there is no further reference
     *                 to this block, we should release it carefully.
     */
    protected void updateCurrentBlock(HFileBlock newBlock) throws IOException {
      try {
        if (newBlock.getBlockType() != BlockType.DATA) {
          throw new IllegalStateException(
            "ScannerV2 works only on data blocks, got " + newBlock.getBlockType() + "; "
              + "HFileName=" + reader.getPath() + ", " + "dataBlockEncoder="
              + reader.getDataBlockEncoding() + ", " + "isCompaction=" + isCompaction);
        }
        updateCurrBlockRef(newBlock);
        blockBuffer = newBlock.getBufferWithoutHeader();
        readKeyValueLen();
      } finally {
        releaseIfNotCurBlock(newBlock);
      }
      // Reset the next indexed key
      this.nextIndexedKey = null;
    }

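    /**
     * Extracts the first key from the given data block by reading the key length from the block's
     * header-less buffer and wrapping the key bytes, without touching the scanner's own state.
     */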
    protected ExtendedCell getFirstKeyCellInBlock(HFileBlock curBlock) {
      ByteBuff buffer = curBlock.getBufferWithoutHeader();
      // It is safe to manipulate this buffer because we own the buffer object.
      buffer.rewind();
      int klen = buffer.getInt();
      buffer.skip(Bytes.SIZEOF_INT); // Skip value len part
      ByteBuffer keyBuff = buffer.asSubByteBuffer(klen);
      if (keyBuff.hasArray()) {
        return new KeyValue.KeyOnlyKeyValue(keyBuff.array(),
          keyBuff.arrayOffset() + keyBuff.position(), klen);
      } else {
        return new ByteBufferKeyOnlyKeyValue(keyBuff, keyBuff.position(), klen);
      }
    }

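    /**
     * Compares the given key against the key of the cell the scanner is currently positioned on.
     * @return a negative value if the given key is smaller, 0 if equal, positive if larger
     */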
    public int compareKey(CellComparator comparator, ExtendedCell key) {
      blockBuffer.asSubByteBuffer(blockBuffer.position() + KEY_VALUE_LEN_SIZE, currKeyLen, pair);
      this.bufBackedKeyOnlyKv.setKey(pair.getFirst(), pair.getSecond(), currKeyLen, rowLen);
      return PrivateCellUtil.compareKeyIgnoresMvcc(comparator, key, this.bufBackedKeyOnlyKv);
    }

    @Override
    public void shipped() throws IOException {
      this.returnBlocks(false);
    }
  }

  @Override
  public Path getPath() {
    return path;
  }

  @Override
  public DataBlockEncoding getDataBlockEncoding() {
    return dataBlockEncoder.getDataBlockEncoding();
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  /** Minor versions in HFile starting with this number have hbase checksums */
  public static final int MINOR_VERSION_WITH_CHECKSUM = 1;
  /** HFile minor version that does not support checksums */
  public static final int MINOR_VERSION_NO_CHECKSUM = 0;

  /** HFile minor version that introduced the pbuf file trailer */
  public static final int PBUF_TRAILER_MINOR_VERSION = 2;

  /**
   * The size of a (key length, value length) tuple that prefixes each entry in a data block.
   */
  public final static int KEY_VALUE_LEN_SIZE = 2 * Bytes.SIZEOF_INT;

  /**
   * Retrieve block from cache. Validates the retrieved block's type vs {@code expectedBlockType}
   * and its encoding vs. {@code expectedDataBlockEncoding}. Unpacks the block as necessary.
   */
  @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.UNITTEST)
  public HFileBlock getCachedBlock(BlockCacheKey cacheKey, boolean cacheBlock, boolean useLock,
    boolean updateCacheMetrics, BlockType expectedBlockType,
    DataBlockEncoding expectedDataBlockEncoding) throws IOException {
    // Check cache for block. If found return.
    BlockCache cache = cacheConf.getBlockCache().orElse(null);
    long cachedBlockBytesRead = 0;
    if (cache != null) {
      HFileBlock cachedBlock = null;
      boolean isScanMetricsEnabled = ThreadLocalServerSideScanMetrics.isScanMetricsEnabled();
      try {
        cachedBlock = (HFileBlock) cache.getBlock(cacheKey, cacheBlock, useLock, updateCacheMetrics,
          expectedBlockType);
        if (cachedBlock != null) {
          if (cacheConf.shouldCacheCompressed(cachedBlock.getBlockType().getCategory())) {
            HFileBlock compressedBlock = cachedBlock;
            cachedBlock = compressedBlock.unpack(hfileContext, fsBlockReader);
            // In case of compressed block after unpacking we can release the compressed block
            if (compressedBlock != cachedBlock) {
              compressedBlock.release();
            }
          }
          try {
            validateBlockType(cachedBlock, expectedBlockType);
          } catch (IOException e) {
            returnAndEvictBlock(cache, cacheKey, cachedBlock);
            cachedBlock = null;
            throw e;
          }

          if (expectedDataBlockEncoding == null) {
            return cachedBlock;
          }
          DataBlockEncoding actualDataBlockEncoding = cachedBlock.getDataBlockEncoding();
          // Block types other than data blocks always have
          // DataBlockEncoding.NONE. To avoid false negative cache misses, only
          // perform this check if cached block is a data block.
          if (
            cachedBlock.getBlockType().isData()
              && !actualDataBlockEncoding.equals(expectedDataBlockEncoding)
          ) {
            // This mismatch may happen if a Scanner, which is used for say a
            // compaction, tries to read an encoded block from the block cache.
            // The reverse might happen when an EncodedScanner tries to read
            // un-encoded blocks which were cached earlier.
            //
            // Because returning a data block with an implicit BlockType mismatch will cause the
            // requesting scanner to throw, a disk read should be forced here. This will
            // potentially cause a significant number of cache misses, so we should keep track of
            // this as it might justify the work on a CompoundScanner.
            if (
              !expectedDataBlockEncoding.equals(DataBlockEncoding.NONE)
                && !actualDataBlockEncoding.equals(DataBlockEncoding.NONE)
            ) {
              // If the block is encoded but the encoding does not match the
              // expected encoding it is likely the encoding was changed but the
              // block was not yet evicted. Evictions on file close happen async
              // so blocks with the old encoding still linger in cache for some
              // period of time. This event should be rare as it only happens on
              // schema definition change.
              LOG.info(
                "Evicting cached block with key {} because data block encoding mismatch; "
                  + "expected {}, actual {}, path={}",
                cacheKey, expectedDataBlockEncoding, actualDataBlockEncoding, path);
              // This is an error scenario, so here we need to release the block.
              returnAndEvictBlock(cache, cacheKey, cachedBlock);
            }
            cachedBlock = null;
            return null;
          }
          return cachedBlock;
        }
      } finally {
        // Count bytes read as cached block is being returned
        if (isScanMetricsEnabled && cachedBlock != null) {
          cachedBlockBytesRead = cachedBlock.getOnDiskSizeWithHeader();
          // Account for the header size of the next block if it exists
          if (cachedBlock.getNextBlockOnDiskSize() > 0) {
            cachedBlockBytesRead += cachedBlock.headerSize();
          }
        }
        if (cachedBlockBytesRead > 0) {
          ThreadLocalServerSideScanMetrics.addBytesReadFromBlockCache(cachedBlockBytesRead);
        }
      }
    }
    return null;
  }

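  /**
   * Releases the given block and evicts it from the cache; used when a cached block turns out to
   * be unusable (e.g. wrong block type or stale data block encoding).
   */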
  private void returnAndEvictBlock(BlockCache cache, BlockCacheKey cacheKey, Cacheable block) {
    block.release();
    cache.evictBlock(cacheKey);
  }

  /**
   * @param cacheBlock Add block to cache, if found
   * @return block wrapped in a ByteBuffer, with header skipped
   */
  @Override
  public HFileBlock getMetaBlock(String metaBlockName, boolean cacheBlock) throws IOException {
    if (trailer.getMetaIndexCount() == 0) {
      return null; // there are no meta blocks
    }
    if (metaBlockIndexReader == null) {
      throw new IOException(path + " meta index not loaded");
    }

    byte[] mbname = Bytes.toBytes(metaBlockName);
    int block = metaBlockIndexReader.rootBlockContainingKey(mbname, 0, mbname.length);
    if (block == -1) {
      return null;
    }
    long blockSize = metaBlockIndexReader.getRootBlockDataSize(block);

    // Per meta key from any given file, synchronize reads for said block. This
    // is OK to do for meta blocks because the meta block index is always
    // single-level.
    synchronized (metaBlockIndexReader.getRootBlockKey(block)) {
      // Check cache for block. If found return.
      long metaBlockOffset = metaBlockIndexReader.getRootBlockOffset(block);
      BlockCacheKey cacheKey =
        new BlockCacheKey(name, metaBlockOffset, this.isPrimaryReplicaReader(), BlockType.META);

      cacheBlock &=
        cacheConf.shouldCacheBlockOnRead(BlockType.META.getCategory(), getHFileInfo(), conf);
      HFileBlock cachedBlock =
        getCachedBlock(cacheKey, cacheBlock, false, true, BlockType.META, null);
      if (cachedBlock != null) {
        assert cachedBlock.isUnpacked() : "Packed block leak.";
        // Return a distinct 'shallow copy' of the block,
        // so its position does not get messed up by the scanner
        return cachedBlock;
      }
      // Cache miss, please load.

      HFileBlock compressedBlock =
        fsBlockReader.readBlockData(metaBlockOffset, blockSize, true, false, true);
      HFileBlock uncompressedBlock = compressedBlock.unpack(hfileContext, fsBlockReader);
      if (compressedBlock != uncompressedBlock) {
        compressedBlock.release();
      }

      // Cache the block
      if (cacheBlock) {
        cacheConf.getBlockCache().ifPresent(
          cache -> cache.cacheBlock(cacheKey, uncompressedBlock, cacheConf.isInMemory()));
      }
      return uncompressedBlock;
    }
  }

  /**
   * Whether we use heap or not depends on our intent to cache the block. We want to avoid
   * allocating to off-heap if we intend to cache into the on-heap L1 cache. Otherwise, it's more
   * efficient to allocate to off-heap since we can control GC ourselves for those. So our decision
   * here breaks down as follows: <br>
   * If block cache is disabled, don't use heap. If we're not using the CombinedBlockCache, use heap
   * unless caching is disabled for the request. Otherwise, only use heap if caching is enabled and
   * the expected block type is not DATA (which goes to off-heap L2 in combined cache).
   * @see org.apache.hadoop.hbase.io.hfile.HFileBlock.FSReader#readBlockData(long, long, boolean,
   *      boolean, boolean)
   */
  private boolean shouldUseHeap(BlockType expectedBlockType, boolean cacheBlock) {
    if (!cacheConf.getBlockCache().isPresent()) {
      return false;
    }

    // We only cache a block if cacheBlock is true and caching-on-read is enabled in CacheConfig;
    // we can really only check for that if we have an expectedBlockType.
    if (expectedBlockType != null) {
      cacheBlock &= cacheConf.shouldCacheBlockOnRead(expectedBlockType.getCategory());
    }

    if (!cacheConf.isCombinedBlockCache()) {
      // A block to cache in LruBlockCache must be a heap one, if caching is enabled. So just
      // allocate block memory from the heap to save an extra off-heap-to-heap copy in that case.
      return cacheBlock;
    }

    return cacheBlock && expectedBlockType != null && !expectedBlockType.isData();
  }
1284
1285  @Override
1286  public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize, final boolean cacheBlock,
1287    boolean pread, final boolean isCompaction, boolean updateCacheMetrics,
1288    BlockType expectedBlockType, DataBlockEncoding expectedDataBlockEncoding) throws IOException {
1289    return readBlock(dataBlockOffset, onDiskBlockSize, cacheBlock, pread, isCompaction,
1290      updateCacheMetrics, expectedBlockType, expectedDataBlockEncoding, false);
1291  }
1292
1293  @Override
1294  public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize, final boolean cacheBlock,
1295    boolean pread, final boolean isCompaction, boolean updateCacheMetrics,
1296    BlockType expectedBlockType, DataBlockEncoding expectedDataBlockEncoding, boolean cacheOnly)
1297    throws IOException {
1298    if (dataBlockIndexReader == null) {
1299      throw new IOException(path + " block index not loaded");
1300    }
1301    long trailerOffset = trailer.getLoadOnOpenDataOffset();
1302    if (dataBlockOffset < 0 || dataBlockOffset >= trailerOffset) {
1303      throw new IOException("Requested block is out of range: " + dataBlockOffset
1304        + ", lastDataBlockOffset: " + trailer.getLastDataBlockOffset()
1305        + ", trailer.getLoadOnOpenDataOffset: " + trailerOffset + ", path=" + path);
1306    }
    // For any given block from any given file, synchronize reads for said block.
    // Without a cache, this synchronizing is needless overhead, but really the other choice is to
    // duplicate work (which the cache would prevent you from doing).
1312
1313    BlockCacheKey cacheKey =
1314      new BlockCacheKey(path, dataBlockOffset, this.isPrimaryReplicaReader(), expectedBlockType);
1315
1316    boolean useLock = false;
1317    IdLock.Entry lockEntry = null;
1318    final Span span = Span.current();
    // BlockCacheKey#toString() is quite expensive to call, so don't build the attributes unless
    // the span is actually recording.
1321    Attributes attributes = span.isRecording()
1322      ? Attributes.of(BLOCK_CACHE_KEY_KEY, cacheKey.toString())
1323      : Attributes.empty();
1324    try {
1325      while (true) {
1326        // Check cache for block. If found return.
1327        if (cacheConf.shouldReadBlockFromCache(expectedBlockType) && !cacheOnly) {
1328          if (useLock) {
1329            lockEntry = offsetLock.getLockEntry(dataBlockOffset);
1330          }
          // Try to get the block from the block cache. If useLock is true then this is the second
          // time through the loop and it should not be counted as a block cache miss.
1333          HFileBlock cachedBlock = getCachedBlock(cacheKey, cacheBlock, useLock, updateCacheMetrics,
1334            expectedBlockType, expectedDataBlockEncoding);
1335          if (cachedBlock != null) {
1336            if (LOG.isTraceEnabled()) {
1337              LOG.trace("Block for file {} is coming from Cache {}",
1338                Bytes.toString(cachedBlock.getHFileContext().getTableName()), cachedBlock);
1339            }
1340            span.addEvent("block cache hit", attributes);
1341            assert cachedBlock.isUnpacked() : "Packed block leak.";
1342            if (cachedBlock.getBlockType().isData()) {
1343              if (updateCacheMetrics) {
1344                HFile.DATABLOCK_READ_COUNT.increment();
1345              }
1346              // Validate encoding type for data blocks. We include encoding
1347              // type in the cache key, and we expect it to match on a cache hit.
1348              if (cachedBlock.getDataBlockEncoding() != dataBlockEncoder.getDataBlockEncoding()) {
1349                // Remember to release the block when in exceptional path.
1350                cacheConf.getBlockCache().ifPresent(cache -> {
1351                  returnAndEvictBlock(cache, cacheKey, cachedBlock);
1352                });
1353                throw new IOException("Cached block under key " + cacheKey + " "
1354                  + "has wrong encoding: " + cachedBlock.getDataBlockEncoding() + " (expected: "
1355                  + dataBlockEncoder.getDataBlockEncoding() + "), path=" + path);
1356              }
1357            }
1358            // Cache-hit. Return!
1359            return cachedBlock;
1360          }
1361
1362          if (!useLock && cacheBlock && cacheConf.shouldLockOnCacheMiss(expectedBlockType)) {
1363            // check cache again with lock
1364            useLock = true;
1365            continue;
1366          }
1367          // Carry on, please load.
1368        }
1369
1370        span.addEvent("block cache miss", attributes);
1371        // Load block from filesystem.
1372        HFileBlock hfileBlock = fsBlockReader.readBlockData(dataBlockOffset, onDiskBlockSize, pread,
1373          !isCompaction, shouldUseHeap(expectedBlockType, cacheBlock));
1374        try {
1375          validateBlockType(hfileBlock, expectedBlockType);
1376        } catch (IOException e) {
1377          hfileBlock.release();
1378          throw e;
1379        }
1380        BlockType.BlockCategory category = hfileBlock.getBlockType().getCategory();
1381        final boolean cacheCompressed = cacheConf.shouldCacheCompressed(category);
1382        final boolean cacheOnRead =
1383          cacheConf.shouldCacheBlockOnRead(category, getHFileInfo(), conf);
1384
        // We don't need the unpacked block back, and we're storing the block in the cache
        // compressed.
1386        if (cacheOnly && cacheCompressed && cacheOnRead) {
1387          HFileBlock blockNoChecksum = BlockCacheUtil.getBlockForCaching(cacheConf, hfileBlock);
1388          cacheConf.getBlockCache().ifPresent(cache -> {
1389            LOG.debug("Skipping decompression of block {} in prefetch", cacheKey);
1390            // Cache the block if necessary
1391            if (cacheBlock && cacheOnRead) {
1392              cache.cacheBlock(cacheKey, blockNoChecksum, cacheConf.isInMemory(), cacheOnly);
1393            }
1394          });
1395
1396          if (updateCacheMetrics && hfileBlock.getBlockType().isData()) {
1397            HFile.DATABLOCK_READ_COUNT.increment();
1398          }
1399          return blockNoChecksum;
1400        }
1401        HFileBlock unpacked = hfileBlock.unpack(hfileContext, fsBlockReader);
1402        HFileBlock unpackedNoChecksum = BlockCacheUtil.getBlockForCaching(cacheConf, unpacked);
1403        // Cache the block if necessary
1404        cacheConf.getBlockCache().ifPresent(cache -> {
1405          if (cacheBlock && cacheOnRead) {
            // cacheOnly is passed through to cacheBlock so the cache write can wait during
            // compaction and prefetching.
1407            cache.cacheBlock(cacheKey,
1408              cacheCompressed
1409                ? BlockCacheUtil.getBlockForCaching(cacheConf, hfileBlock)
1410                : unpackedNoChecksum,
1411              cacheConf.isInMemory(), cacheOnly);
1412          }
1413        });
1414        if (unpacked != hfileBlock) {
          // unpack() produced a new block, so the original hfileBlock's life ends here; release it.
1416          hfileBlock.release();
1417        }
1418        if (updateCacheMetrics && hfileBlock.getBlockType().isData()) {
1419          HFile.DATABLOCK_READ_COUNT.increment();
1420        }
1421
1422        return unpackedNoChecksum;
1423      }
1424    } finally {
1425      if (lockEntry != null) {
1426        offsetLock.releaseLockEntry(lockEntry);
1427      }
1428    }
1429  }
1430
1431  @Override
1432  public boolean hasMVCCInfo() {
1433    return fileInfo.shouldIncludeMemStoreTS() && fileInfo.isDecodeMemstoreTS();
1434  }
1435
1436  /**
1437   * Compares the actual type of a block retrieved from cache or disk with its expected type and
   * throws an exception in case of a mismatch. An expected block type of {@link BlockType#DATA} is
   * considered to match the actual block type {@link BlockType#ENCODED_DATA} as well.
1440   * @param block             a block retrieved from cache or disk
1441   * @param expectedBlockType the expected block type, or null to skip the check
1442   */
1443  private void validateBlockType(HFileBlock block, BlockType expectedBlockType) throws IOException {
1444    if (expectedBlockType == null) {
1445      return;
1446    }
1447    BlockType actualBlockType = block.getBlockType();
1448    if (expectedBlockType.isData() && actualBlockType.isData()) {
1449      // We consider DATA to match ENCODED_DATA for the purpose of this
1450      // verification.
1451      return;
1452    }
1453    if (actualBlockType != expectedBlockType) {
1454      throw new IOException("Expected block type " + expectedBlockType + ", " + "but got "
1455        + actualBlockType + ": " + block + ", path=" + path);
1456    }
1457  }
1458
1459  /**
   * @return Last key as a cell in the file. May be empty if the file has no entries. Note that
   *         this is not the last row key, but the Cell representation of the last key.
1462   */
1463  @Override
1464  public Optional<ExtendedCell> getLastKey() {
1465    return dataBlockIndexReader.isEmpty()
1466      ? Optional.empty()
1467      : Optional.of(fileInfo.getLastKeyCell());
1468  }
1469
1470  /**
   * @return Midkey for this file. We work with block boundaries only, so the returned midkey is
   *         only an approximation.
1473   */
1474  @Override
1475  public Optional<ExtendedCell> midKey() throws IOException {
1476    return Optional.ofNullable(dataBlockIndexReader.midkey(this));
1477  }
1478
1479  @Override
1480  public void close() throws IOException {
1481    close(cacheConf.shouldEvictOnClose());
1482  }
1483
1484  @Override
1485  public DataBlockEncoding getEffectiveEncodingInCache(boolean isCompaction) {
1486    return dataBlockEncoder.getEffectiveEncodingInCache(isCompaction);
1487  }
1488
1489  /** For testing */
1490  @Override
1491  public HFileBlock.FSReader getUncachedBlockReader() {
1492    return fsBlockReader;
1493  }
1494
1495  /**
1496   * Scanner that operates on encoded data blocks.
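   * <p>
   * Instances are normally obtained via
   * {@link HFileReaderImpl#getScanner(Configuration, boolean, boolean, boolean)}, which returns an
   * EncodedScanner whenever the file's data block encoding calls for an encoded seeker.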
1497   */
1498  protected static class EncodedScanner extends HFileScannerImpl {
1499    private final HFileBlockDecodingContext decodingCtx;
1500    private final DataBlockEncoder.EncodedSeeker seeker;
1501    private final DataBlockEncoder dataBlockEncoder;
1502
1503    public EncodedScanner(HFile.Reader reader, boolean cacheBlocks, boolean pread,
1504      boolean isCompaction, HFileContext meta, Configuration conf) {
1505      super(reader, cacheBlocks, pread, isCompaction);
1506      DataBlockEncoding encoding = reader.getDataBlockEncoding();
1507      dataBlockEncoder = encoding.getEncoder();
1508      decodingCtx = dataBlockEncoder.newDataBlockDecodingContext(conf, meta);
1509      seeker = dataBlockEncoder.createSeeker(decodingCtx);
1510    }
1511
1512    @Override
1513    public boolean isSeeked() {
1514      return curBlock != null;
1515    }
1516
1517    @Override
1518    public void setNonSeekedState() {
1519      reset();
1520    }
1521
    /**
     * Updates the current block to be the given {@link HFileBlock}. Seeks to the first key/value
     * pair.
     * @param newBlock the block to make current, as read by {@link HFileReaderImpl#readBlock}.
     *                 It is a brand-new block with a newly allocated {@link ByteBuff}, so if no
     *                 further reference to this block is kept, we should release it carefully.
     */
1529    @Override
1530    protected void updateCurrentBlock(HFileBlock newBlock) throws CorruptHFileException {
1531      try {
1532        // sanity checks
1533        if (newBlock.getBlockType() != BlockType.ENCODED_DATA) {
1534          throw new IllegalStateException("EncodedScanner works only on encoded data blocks");
1535        }
1536        short dataBlockEncoderId = newBlock.getDataBlockEncodingId();
1537        if (!DataBlockEncoding.isCorrectEncoder(dataBlockEncoder, dataBlockEncoderId)) {
1538          String encoderCls = dataBlockEncoder.getClass().getName();
1539          throw new CorruptHFileException(
1540            "Encoder " + encoderCls + " doesn't support data block encoding "
              + DataBlockEncoding.getNameFromId(dataBlockEncoderId) + ", path=" + reader.getPath());
1542        }
1543        updateCurrBlockRef(newBlock);
1544        ByteBuff encodedBuffer = getEncodedBuffer(newBlock);
1545        seeker.setCurrentBuffer(encodedBuffer);
1546      } finally {
1547        releaseIfNotCurBlock(newBlock);
1548      }
1549      // Reset the next indexed key
1550      this.nextIndexedKey = null;
1551    }
1552
1553    private ByteBuff getEncodedBuffer(HFileBlock newBlock) {
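      // An encoded block's buffer is laid out as [block header][2-byte encoding id][encoded KVs].
      // Skip past the header and the encoding id, then slice out just the encoded payload.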
1554      ByteBuff origBlock = newBlock.getBufferReadOnly();
1555      int pos = newBlock.headerSize() + DataBlockEncoding.ID_SIZE;
1556      origBlock.position(pos);
1557      origBlock
1558        .limit(pos + newBlock.getUncompressedSizeWithoutHeader() - DataBlockEncoding.ID_SIZE);
1559      return origBlock.slice();
1560    }
1561
1562    @Override
1563    protected boolean processFirstDataBlock() throws IOException {
1564      seeker.rewind();
1565      return true;
1566    }
1567
1568    @Override
1569    public boolean next() throws IOException {
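      // First try to advance within the current encoded block; if it is exhausted, move on to the
      // next data block, if any.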
1570      boolean isValid = seeker.next();
1571      if (!isValid) {
1572        HFileBlock newBlock = readNextDataBlock();
1573        isValid = newBlock != null;
1574        if (isValid) {
1575          updateCurrentBlock(newBlock);
1576        } else {
1577          setNonSeekedState();
1578        }
1579      }
1580      return isValid;
1581    }
1582
1583    @Override
1584    public ExtendedCell getKey() {
1585      assertValidSeek();
1586      return seeker.getKey();
1587    }
1588
1589    @Override
1590    public ByteBuffer getValue() {
1591      assertValidSeek();
1592      return seeker.getValueShallowCopy();
1593    }
1594
1595    @Override
1596    public ExtendedCell getCell() {
1597      if (this.curBlock == null) {
1598        return null;
1599      }
1600      return seeker.getCell();
1601    }
1602
1603    private void assertValidSeek() {
1604      if (this.curBlock == null) {
1605        throw new NotSeekedException(reader.getPath());
1606      }
1607    }
1608
1609    @Override
1610    protected ExtendedCell getFirstKeyCellInBlock(HFileBlock curBlock) {
1611      return dataBlockEncoder.getFirstKeyCellInBlock(getEncodedBuffer(curBlock));
1612    }
1613
1614    @Override
1615    protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, ExtendedCell nextIndexedKey,
1616      boolean rewind, ExtendedCell key, boolean seekBefore) throws IOException {
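      // Only decode a new block if we are actually moving to a different one; otherwise a rewind
      // of the seeker within the current block is enough.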
1617      if (this.curBlock == null || this.curBlock.getOffset() != seekToBlock.getOffset()) {
1618        updateCurrentBlock(seekToBlock);
1619      } else if (rewind) {
1620        seeker.rewind();
1621      }
1622      this.nextIndexedKey = nextIndexedKey;
1623      return seeker.seekToKeyInBlock(key, seekBefore);
1624    }
1625
1626    @Override
1627    public int compareKey(CellComparator comparator, ExtendedCell key) {
1628      return seeker.compareKey(comparator, key);
1629    }
1630  }
1631
1632  /**
1633   * Returns a buffer with the Bloom filter metadata. The caller takes ownership of the buffer.
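   * <p>
   * A minimal usage sketch (the {@code reader} name is illustrative only):
   *
   * <pre>
   * DataInput bloomMeta = reader.getGeneralBloomFilterMetadata();
   * if (bloomMeta != null) {
   *   // e.g. hand the stream to a bloom filter implementation to deserialize its metadata
   * }
   * </pre>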
1634   */
1635  @Override
1636  public DataInput getGeneralBloomFilterMetadata() throws IOException {
1637    return this.getBloomFilterMetadata(BlockType.GENERAL_BLOOM_META);
1638  }
1639
1640  @Override
1641  public DataInput getDeleteBloomFilterMetadata() throws IOException {
1642    return this.getBloomFilterMetadata(BlockType.DELETE_FAMILY_BLOOM_META);
1643  }
1644
1645  private DataInput getBloomFilterMetadata(BlockType blockType) throws IOException {
1646    if (
1647      blockType != BlockType.GENERAL_BLOOM_META && blockType != BlockType.DELETE_FAMILY_BLOOM_META
1648    ) {
1649      throw new RuntimeException(
1650        "Block Type: " + blockType.toString() + " is not supported, path=" + path);
1651    }
1652
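    // Bloom filter metadata lives in the load-on-open section, which was already read when the
    // file was opened, so just scan those blocks for the requested type.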
1653    for (HFileBlock b : fileInfo.getLoadOnOpenBlocks()) {
1654      if (b.getBlockType() == blockType) {
1655        return b.getByteStream();
1656      }
1657    }
1658    return null;
1659  }
1660
1661  public boolean isFileInfoLoaded() {
    return true; // We load the file info in the constructor in version 2.
1663  }
1664
1665  @Override
1666  public HFileContext getFileContext() {
1667    return hfileContext;
1668  }
1669
1670  /**
1671   * Returns false if block prefetching was requested for this file and has not completed, true
1672   * otherwise
1673   */
1674  @Override
1675  public boolean prefetchComplete() {
1676    return PrefetchExecutor.isCompleted(path);
1677  }
1678
1679  /**
   * Returns true if block prefetching was started after waiting for the specified delay, false
   * otherwise.
1682   */
1683  @Override
1684  public boolean prefetchStarted() {
1685    return PrefetchExecutor.isPrefetchStarted();
1686  }
1687
1688  /**
1689   * Create a Scanner on this file. No seeks or reads are done on creation. Call
   * {@link HFileScanner#seekTo(ExtendedCell)} to position and start the read. There is nothing to
   * clean up in a Scanner. Letting go of your references to the scanner is sufficient. NOTE: Do not
   * use this overload of getScanner for compactions. See
   * {@link #getScanner(Configuration, boolean, boolean, boolean)}.
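   * <p>
   * A minimal usage sketch (the {@code reader} name is illustrative only):
   *
   * <pre>
   * HFileScanner scanner = reader.getScanner(conf, true, false);
   * if (scanner.seekTo()) {
   *   do {
   *     ExtendedCell cell = scanner.getCell();
   *     // ... process the cell ...
   *   } while (scanner.next());
   * }
   * </pre>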
1694   * @param conf        Store configuration.
1695   * @param cacheBlocks True if we should cache blocks read in by this scanner.
1696   * @param pread       Use positional read rather than seek+read if true (pread is better for
   *                    random reads, seek+read is better for scanning).
1698   * @return Scanner on this file.
1699   */
1700  @Override
1701  public HFileScanner getScanner(Configuration conf, boolean cacheBlocks, final boolean pread) {
1702    return getScanner(conf, cacheBlocks, pread, false);
1703  }
1704
1705  /**
1706   * Create a Scanner on this file. No seeks or reads are done on creation. Call
   * {@link HFileScanner#seekTo(ExtendedCell)} to position and start the read. There is nothing to
1708   * clean up in a Scanner. Letting go of your references to the scanner is sufficient.
1709   * @param conf         Store configuration.
1710   * @param cacheBlocks  True if we should cache blocks read in by this scanner.
1711   * @param pread        Use positional read rather than seek+read if true (pread is better for
   *                     random reads, seek+read is better for scanning).
1713   * @param isCompaction is scanner being used for a compaction?
1714   * @return Scanner on this file.
1715   */
1716  @Override
1717  public HFileScanner getScanner(Configuration conf, boolean cacheBlocks, final boolean pread,
1718    final boolean isCompaction) {
1719    if (dataBlockEncoder.useEncodedScanner()) {
1720      return new EncodedScanner(this, cacheBlocks, pread, isCompaction, this.hfileContext, conf);
1721    }
1722    return new HFileScannerImpl(this, cacheBlocks, pread, isCompaction);
1723  }
1724
1725  public int getMajorVersion() {
1726    return 3;
1727  }
1728
1729  @Override
1730  public void unbufferStream() {
1731    fsBlockReader.unbufferStream();
1732  }
1733}