001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.io.hfile;
019
020import static org.apache.hadoop.hbase.trace.HBaseSemanticAttributes.BLOCK_CACHE_KEY_KEY;
021
022import io.opentelemetry.api.common.Attributes;
023import io.opentelemetry.api.trace.Span;
024import java.io.DataInput;
025import java.io.IOException;
026import java.nio.ByteBuffer;
027import java.util.ArrayList;
028import java.util.Optional;
029import java.util.function.IntConsumer;
030import org.apache.hadoop.conf.Configurable;
031import org.apache.hadoop.conf.Configuration;
032import org.apache.hadoop.fs.Path;
033import org.apache.hadoop.hbase.ByteBufferKeyOnlyKeyValue;
034import org.apache.hadoop.hbase.Cell;
035import org.apache.hadoop.hbase.CellComparator;
036import org.apache.hadoop.hbase.CellUtil;
037import org.apache.hadoop.hbase.ExtendedCell;
038import org.apache.hadoop.hbase.HBaseInterfaceAudience;
039import org.apache.hadoop.hbase.HConstants;
040import org.apache.hadoop.hbase.KeyValue;
041import org.apache.hadoop.hbase.PrivateCellUtil;
042import org.apache.hadoop.hbase.SizeCachedByteBufferKeyValue;
043import org.apache.hadoop.hbase.SizeCachedKeyValue;
044import org.apache.hadoop.hbase.SizeCachedNoTagsByteBufferKeyValue;
045import org.apache.hadoop.hbase.SizeCachedNoTagsKeyValue;
046import org.apache.hadoop.hbase.io.compress.Compression;
047import org.apache.hadoop.hbase.io.encoding.DataBlockEncoder;
048import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
049import org.apache.hadoop.hbase.io.encoding.HFileBlockDecodingContext;
050import org.apache.hadoop.hbase.monitoring.ThreadLocalServerSideScanMetrics;
051import org.apache.hadoop.hbase.nio.ByteBuff;
052import org.apache.hadoop.hbase.regionserver.KeyValueScanner;
053import org.apache.hadoop.hbase.util.ByteBufferUtils;
054import org.apache.hadoop.hbase.util.Bytes;
055import org.apache.hadoop.hbase.util.IdLock;
056import org.apache.hadoop.hbase.util.ObjectIntPair;
057import org.apache.hadoop.io.WritableUtils;
058import org.apache.yetus.audience.InterfaceAudience;
059import org.slf4j.Logger;
060import org.slf4j.LoggerFactory;
061
062/**
063 * Implementation that can handle all hfile versions of {@link HFile.Reader}.
064 */
065@InterfaceAudience.Private
066@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "URF_UNREAD_PUBLIC_OR_PROTECTED_FIELD")
067public abstract class HFileReaderImpl implements HFile.Reader, Configurable {
068  // This class is HFileReaderV3 + HFileReaderV2 + AbstractHFileReader all squashed together into
069  // one file. Ditto for all the HFileReader.ScannerV? implementations. I was running up against
070  // the MaxInlineLevel limit because too many tiers involved reading from an hfile. Was also hard
071  // to navigate the source code when so many classes participating in read.
072  private static final Logger LOG = LoggerFactory.getLogger(HFileReaderImpl.class);
073
074  /** Data block index reader keeping the root data index in memory */
075  protected HFileBlockIndex.CellBasedKeyBlockIndexReader dataBlockIndexReader;
076
077  /** Meta block index reader -- always single level */
078  protected HFileBlockIndex.ByteArrayKeyBlockIndexReader metaBlockIndexReader;
079
080  protected FixedFileTrailer trailer;
081
082  private final boolean primaryReplicaReader;
083
084  /**
085   * What kind of data block encoding should be used while reading, writing, and handling cache.
086   */
087  protected HFileDataBlockEncoder dataBlockEncoder = NoOpDataBlockEncoder.INSTANCE;
088
089  /** Block cache configuration. */
090  protected final CacheConfig cacheConf;
091
092  protected ReaderContext context;
093
094  protected final HFileInfo fileInfo;
095
096  /** Path of file */
097  protected final Path path;
098
099  /** File name to be used for block names */
100  protected final String name;
101
102  private Configuration conf;
103
104  protected HFileContext hfileContext;
105
106  /** Filesystem-level block reader. */
107  protected HFileBlock.FSReader fsBlockReader;
108
109  /**
110   * A "sparse lock" implementation allowing to lock on a particular block identified by offset. The
111   * purpose of this is to avoid two clients loading the same block, and have all but one client
112   * wait to get the block from the cache.
113   */
114  private IdLock offsetLock = new IdLock();
115
116  /** Minimum minor version supported by this HFile format */
117  static final int MIN_MINOR_VERSION = 0;
118
119  /** Maximum minor version supported by this HFile format */
120  // We went to version 2 when we moved to pb'ing fileinfo and the trailer on
121  // the file. This version can read Writables version 1.
122  static final int MAX_MINOR_VERSION = 3;
123
124  /** Minor versions starting with this number have faked index key */
125  static final int MINOR_VERSION_WITH_FAKED_KEY = 3;
126
127  /**
128   * Opens a HFile.
129   * @param context   Reader context info
130   * @param fileInfo  HFile info
131   * @param cacheConf Cache configuration.
132   * @param conf      Configuration
133   */
134  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "URF_UNREAD_PUBLIC_OR_PROTECTED_FIELD")
135  public HFileReaderImpl(ReaderContext context, HFileInfo fileInfo, CacheConfig cacheConf,
136    Configuration conf) throws IOException {
137    this.cacheConf = cacheConf;
138    this.context = context;
139    this.path = context.getFilePath();
140    this.name = path.getName();
141    this.conf = conf;
142    this.primaryReplicaReader = context.isPrimaryReplicaReader();
143    this.fileInfo = fileInfo;
144    this.trailer = fileInfo.getTrailer();
145    this.hfileContext = fileInfo.getHFileContext();
146    this.fsBlockReader =
147      new HFileBlock.FSReaderImpl(context, hfileContext, cacheConf.getByteBuffAllocator(), conf);
148    this.dataBlockEncoder = HFileDataBlockEncoderImpl.createFromFileInfo(fileInfo);
149    fsBlockReader.setDataBlockEncoder(dataBlockEncoder, conf);
150    dataBlockIndexReader = fileInfo.getDataBlockIndexReader();
151    metaBlockIndexReader = fileInfo.getMetaBlockIndexReader();
152  }
153
154  @SuppressWarnings("serial")
155  public static class BlockIndexNotLoadedException extends IllegalStateException {
156    public BlockIndexNotLoadedException(Path path) {
157      // Add a message in case anyone relies on it as opposed to class name.
158      super(path + " block index not loaded");
159    }
160  }
161
162  public CacheConfig getCacheConf() {
163    return cacheConf;
164  }
165
166  private Optional<String> toStringFirstKey() {
167    return getFirstKey().map(CellUtil::getCellKeyAsString);
168  }
169
170  private Optional<String> toStringLastKey() {
171    return getLastKey().map(CellUtil::getCellKeyAsString);
172  }
173
174  @Override
175  public String toString() {
176    return "reader=" + path.toString()
177      + (!isFileInfoLoaded()
178        ? ""
179        : ", compression=" + trailer.getCompressionCodec().getName() + ", cacheConf=" + cacheConf
180          + ", firstKey=" + toStringFirstKey() + ", lastKey=" + toStringLastKey())
181      + ", avgKeyLen=" + fileInfo.getAvgKeyLen() + ", avgValueLen=" + fileInfo.getAvgValueLen()
182      + ", entries=" + trailer.getEntryCount() + ", length=" + context.getFileSize();
183  }
184
185  @Override
186  public long length() {
187    return context.getFileSize();
188  }
189
190  /**
191   * @return the first key in the file. May be null if file has no entries. Note that this is not
192   *         the first row key, but rather the byte form of the first KeyValue.
193   */
194  @Override
195  public Optional<ExtendedCell> getFirstKey() {
196    if (dataBlockIndexReader == null) {
197      throw new BlockIndexNotLoadedException(path);
198    }
199    return dataBlockIndexReader.isEmpty()
200      ? Optional.empty()
201      : Optional.of(dataBlockIndexReader.getRootBlockKey(0));
202  }
203
204  /**
205   * TODO left from {@link HFile} version 1: move this to StoreFile after Ryan's patch goes in to
206   * eliminate {@link KeyValue} here.
207   * @return the first row key, or null if the file is empty.
208   */
209  @Override
210  public Optional<byte[]> getFirstRowKey() {
211    // We have to copy the row part to form the row key alone
212    return getFirstKey().map(CellUtil::cloneRow);
213  }
214
215  /**
216   * TODO left from {@link HFile} version 1: move this to StoreFile after Ryan's patch goes in to
217   * eliminate {@link KeyValue} here.
218   * @return the last row key, or null if the file is empty.
219   */
220  @Override
221  public Optional<byte[]> getLastRowKey() {
222    // We have to copy the row part to form the row key alone
223    return getLastKey().map(CellUtil::cloneRow);
224  }
225
226  /** Returns number of KV entries in this HFile */
227  @Override
228  public long getEntries() {
229    return trailer.getEntryCount();
230  }
231
232  /** Returns comparator */
233  @Override
234  public CellComparator getComparator() {
235    return this.hfileContext.getCellComparator();
236  }
237
238  public Compression.Algorithm getCompressionAlgorithm() {
239    return trailer.getCompressionCodec();
240  }
241
242  /**
243   * @return the total heap size of data and meta block indexes in bytes. Does not take into account
244   *         non-root blocks of a multilevel data index.
245   */
246  @Override
247  public long indexSize() {
248    return (dataBlockIndexReader != null ? dataBlockIndexReader.heapSize() : 0)
249      + ((metaBlockIndexReader != null) ? metaBlockIndexReader.heapSize() : 0);
250  }
251
252  @Override
253  public String getName() {
254    return name;
255  }
256
257  @Override
258  public void setDataBlockEncoder(HFileDataBlockEncoder dataBlockEncoder) {
259    this.dataBlockEncoder = dataBlockEncoder;
260    this.fsBlockReader.setDataBlockEncoder(dataBlockEncoder, conf);
261  }
262
263  @Override
264  public void setDataBlockIndexReader(HFileBlockIndex.CellBasedKeyBlockIndexReader reader) {
265    this.dataBlockIndexReader = reader;
266  }
267
268  @Override
269  public HFileBlockIndex.CellBasedKeyBlockIndexReader getDataBlockIndexReader() {
270    return dataBlockIndexReader;
271  }
272
273  @Override
274  public void setMetaBlockIndexReader(HFileBlockIndex.ByteArrayKeyBlockIndexReader reader) {
275    this.metaBlockIndexReader = reader;
276  }
277
278  @Override
279  public HFileBlockIndex.ByteArrayKeyBlockIndexReader getMetaBlockIndexReader() {
280    return metaBlockIndexReader;
281  }
282
283  @Override
284  public FixedFileTrailer getTrailer() {
285    return trailer;
286  }
287
288  @Override
289  public ReaderContext getContext() {
290    return this.context;
291  }
292
293  @Override
294  public HFileInfo getHFileInfo() {
295    return this.fileInfo;
296  }
297
298  @Override
299  public boolean isPrimaryReplicaReader() {
300    return primaryReplicaReader;
301  }
302
303  /**
304   * An exception thrown when an operation requiring a scanner to be seeked is invoked on a scanner
305   * that is not seeked.
306   */
307  @SuppressWarnings("serial")
308  public static class NotSeekedException extends IllegalStateException {
309    public NotSeekedException(Path path) {
310      super(path + " not seeked to a key/value");
311    }
312  }
313
314  public static class HFileScannerImpl implements HFileScanner {
315    private ByteBuff blockBuffer;
316    protected final boolean cacheBlocks;
317    protected final boolean pread;
318    protected final boolean isCompaction;
319    private int currKeyLen;
320    private int currValueLen;
321    private int currMemstoreTSLen;
322    private long currMemstoreTS;
323    protected final HFile.Reader reader;
324    private int currTagsLen;
325    private short rowLen;
326    // buffer backed keyonlyKV
327    private ByteBufferKeyOnlyKeyValue bufBackedKeyOnlyKv = new ByteBufferKeyOnlyKeyValue();
    // A pair reused in blockSeek() so that we don't create a lot of garbage objects
329    final ObjectIntPair<ByteBuffer> pair = new ObjectIntPair<>();
330
331    /**
332     * The next indexed key is to keep track of the indexed key of the next data block. If the
333     * nextIndexedKey is HConstants.NO_NEXT_INDEXED_KEY, it means that the current data block is the
334     * last data block. If the nextIndexedKey is null, it means the nextIndexedKey has not been
335     * loaded yet.
336     */
337    protected ExtendedCell nextIndexedKey;
338
    // Current block being used. NOTICE: Don't release curBlock separately except in the shipped()
    // or close() methods. Because shipped() or close() will do the release in the end, even if an
    // exception occurs the curBlock will be released by the close() method (see
    // RegionScannerImpl#handleException). Please call releaseIfNotCurBlock() to release an
    // unreferenced block.
344    protected HFileBlock curBlock;
345    // Whether we returned a result for curBlock's size in recordBlockSize().
346    // gets reset whenever curBlock is changed.
347    private boolean providedCurrentBlockSize = false;
348
349    public HFileBlock getCurBlock() {
350      return curBlock;
351    }
352
353    // Previous blocks that were used in the course of the read
354    protected final ArrayList<HFileBlock> prevBlocks = new ArrayList<>();
355
356    public HFileScannerImpl(final HFile.Reader reader, final boolean cacheBlocks,
357      final boolean pread, final boolean isCompaction) {
358      this.reader = reader;
359      this.cacheBlocks = cacheBlocks;
360      this.pread = pread;
361      this.isCompaction = isCompaction;
362    }
363
364    void updateCurrBlockRef(HFileBlock block) {
365      if (block != null && curBlock != null && block.getOffset() == curBlock.getOffset()) {
366        return;
367      }
368      if (this.curBlock != null && this.curBlock.isSharedMem()) {
369        prevBlocks.add(this.curBlock);
370      }
371      this.curBlock = block;
372      this.providedCurrentBlockSize = false;
373    }
374
375    void reset() {
376      // We don't have to keep ref to heap block
377      if (this.curBlock != null && this.curBlock.isSharedMem()) {
378        this.prevBlocks.add(this.curBlock);
379      }
380      this.curBlock = null;
381    }
382
383    private void returnBlocks(boolean returnAll) {
384      this.prevBlocks.forEach(HFileBlock::release);
385      this.prevBlocks.clear();
386      if (returnAll && this.curBlock != null) {
387        this.curBlock.release();
388        this.curBlock = null;
389      }
390    }
391
392    @Override
393    public boolean isSeeked() {
394      return blockBuffer != null;
395    }
396
397    @Override
398    public String toString() {
399      return "HFileScanner for reader " + String.valueOf(getReader());
400    }
401
402    protected void assertSeeked() {
403      if (!isSeeked()) {
404        throw new NotSeekedException(reader.getPath());
405      }
406    }
407
408    @Override
409    public HFile.Reader getReader() {
410      return reader;
411    }
412
413    // From non encoded HFiles, we always read back KeyValue or its descendant.(Note: When HFile
414    // block is in DBB, it will be OffheapKV). So all parts of the Cell is in a contiguous
415    // array/buffer. How many bytes we should wrap to make the KV is what this method returns.
416    private int getKVBufSize() {
417      int kvBufSize = KEY_VALUE_LEN_SIZE + currKeyLen + currValueLen;
418      if (currTagsLen > 0) {
419        kvBufSize += Bytes.SIZEOF_SHORT + currTagsLen;
420      }
421      return kvBufSize;
422    }
423
424    @Override
425    public void close() {
426      if (!pread) {
427        // For seek + pread stream socket should be closed when the scanner is closed. HBASE-9393
428        reader.unbufferStream();
429      }
430      this.returnBlocks(true);
431    }
432
433    @Override
434    public void recordBlockSize(IntConsumer blockSizeConsumer) {
435      if (!providedCurrentBlockSize && curBlock != null) {
436        providedCurrentBlockSize = true;
437        blockSizeConsumer.accept(curBlock.getUncompressedSizeWithoutHeader());
438      }
439    }
440
441    // Returns the #bytes in HFile for the current cell. Used to skip these many bytes in current
442    // HFile block's buffer so as to position to the next cell.
443    private int getCurCellSerializedSize() {
444      int curCellSize = KEY_VALUE_LEN_SIZE + currKeyLen + currValueLen + currMemstoreTSLen;
445      if (this.reader.getFileContext().isIncludesTags()) {
446        curCellSize += Bytes.SIZEOF_SHORT + currTagsLen;
447      }
448      return curCellSize;
449    }
450
    /**
     * Decodes the length fields of the cell at the current position of {@code blockBuffer} into
     * {@code currKeyLen}, {@code currValueLen}, {@code rowLen} and, when the file context includes
     * tags, {@code currTagsLen}; then hands over to {@link #readMvccVersion(int)} for the mvcc
     * part. All reads are addressed as offsets from the buffer's current position.
     */
    protected void readKeyValueLen() {
      // This is a hot method. We go out of our way to make this method short so it can be
      // inlined and is not too big to compile. We also manage position in ByteBuffer ourselves
      // because it is faster than going via range-checked ByteBuffer methods or going through a
      // byte buffer array a byte at a time.
      // Get a long at a time rather than read two individual ints. In micro-benchmarking, even
      // with the extra bit-fiddling, this is order-of-magnitude faster than getting two ints.
      // Trying to imitate what was done - need to profile if this is better or
      // earlier way is better by doing mark and reset?
      // But ensure that you read long instead of two ints
      long ll = blockBuffer.getLongAfterPosition(0);
      // Read top half as an int of key length and bottom int as value length
      this.currKeyLen = (int) (ll >> Integer.SIZE);
      this.currValueLen = (int) (Bytes.MASK_FOR_LOWER_INT_IN_LONG ^ ll);
      checkKeyValueLen();
      // The row length is the short that immediately follows the two length ints.
      this.rowLen = blockBuffer.getShortAfterPosition(Bytes.SIZEOF_LONG);
      // Offset (from the current position) just past the key/value lengths, the key and the value
      int p = (Bytes.SIZEOF_LONG + currKeyLen + currValueLen);
      if (reader.getFileContext().isIncludesTags()) {
        // Tags length is a short.
        this.currTagsLen = blockBuffer.getShortAfterPosition(p);
        checkTagsLen();
        p += (Bytes.SIZEOF_SHORT + currTagsLen);
      }
      // p now points at where the mvcc version, if any, is read from.
      readMvccVersion(p);
    }
477
478    private final void checkTagsLen() {
479      if (checkLen(this.currTagsLen)) {
480        throw new IllegalStateException(
481          "Invalid currTagsLen " + this.currTagsLen + ". Block offset: " + curBlock.getOffset()
482            + ", block length: " + this.blockBuffer.limit() + ", position: "
483            + this.blockBuffer.position() + " (without header)." + " path=" + reader.getPath());
484      }
485    }
486
487    /**
488     * Read mvcc. Does checks to see if we even need to read the mvcc at all.
489     */
490    protected void readMvccVersion(final int offsetFromPos) {
491      // See if we even need to decode mvcc.
492      if (!this.reader.getHFileInfo().shouldIncludeMemStoreTS()) {
493        return;
494      }
495      if (!this.reader.getHFileInfo().isDecodeMemstoreTS()) {
496        currMemstoreTS = 0;
497        currMemstoreTSLen = 1;
498        return;
499      }
500      _readMvccVersion(offsetFromPos);
501    }
502
    /**
     * Actually do the mvcc read. Does no checks. Decodes a variable-length long starting at
     * {@code offsetFromPos} and stores the value in {@code currMemstoreTS} and its encoded length
     * in {@code currMemstoreTSLen}.
     * @param offsetFromPos offset, relative to the buffer's current position, of the first byte of
     *                      the encoded value
     */
    private void _readMvccVersion(int offsetFromPos) {
      // This is Bytes#bytesToVint inlined so can save a few instructions in this hot method; i.e.
      // previous if one-byte vint, we'd redo the vint call to find int size.
      // Also the method is kept small so can be inlined.
      byte firstByte = blockBuffer.getByteAfterPosition(offsetFromPos);
      int len = WritableUtils.decodeVIntSize(firstByte);
      if (len == 1) {
        // Single-byte encoding: the first byte is the value itself.
        this.currMemstoreTS = firstByte;
      } else {
        // Multi-byte encoding: len - 1 payload bytes follow the first byte. They are accumulated
        // most-significant first, taking an int, then a short, then single bytes as available.
        int remaining = len - 1;
        long i = 0;
        offsetFromPos++;
        if (remaining >= Bytes.SIZEOF_INT) {
          // The int read has to be converted to unsigned long so the & op
          i = (blockBuffer.getIntAfterPosition(offsetFromPos) & 0x00000000ffffffffL);
          remaining -= Bytes.SIZEOF_INT;
          offsetFromPos += Bytes.SIZEOF_INT;
        }
        if (remaining >= Bytes.SIZEOF_SHORT) {
          short s = blockBuffer.getShortAfterPosition(offsetFromPos);
          i = i << 16;
          // Mask so the short is treated as unsigned when merged in.
          i = i | (s & 0xFFFF);
          remaining -= Bytes.SIZEOF_SHORT;
          offsetFromPos += Bytes.SIZEOF_SHORT;
        }
        for (int idx = 0; idx < remaining; idx++) {
          byte b = blockBuffer.getByteAfterPosition(offsetFromPos + idx);
          i = i << 8;
          // Mask so the byte is treated as unsigned when merged in.
          i = i | (b & 0xFF);
        }
        // When the first byte marks a negative vint, the accumulated bits are complemented.
        currMemstoreTS = (WritableUtils.isNegativeVInt(firstByte) ? ~i : i);
      }
      this.currMemstoreTSLen = len;
    }
540
    /**
     * Within a loaded block, seek looking for the last key that is smaller than (or equal to?) the
     * key we are interested in. A note on the seekBefore: if you have seekBefore = true, AND the
     * first key in the block = key, then you'll get thrown exceptions. The caller has to check for
     * that case and load the previous block as appropriate.
     * @param key        the key to find
     * @param seekBefore find the key before the given key in case of exact match
     * @return 0 in case of an exact key match, 1 in case of an inexact match,
     *         {@code HConstants.INDEX_KEY_MAGIC} (documented by the original author as -2) in case
     *         of an inexact match and furthermore, the input key less than the first key of
     *         current block (e.g. using a faked index key)
     * @throws IllegalStateException if a key, value or tags length read from the block fails its
     *                               check, or if seekBefore is requested and the very first key of
     *                               the block is an exact match
     */
    protected int blockSeek(Cell key, boolean seekBefore) {
      int klen, vlen, tlen = 0;
      // Serialized size of the previously visited cell; -1 until one cell has been skipped.
      int lastKeyValueSize = -1;
      int offsetFromPos;
      do {
        offsetFromPos = 0;
        // Better to ensure that we use the BB Utils here
        long ll = blockBuffer.getLongAfterPosition(offsetFromPos);
        // Top int of the long is the key length, bottom int is the value length.
        klen = (int) (ll >> Integer.SIZE);
        vlen = (int) (Bytes.MASK_FOR_LOWER_INT_IN_LONG ^ ll);
        if (checkKeyLen(klen) || checkLen(vlen)) {
          throw new IllegalStateException(
            "Invalid klen " + klen + " or vlen " + vlen + ". Block offset: " + curBlock.getOffset()
              + ", block length: " + blockBuffer.limit() + ", position: " + blockBuffer.position()
              + " (without header)." + " path=" + reader.getPath());
        }
        offsetFromPos += Bytes.SIZEOF_LONG;
        this.rowLen = blockBuffer.getShortAfterPosition(offsetFromPos);
        // Point the reusable key-only cell at the key bytes of the cell under the cursor.
        blockBuffer.asSubByteBuffer(blockBuffer.position() + offsetFromPos, klen, pair);
        bufBackedKeyOnlyKv.setKey(pair.getFirst(), pair.getSecond(), klen, rowLen);
        int comp =
          PrivateCellUtil.compareKeyIgnoresMvcc(reader.getComparator(), key, bufBackedKeyOnlyKv);
        offsetFromPos += klen + vlen;
        if (this.reader.getFileContext().isIncludesTags()) {
          // Read short as unsigned, high byte first
          tlen = ((blockBuffer.getByteAfterPosition(offsetFromPos) & 0xff) << 8)
            ^ (blockBuffer.getByteAfterPosition(offsetFromPos + 1) & 0xff);
          if (checkLen(tlen)) {
            throw new IllegalStateException("Invalid tlen " + tlen + ". Block offset: "
              + curBlock.getOffset() + ", block length: " + blockBuffer.limit() + ", position: "
              + blockBuffer.position() + " (without header)." + " path=" + reader.getPath());
          }
          // add the two bytes read for the tags.
          offsetFromPos += tlen + (Bytes.SIZEOF_SHORT);
        }
        if (this.reader.getHFileInfo().shouldIncludeMemStoreTS()) {
          // Directly read the mvcc based on current position
          readMvccVersion(offsetFromPos);
        }
        if (comp == 0) {
          if (seekBefore) {
            if (lastKeyValueSize < 0) {
              throw new IllegalStateException("blockSeek with seekBefore "
                + "at the first key of the block: key=" + CellUtil.getCellKeyAsString(key)
                + ", blockOffset=" + curBlock.getOffset() + ", onDiskSize="
                + curBlock.getOnDiskSizeWithHeader() + ", path=" + reader.getPath());
            }
            // Step back onto the previous cell and reload its lengths.
            blockBuffer.moveBack(lastKeyValueSize);
            readKeyValueLen();
            return 1; // non exact match.
          }
          currKeyLen = klen;
          currValueLen = vlen;
          currTagsLen = tlen;
          return 0; // indicate exact match
        } else if (comp < 0) {
          // The sought key sorts before the cell under the cursor: settle on the previous cell if
          // there is one, otherwise stay on the current (first visited) cell.
          if (lastKeyValueSize > 0) {
            blockBuffer.moveBack(lastKeyValueSize);
          }
          readKeyValueLen();
          if (lastKeyValueSize == -1 && blockBuffer.position() == 0) {
            // No cell was skipped and the buffer is at position 0.
            return HConstants.INDEX_KEY_MAGIC;
          }
          return 1;
        }
        // The size of this key/value tuple, including key/value length fields.
        lastKeyValueSize = klen + vlen + currMemstoreTSLen + KEY_VALUE_LEN_SIZE;
        // include tag length also if tags included with KV
        if (reader.getFileContext().isIncludesTags()) {
          lastKeyValueSize += tlen + Bytes.SIZEOF_SHORT;
        }
        blockBuffer.skip(lastKeyValueSize);
      } while (blockBuffer.hasRemaining());

      // Seek to the last key we successfully read. This will happen if this is
      // the last key/value pair in the file, in which case the following call
      // to next() has to return false.
      blockBuffer.moveBack(lastKeyValueSize);
      readKeyValueLen();
      return 1; // didn't exactly find it.
    }
632
633    @Override
634    public ExtendedCell getNextIndexedKey() {
635      return nextIndexedKey;
636    }
637
638    @Override
639    public int seekTo(ExtendedCell key) throws IOException {
640      return seekTo(key, true);
641    }
642
643    @Override
644    public int reseekTo(ExtendedCell key) throws IOException {
645      int compared;
646      if (isSeeked()) {
647        compared = compareKey(reader.getComparator(), key);
648        if (compared < 1) {
649          // If the required key is less than or equal to current key, then
650          // don't do anything.
651          return compared;
652        } else {
653          // The comparison with no_next_index_key has to be checked
654          if (
655            this.nextIndexedKey != null && (this.nextIndexedKey
656                == KeyValueScanner.NO_NEXT_INDEXED_KEY
657              || PrivateCellUtil.compareKeyIgnoresMvcc(reader.getComparator(), key, nextIndexedKey)
658                  < 0)
659          ) {
660            // The reader shall continue to scan the current data block instead
661            // of querying the
662            // block index as long as it knows the target key is strictly
663            // smaller than
664            // the next indexed key or the current data block is the last data
665            // block.
666            return loadBlockAndSeekToKey(this.curBlock, nextIndexedKey, false, key, false);
667          }
668        }
669      }
670      // Don't rewind on a reseek operation, because reseek implies that we are
671      // always going forward in the file.
672      return seekTo(key, false);
673    }
674
675    /**
676     * An internal API function. Seek to the given key, optionally rewinding to the first key of the
677     * block before doing the seek.
678     * @param key    - a cell representing the key that we need to fetch
679     * @param rewind whether to rewind to the first key of the block before doing the seek. If this
680     *               is false, we are assuming we never go back, otherwise the result is undefined.
681     * @return -1 if the key is earlier than the first key of the file, 0 if we are at the given
682     *         key, 1 if we are past the given key -2 if the key is earlier than the first key of
683     *         the file while using a faked index key
684     */
685    public int seekTo(ExtendedCell key, boolean rewind) throws IOException {
686      HFileBlockIndex.BlockIndexReader indexReader = reader.getDataBlockIndexReader();
687      BlockWithScanInfo blockWithScanInfo = indexReader.loadDataBlockWithScanInfo(key, curBlock,
688        cacheBlocks, pread, isCompaction, getEffectiveDataBlockEncoding(), reader);
689      if (blockWithScanInfo == null || blockWithScanInfo.getHFileBlock() == null) {
690        // This happens if the key e.g. falls before the beginning of the file.
691        return -1;
692      }
693      return loadBlockAndSeekToKey(blockWithScanInfo.getHFileBlock(),
694        blockWithScanInfo.getNextIndexedKey(), rewind, key, false);
695    }
696
697    @Override
698    public boolean seekBefore(ExtendedCell key) throws IOException {
699      HFileBlock seekToBlock = reader.getDataBlockIndexReader().seekToDataBlock(key, curBlock,
700        cacheBlocks, pread, isCompaction, reader.getEffectiveEncodingInCache(isCompaction), reader);
701      if (seekToBlock == null) {
702        return false;
703      }
704      ExtendedCell firstKey = getFirstKeyCellInBlock(seekToBlock);
705      if (PrivateCellUtil.compareKeyIgnoresMvcc(reader.getComparator(), firstKey, key) >= 0) {
706        long previousBlockOffset = seekToBlock.getPrevBlockOffset();
707        // The key we are interested in
708        if (previousBlockOffset == -1) {
709          // we have a 'problem', the key we want is the first of the file.
710          releaseIfNotCurBlock(seekToBlock);
711          return false;
712        }
713
714        // The first key in the current block 'seekToBlock' is greater than the given
715        // seekBefore key. We will go ahead by reading the next block that satisfies the
716        // given key. Return the current block before reading the next one.
717        releaseIfNotCurBlock(seekToBlock);
718        // It is important that we compute and pass onDiskSize to the block
719        // reader so that it does not have to read the header separately to
720        // figure out the size. Currently, we do not have a way to do this
721        // correctly in the general case however.
722        // TODO: See https://issues.apache.org/jira/browse/HBASE-14576
723        int prevBlockSize = -1;
724        seekToBlock = reader.readBlock(previousBlockOffset, prevBlockSize, cacheBlocks, pread,
725          isCompaction, true, BlockType.DATA, getEffectiveDataBlockEncoding());
726        // TODO shortcut: seek forward in this block to the last key of the
727        // block.
728      }
729      loadBlockAndSeekToKey(seekToBlock, firstKey, true, key, true);
730      return true;
731    }
732
733    /**
734     * The curBlock will be released by shipping or close method, so only need to consider releasing
735     * the block, which was read from HFile before and not referenced by curBlock.
736     */
737    protected void releaseIfNotCurBlock(HFileBlock block) {
738      if (curBlock != block) {
739        block.release();
740      }
741    }
742
743    /**
744     * Scans blocks in the "scanned" section of the {@link HFile} until the next data block is
745     * found.
746     * @return the next block, or null if there are no more data blocks
747     */
748    @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NP_NULL_ON_SOME_PATH",
749        justification = "Yeah, unnecessary null check; could do w/ clean up")
750    protected HFileBlock readNextDataBlock() throws IOException {
751      long lastDataBlockOffset = reader.getTrailer().getLastDataBlockOffset();
752      if (curBlock == null) {
753        return null;
754      }
755      HFileBlock block = this.curBlock;
756      do {
757        if (block.getOffset() >= lastDataBlockOffset) {
758          releaseIfNotCurBlock(block);
759          return null;
760        }
761        if (block.getOffset() < 0) {
762          releaseIfNotCurBlock(block);
763          throw new IOException("Invalid block offset: " + block + ", path=" + reader.getPath());
764        }
765        // We are reading the next block without block type validation, because
766        // it might turn out to be a non-data block.
767        block = reader.readBlock(block.getOffset() + block.getOnDiskSizeWithHeader(),
768          block.getNextBlockOnDiskSize(), cacheBlocks, pread, isCompaction, true, null,
769          getEffectiveDataBlockEncoding());
770        if (block != null && !block.getBlockType().isData()) {
771          // Whatever block we read we will be returning it unless
772          // it is a datablock. Just in case the blocks are non data blocks
773          block.release();
774        }
775      } while (!block.getBlockType().isData());
776      return block;
777    }
778
779    public DataBlockEncoding getEffectiveDataBlockEncoding() {
780      return this.reader.getEffectiveEncodingInCache(isCompaction);
781    }
782
783    @Override
784    public ExtendedCell getCell() {
785      if (!isSeeked()) {
786        return null;
787      }
788
789      ExtendedCell ret;
790      int cellBufSize = getKVBufSize();
791      long seqId = 0L;
792      if (this.reader.getHFileInfo().shouldIncludeMemStoreTS()) {
793        seqId = currMemstoreTS;
794      }
795      if (blockBuffer.hasArray()) {
796        // TODO : reduce the varieties of KV here. Check if based on a boolean
797        // we can handle the 'no tags' case.
798        if (currTagsLen > 0) {
799          ret = new SizeCachedKeyValue(blockBuffer.array(),
800            blockBuffer.arrayOffset() + blockBuffer.position(), cellBufSize, seqId, currKeyLen,
801            rowLen);
802        } else {
803          ret = new SizeCachedNoTagsKeyValue(blockBuffer.array(),
804            blockBuffer.arrayOffset() + blockBuffer.position(), cellBufSize, seqId, currKeyLen,
805            rowLen);
806        }
807      } else {
808        ByteBuffer buf = blockBuffer.asSubByteBuffer(cellBufSize);
809        if (buf.isDirect()) {
810          ret = currTagsLen > 0
811            ? new SizeCachedByteBufferKeyValue(buf, buf.position(), cellBufSize, seqId, currKeyLen,
812              rowLen)
813            : new SizeCachedNoTagsByteBufferKeyValue(buf, buf.position(), cellBufSize, seqId,
814              currKeyLen, rowLen);
815        } else {
816          if (currTagsLen > 0) {
817            ret = new SizeCachedKeyValue(buf.array(), buf.arrayOffset() + buf.position(),
818              cellBufSize, seqId, currKeyLen, rowLen);
819          } else {
820            ret = new SizeCachedNoTagsKeyValue(buf.array(), buf.arrayOffset() + buf.position(),
821              cellBufSize, seqId, currKeyLen, rowLen);
822          }
823        }
824      }
825      return ret;
826    }
827
828    @Override
829    public ExtendedCell getKey() {
830      assertSeeked();
831      // Create a new object so that this getKey is cached as firstKey, lastKey
832      ObjectIntPair<ByteBuffer> keyPair = new ObjectIntPair<>();
833      blockBuffer.asSubByteBuffer(blockBuffer.position() + KEY_VALUE_LEN_SIZE, currKeyLen, keyPair);
834      ByteBuffer keyBuf = keyPair.getFirst();
835      if (keyBuf.hasArray()) {
836        return new KeyValue.KeyOnlyKeyValue(keyBuf.array(),
837          keyBuf.arrayOffset() + keyPair.getSecond(), currKeyLen);
838      } else {
839        // Better to do a copy here instead of holding on to this BB so that
840        // we could release the blocks referring to this key. This key is specifically used
841        // in HalfStoreFileReader to get the firstkey and lastkey by creating a new scanner
842        // every time. So holding onto the BB (incase of DBB) is not advised here.
843        byte[] key = new byte[currKeyLen];
844        ByteBufferUtils.copyFromBufferToArray(key, keyBuf, keyPair.getSecond(), 0, currKeyLen);
845        return new KeyValue.KeyOnlyKeyValue(key, 0, currKeyLen);
846      }
847    }
848
849    @Override
850    public ByteBuffer getValue() {
851      assertSeeked();
852      // Okie to create new Pair. Not used in hot path
853      ObjectIntPair<ByteBuffer> valuePair = new ObjectIntPair<>();
854      this.blockBuffer.asSubByteBuffer(blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen,
855        currValueLen, valuePair);
856      ByteBuffer valBuf = valuePair.getFirst().duplicate();
857      valBuf.position(valuePair.getSecond());
858      valBuf.limit(currValueLen + valuePair.getSecond());
859      return valBuf.slice();
860    }
861
862    protected void setNonSeekedState() {
863      reset();
864      blockBuffer = null;
865      currKeyLen = 0;
866      currValueLen = 0;
867      currMemstoreTS = 0;
868      currMemstoreTSLen = 0;
869      currTagsLen = 0;
870    }
871
872    /**
873     * Set the position on current backing blockBuffer.
874     */
875    private void positionThisBlockBuffer() {
876      try {
877        blockBuffer.skip(getCurCellSerializedSize());
878      } catch (IllegalArgumentException e) {
879        LOG.error("Current pos = " + blockBuffer.position() + "; currKeyLen = " + currKeyLen
880          + "; currValLen = " + currValueLen + "; block limit = " + blockBuffer.limit()
881          + "; currBlock currBlockOffset = " + this.curBlock.getOffset() + "; path="
882          + reader.getPath());
883        throw e;
884      }
885    }
886
887    /**
888     * Set our selves up for the next 'next' invocation, set up next block.
889     * @return True is more to read else false if at the end.
890     */
891    private boolean positionForNextBlock() throws IOException {
892      // Methods are small so they get inlined because they are 'hot'.
893      long lastDataBlockOffset = reader.getTrailer().getLastDataBlockOffset();
894      if (this.curBlock.getOffset() >= lastDataBlockOffset) {
895        setNonSeekedState();
896        return false;
897      }
898      return isNextBlock();
899    }
900
901    private boolean isNextBlock() throws IOException {
902      // Methods are small so they get inlined because they are 'hot'.
903      HFileBlock nextBlock = readNextDataBlock();
904      if (nextBlock == null) {
905        setNonSeekedState();
906        return false;
907      }
908      updateCurrentBlock(nextBlock);
909      return true;
910    }
911
912    private final boolean _next() throws IOException {
913      // Small method so can be inlined. It is a hot one.
914      if (blockBuffer.remaining() <= 0) {
915        return positionForNextBlock();
916      }
917
918      // We are still in the same block.
919      readKeyValueLen();
920      return true;
921    }
922
923    /**
924     * Go to the next key/value in the block section. Loads the next block if necessary. If
925     * successful, {@link #getKey()} and {@link #getValue()} can be called.
926     * @return true if successfully navigated to the next key/value
927     */
928    @Override
929    public boolean next() throws IOException {
930      // This is a hot method so extreme measures taken to ensure it is small and inlineable.
931      // Checked by setting: -XX:+UnlockDiagnosticVMOptions -XX:+PrintInlining -XX:+PrintCompilation
932      assertSeeked();
933      positionThisBlockBuffer();
934      return _next();
935    }
936
937    /**
938     * Positions this scanner at the start of the file.
939     * @return false if empty file; i.e. a call to next would return false and the current key and
940     *         value are undefined.
941     */
942    @Override
943    public boolean seekTo() throws IOException {
944      if (reader == null) {
945        return false;
946      }
947
948      if (reader.getTrailer().getEntryCount() == 0) {
949        // No data blocks.
950        return false;
951      }
952
953      long firstDataBlockOffset = reader.getTrailer().getFirstDataBlockOffset();
954      if (curBlock != null && curBlock.getOffset() == firstDataBlockOffset) {
955        return processFirstDataBlock();
956      }
957
958      readAndUpdateNewBlock(firstDataBlockOffset);
959      return true;
960    }
961
962    protected boolean processFirstDataBlock() throws IOException {
963      blockBuffer.rewind();
964      readKeyValueLen();
965      return true;
966    }
967
968    protected void readAndUpdateNewBlock(long firstDataBlockOffset) throws IOException {
969      HFileBlock newBlock = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
970        isCompaction, true, BlockType.DATA, getEffectiveDataBlockEncoding());
971      if (newBlock.getOffset() < 0) {
972        releaseIfNotCurBlock(newBlock);
973        throw new IOException(
974          "Invalid offset=" + newBlock.getOffset() + ", path=" + reader.getPath());
975      }
976      updateCurrentBlock(newBlock);
977    }
978
979    protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, ExtendedCell nextIndexedKey,
980      boolean rewind, ExtendedCell key, boolean seekBefore) throws IOException {
981      if (this.curBlock == null || this.curBlock.getOffset() != seekToBlock.getOffset()) {
982        updateCurrentBlock(seekToBlock);
983      } else if (rewind) {
984        blockBuffer.rewind();
985      }
986      // Update the nextIndexedKey
987      this.nextIndexedKey = nextIndexedKey;
988      return blockSeek(key, seekBefore);
989    }
990
991    /** Returns True if v &lt;= 0 or v &gt; current block buffer limit. */
992    protected final boolean checkKeyLen(final int v) {
993      return v <= 0 || v > this.blockBuffer.limit();
994    }
995
996    /** Returns True if v &lt; 0 or v &gt; current block buffer limit. */
997    protected final boolean checkLen(final int v) {
998      return v < 0 || v > this.blockBuffer.limit();
999    }
1000
1001    /**
1002     * Check key and value lengths are wholesome.
1003     */
1004    protected final void checkKeyValueLen() {
1005      if (checkKeyLen(this.currKeyLen) || checkLen(this.currValueLen)) {
1006        throw new IllegalStateException("Invalid currKeyLen " + this.currKeyLen
1007          + " or currValueLen " + this.currValueLen + ". Block offset: " + this.curBlock.getOffset()
1008          + ", block length: " + this.blockBuffer.limit() + ", position: "
1009          + this.blockBuffer.position() + " (without header)." + ", path=" + reader.getPath());
1010      }
1011    }
1012
1013    /**
1014     * Updates the current block to be the given {@link HFileBlock}. Seeks to the the first
1015     * key/value pair.
1016     * @param newBlock the block read by {@link HFileReaderImpl#readBlock}, it's a totally new block
1017     *                 with new allocated {@link ByteBuff}, so if no further reference to this
1018     *                 block, we should release it carefully.
1019     */
1020    protected void updateCurrentBlock(HFileBlock newBlock) throws IOException {
1021      try {
1022        if (newBlock.getBlockType() != BlockType.DATA) {
1023          throw new IllegalStateException(
1024            "ScannerV2 works only on data blocks, got " + newBlock.getBlockType() + "; "
1025              + "HFileName=" + reader.getPath() + ", " + "dataBlockEncoder="
1026              + reader.getDataBlockEncoding() + ", " + "isCompaction=" + isCompaction);
1027        }
1028        updateCurrBlockRef(newBlock);
1029        blockBuffer = newBlock.getBufferWithoutHeader();
1030        readKeyValueLen();
1031      } finally {
1032        releaseIfNotCurBlock(newBlock);
1033      }
1034      // Reset the next indexed key
1035      this.nextIndexedKey = null;
1036    }
1037
1038    protected ExtendedCell getFirstKeyCellInBlock(HFileBlock curBlock) {
1039      ByteBuff buffer = curBlock.getBufferWithoutHeader();
1040      // It is safe to manipulate this buffer because we own the buffer object.
1041      buffer.rewind();
1042      int klen = buffer.getInt();
1043      buffer.skip(Bytes.SIZEOF_INT);// Skip value len part
1044      ByteBuffer keyBuff = buffer.asSubByteBuffer(klen);
1045      if (keyBuff.hasArray()) {
1046        return new KeyValue.KeyOnlyKeyValue(keyBuff.array(),
1047          keyBuff.arrayOffset() + keyBuff.position(), klen);
1048      } else {
1049        return new ByteBufferKeyOnlyKeyValue(keyBuff, keyBuff.position(), klen);
1050      }
1051    }
1052
1053    public int compareKey(CellComparator comparator, ExtendedCell key) {
1054      blockBuffer.asSubByteBuffer(blockBuffer.position() + KEY_VALUE_LEN_SIZE, currKeyLen, pair);
1055      this.bufBackedKeyOnlyKv.setKey(pair.getFirst(), pair.getSecond(), currKeyLen, rowLen);
1056      return PrivateCellUtil.compareKeyIgnoresMvcc(comparator, key, this.bufBackedKeyOnlyKv);
1057    }
1058
1059    @Override
1060    public void shipped() throws IOException {
1061      this.returnBlocks(false);
1062    }
1063  }
1064
1065  @Override
1066  public Path getPath() {
1067    return path;
1068  }
1069
1070  @Override
1071  public DataBlockEncoding getDataBlockEncoding() {
1072    return dataBlockEncoder.getDataBlockEncoding();
1073  }
1074
1075  @Override
1076  public Configuration getConf() {
1077    return conf;
1078  }
1079
1080  @Override
1081  public void setConf(Configuration conf) {
1082    this.conf = conf;
1083  }
1084
1085  /** Minor versions in HFile starting with this number have hbase checksums */
1086  public static final int MINOR_VERSION_WITH_CHECKSUM = 1;
1087  /** In HFile minor version that does not support checksums */
1088  public static final int MINOR_VERSION_NO_CHECKSUM = 0;
1089
1090  /** HFile minor version that introduced pbuf filetrailer */
1091  public static final int PBUF_TRAILER_MINOR_VERSION = 2;
1092
1093  /**
1094   * The size of a (key length, value length) tuple that prefixes each entry in a data block.
1095   */
1096  public final static int KEY_VALUE_LEN_SIZE = 2 * Bytes.SIZEOF_INT;
1097
1098  /**
1099   * Retrieve block from cache. Validates the retrieved block's type vs {@code expectedBlockType}
1100   * and its encoding vs. {@code expectedDataBlockEncoding}. Unpacks the block as necessary.
1101   */
1102  @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.UNITTEST)
1103  public HFileBlock getCachedBlock(BlockCacheKey cacheKey, boolean cacheBlock, boolean useLock,
1104    boolean updateCacheMetrics, BlockType expectedBlockType,
1105    DataBlockEncoding expectedDataBlockEncoding) throws IOException {
1106    // Check cache for block. If found return.
1107    BlockCache cache = cacheConf.getBlockCache().orElse(null);
1108    long cachedBlockBytesRead = 0;
1109    if (cache != null) {
1110      HFileBlock cachedBlock = null;
1111      boolean isScanMetricsEnabled = ThreadLocalServerSideScanMetrics.isScanMetricsEnabled();
1112      try {
1113        cachedBlock = (HFileBlock) cache.getBlock(cacheKey, cacheBlock, useLock, updateCacheMetrics,
1114          expectedBlockType);
1115        if (cachedBlock != null) {
1116          if (cacheConf.shouldCacheCompressed(cachedBlock.getBlockType().getCategory())) {
1117            HFileBlock compressedBlock = cachedBlock;
1118            cachedBlock = compressedBlock.unpack(hfileContext, fsBlockReader);
1119            // In case of compressed block after unpacking we can release the compressed block
1120            if (compressedBlock != cachedBlock) {
1121              compressedBlock.release();
1122            }
1123          }
1124          try {
1125            validateBlockType(cachedBlock, expectedBlockType);
1126          } catch (IOException e) {
1127            returnAndEvictBlock(cache, cacheKey, cachedBlock);
1128            cachedBlock = null;
1129            throw e;
1130          }
1131
1132          if (expectedDataBlockEncoding == null) {
1133            return cachedBlock;
1134          }
1135          DataBlockEncoding actualDataBlockEncoding = cachedBlock.getDataBlockEncoding();
1136          // Block types other than data blocks always have
1137          // DataBlockEncoding.NONE. To avoid false negative cache misses, only
1138          // perform this check if cached block is a data block.
1139          if (
1140            cachedBlock.getBlockType().isData()
1141              && !actualDataBlockEncoding.equals(expectedDataBlockEncoding)
1142          ) {
1143            // This mismatch may happen if a Scanner, which is used for say a
1144            // compaction, tries to read an encoded block from the block cache.
1145            // The reverse might happen when an EncodedScanner tries to read
1146            // un-encoded blocks which were cached earlier.
1147            //
1148            // Because returning a data block with an implicit BlockType mismatch
1149            // will cause the requesting scanner to throw a disk read should be
1150            // forced here. This will potentially cause a significant number of
1151            // cache misses, so update so we should keep track of this as it might
1152            // justify the work on a CompoundScanner.
1153            if (
1154              !expectedDataBlockEncoding.equals(DataBlockEncoding.NONE)
1155                && !actualDataBlockEncoding.equals(DataBlockEncoding.NONE)
1156            ) {
1157              // If the block is encoded but the encoding does not match the
1158              // expected encoding it is likely the encoding was changed but the
1159              // block was not yet evicted. Evictions on file close happen async
1160              // so blocks with the old encoding still linger in cache for some
1161              // period of time. This event should be rare as it only happens on
1162              // schema definition change.
1163              LOG.info(
1164                "Evicting cached block with key {} because data block encoding mismatch; "
1165                  + "expected {}, actual {}, path={}",
1166                cacheKey, actualDataBlockEncoding, expectedDataBlockEncoding, path);
1167              // This is an error scenario. so here we need to release the block.
1168              returnAndEvictBlock(cache, cacheKey, cachedBlock);
1169            }
1170            cachedBlock = null;
1171            return null;
1172          }
1173          return cachedBlock;
1174        }
1175      } finally {
1176        // Count bytes read as cached block is being returned
1177        if (isScanMetricsEnabled && cachedBlock != null) {
1178          cachedBlockBytesRead = cachedBlock.getOnDiskSizeWithHeader();
1179          // Account for the header size of the next block if it exists
1180          if (cachedBlock.getNextBlockOnDiskSize() > 0) {
1181            cachedBlockBytesRead += cachedBlock.headerSize();
1182          }
1183        }
1184        if (cachedBlockBytesRead > 0) {
1185          ThreadLocalServerSideScanMetrics.addBytesReadFromBlockCache(cachedBlockBytesRead);
1186        }
1187      }
1188    }
1189    return null;
1190  }
1191
1192  private void returnAndEvictBlock(BlockCache cache, BlockCacheKey cacheKey, Cacheable block) {
1193    block.release();
1194    cache.evictBlock(cacheKey);
1195  }
1196
1197  /**
1198   * @param cacheBlock Add block to cache, if found
1199   * @return block wrapped in a ByteBuffer, with header skipped
1200   */
1201  @Override
1202  public HFileBlock getMetaBlock(String metaBlockName, boolean cacheBlock) throws IOException {
1203    if (trailer.getMetaIndexCount() == 0) {
1204      return null; // there are no meta blocks
1205    }
1206    if (metaBlockIndexReader == null) {
1207      throw new IOException(path + " meta index not loaded");
1208    }
1209
1210    byte[] mbname = Bytes.toBytes(metaBlockName);
1211    int block = metaBlockIndexReader.rootBlockContainingKey(mbname, 0, mbname.length);
1212    if (block == -1) {
1213      return null;
1214    }
1215    long blockSize = metaBlockIndexReader.getRootBlockDataSize(block);
1216
1217    // Per meta key from any given file, synchronize reads for said block. This
1218    // is OK to do for meta blocks because the meta block index is always
1219    // single-level.
1220    synchronized (metaBlockIndexReader.getRootBlockKey(block)) {
1221      // Check cache for block. If found return.
1222      long metaBlockOffset = metaBlockIndexReader.getRootBlockOffset(block);
1223      BlockCacheKey cacheKey =
1224        new BlockCacheKey(name, metaBlockOffset, this.isPrimaryReplicaReader(), BlockType.META);
1225
1226      cacheBlock &= cacheConf.shouldCacheBlockOnRead(BlockType.META.getCategory());
1227      HFileBlock cachedBlock =
1228        getCachedBlock(cacheKey, cacheBlock, false, true, BlockType.META, null);
1229      if (cachedBlock != null) {
1230        assert cachedBlock.isUnpacked() : "Packed block leak.";
1231        // Return a distinct 'shallow copy' of the block,
1232        // so pos does not get messed by the scanner
1233        return cachedBlock;
1234      }
1235      // Cache Miss, please load.
1236
1237      HFileBlock compressedBlock =
1238        fsBlockReader.readBlockData(metaBlockOffset, blockSize, true, false, true);
1239      HFileBlock uncompressedBlock = compressedBlock.unpack(hfileContext, fsBlockReader);
1240      if (compressedBlock != uncompressedBlock) {
1241        compressedBlock.release();
1242      }
1243
1244      // Cache the block
1245      if (cacheBlock) {
1246        cacheConf.getBlockCache().ifPresent(
1247          cache -> cache.cacheBlock(cacheKey, uncompressedBlock, cacheConf.isInMemory()));
1248      }
1249      return uncompressedBlock;
1250    }
1251  }
1252
1253  /**
1254   * Whether we use heap or not depends on our intent to cache the block. We want to avoid
1255   * allocating to off-heap if we intend to cache into the on-heap L1 cache. Otherwise, it's more
1256   * efficient to allocate to off-heap since we can control GC ourselves for those. So our decision
1257   * here breaks down as follows: <br>
1258   * If block cache is disabled, don't use heap. If we're not using the CombinedBlockCache, use heap
1259   * unless caching is disabled for the request. Otherwise, only use heap if caching is enabled and
1260   * the expected block type is not DATA (which goes to off-heap L2 in combined cache).
1261   * @see org.apache.hadoop.hbase.io.hfile.HFileBlock.FSReader#readBlockData(long, long, boolean,
1262   *      boolean, boolean)
1263   */
1264  private boolean shouldUseHeap(BlockType expectedBlockType, boolean cacheBlock) {
1265    if (!cacheConf.getBlockCache().isPresent()) {
1266      return false;
1267    }
1268
1269    // we only cache a block if cacheBlock is true and caching-on-read is enabled in CacheConfig
1270    // we can really only check for that if have an expectedBlockType
1271    if (expectedBlockType != null) {
1272      cacheBlock &= cacheConf.shouldCacheBlockOnRead(expectedBlockType.getCategory());
1273    }
1274
1275    if (!cacheConf.isCombinedBlockCache()) {
1276      // Block to cache in LruBlockCache must be an heap one, if caching enabled. So just allocate
1277      // block memory from heap for saving an extra off-heap to heap copying in that case.
1278      return cacheBlock;
1279    }
1280
1281    return cacheBlock && expectedBlockType != null && !expectedBlockType.isData();
1282  }
1283
1284  @Override
1285  public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize, final boolean cacheBlock,
1286    boolean pread, final boolean isCompaction, boolean updateCacheMetrics,
1287    BlockType expectedBlockType, DataBlockEncoding expectedDataBlockEncoding) throws IOException {
1288    return readBlock(dataBlockOffset, onDiskBlockSize, cacheBlock, pread, isCompaction,
1289      updateCacheMetrics, expectedBlockType, expectedDataBlockEncoding, false);
1290  }
1291
  /**
   * Reads the block at {@code dataBlockOffset}, first consulting the block cache (unless
   * {@code cacheOnly} is set or the configuration says not to read this block type from cache)
   * and falling back to the file system reader.
   * <p>
   * The cache is first checked without a lock. On a miss, when {@code cacheBlock} is set and the
   * configuration asks for locking on a miss, the offset lock for {@code dataBlockOffset} is taken
   * and the cache is checked once more before loading from the file system. A lock entry that was
   * taken is released in the {@code finally} block.
   * <p>
   * After a file system read the block is type-validated, optionally cached, and either returned
   * still packed (only when {@code cacheOnly} is set and the block's category is both cached
   * compressed and cached on read) or unpacked and returned.
   * @param dataBlockOffset           file offset of the block
   * @param onDiskBlockSize           on-disk size passed to the file system reader
   * @param cacheBlock                whether the caller wants the block cached
   * @param pread                     passed through to the file system reader
   * @param isCompaction              whether the read is for a compaction
   * @param updateCacheMetrics        whether cache metrics and the data block read counter are
   *                                  updated
   * @param expectedBlockType         expected type, or null to skip the type check
   * @param expectedDataBlockEncoding expected data block encoding for the cache lookup
   * @param cacheOnly                 when true the cache lookup is skipped
   * @return the block
   * @throws IOException if the block index is not loaded, the offset is out of range, a cached
   *                     data block has an encoding different from this reader's encoder, the
   *                     block type does not match, or the read fails
   */
  @Override
  public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize, final boolean cacheBlock,
    boolean pread, final boolean isCompaction, boolean updateCacheMetrics,
    BlockType expectedBlockType, DataBlockEncoding expectedDataBlockEncoding, boolean cacheOnly)
    throws IOException {
    if (dataBlockIndexReader == null) {
      throw new IOException(path + " block index not loaded");
    }
    // Valid block offsets lie in [0, load-on-open data offset).
    long trailerOffset = trailer.getLoadOnOpenDataOffset();
    if (dataBlockOffset < 0 || dataBlockOffset >= trailerOffset) {
      throw new IOException("Requested block is out of range: " + dataBlockOffset
        + ", lastDataBlockOffset: " + trailer.getLastDataBlockOffset()
        + ", trailer.getLoadOnOpenDataOffset: " + trailerOffset + ", path=" + path);
    }
    // For any given block from any given file, synchronize reads for said
    // block.
    // Without a cache, this synchronizing is needless overhead, but really
    // the other choice is to duplicate work (which the cache would prevent you
    // from doing).

    BlockCacheKey cacheKey =
      new BlockCacheKey(path, dataBlockOffset, this.isPrimaryReplicaReader(), expectedBlockType);

    boolean useLock = false;
    IdLock.Entry lockEntry = null;
    final Span span = Span.current();
    // BlockCacheKey#toString() is quite expensive to call, so if tracing isn't enabled, don't
    // record
    Attributes attributes = span.isRecording()
      ? Attributes.of(BLOCK_CACHE_KEY_KEY, cacheKey.toString())
      : Attributes.empty();
    try {
      // At most two passes: the first without the offset lock, the second (optional) with it.
      while (true) {
        // Check cache for block. If found return.
        if (cacheConf.shouldReadBlockFromCache(expectedBlockType) && !cacheOnly) {
          if (useLock) {
            lockEntry = offsetLock.getLockEntry(dataBlockOffset);
          }
          // Try and get the block from the block cache. If the useLock variable is true then this
          // is the second time through the loop and it should not be counted as a block cache miss.
          HFileBlock cachedBlock = getCachedBlock(cacheKey, cacheBlock, useLock, updateCacheMetrics,
            expectedBlockType, expectedDataBlockEncoding);
          if (cachedBlock != null) {
            if (LOG.isTraceEnabled()) {
              LOG.trace("Block for file {} is coming from Cache {}",
                Bytes.toString(cachedBlock.getHFileContext().getTableName()), cachedBlock);
            }
            span.addEvent("block cache hit", attributes);
            assert cachedBlock.isUnpacked() : "Packed block leak.";
            if (cachedBlock.getBlockType().isData()) {
              if (updateCacheMetrics) {
                HFile.DATABLOCK_READ_COUNT.increment();
              }
              // Validate encoding type for data blocks. We include encoding
              // type in the cache key, and we expect it to match on a cache hit.
              if (cachedBlock.getDataBlockEncoding() != dataBlockEncoder.getDataBlockEncoding()) {
                // Remember to release the block when in exceptional path.
                cacheConf.getBlockCache().ifPresent(cache -> {
                  returnAndEvictBlock(cache, cacheKey, cachedBlock);
                });
                throw new IOException("Cached block under key " + cacheKey + " "
                  + "has wrong encoding: " + cachedBlock.getDataBlockEncoding() + " (expected: "
                  + dataBlockEncoder.getDataBlockEncoding() + "), path=" + path);
              }
            }
            // Cache-hit. Return!
            return cachedBlock;
          }

          if (!useLock && cacheBlock && cacheConf.shouldLockOnCacheMiss(expectedBlockType)) {
            // check cache again with lock
            useLock = true;
            continue;
          }
          // Carry on, please load.
        }

        span.addEvent("block cache miss", attributes);
        // Load block from filesystem.
        HFileBlock hfileBlock = fsBlockReader.readBlockData(dataBlockOffset, onDiskBlockSize, pread,
          !isCompaction, shouldUseHeap(expectedBlockType, cacheBlock));
        try {
          validateBlockType(hfileBlock, expectedBlockType);
        } catch (IOException e) {
          // The freshly read block is of no use to anyone; release it before propagating.
          hfileBlock.release();
          throw e;
        }
        BlockType.BlockCategory category = hfileBlock.getBlockType().getCategory();
        final boolean cacheCompressed = cacheConf.shouldCacheCompressed(category);
        final boolean cacheOnRead = cacheConf.shouldCacheBlockOnRead(category);

        // Don't need the unpacked block back and we're storing the block in the cache compressed
        if (cacheOnly && cacheCompressed && cacheOnRead) {
          HFileBlock blockNoChecksum = BlockCacheUtil.getBlockForCaching(cacheConf, hfileBlock);
          cacheConf.getBlockCache().ifPresent(cache -> {
            LOG.debug("Skipping decompression of block {} in prefetch", cacheKey);
            // Cache the block if necessary
            if (cacheBlock && cacheConf.shouldCacheBlockOnRead(category)) {
              cache.cacheBlock(cacheKey, blockNoChecksum, cacheConf.isInMemory(), cacheOnly);
            }
          });

          if (updateCacheMetrics && hfileBlock.getBlockType().isData()) {
            HFile.DATABLOCK_READ_COUNT.increment();
          }
          return blockNoChecksum;
        }
        HFileBlock unpacked = hfileBlock.unpack(hfileContext, fsBlockReader);
        HFileBlock unpackedNoChecksum = BlockCacheUtil.getBlockForCaching(cacheConf, unpacked);
        // Cache the block if necessary
        cacheConf.getBlockCache().ifPresent(cache -> {
          if (cacheBlock && cacheConf.shouldCacheBlockOnRead(category)) {
            // Using the wait on cache during compaction and prefetching.
            // The packed form is cached when this category is cached compressed, otherwise the
            // unpacked form.
            cache.cacheBlock(cacheKey,
              cacheCompressed
                ? BlockCacheUtil.getBlockForCaching(cacheConf, hfileBlock)
                : unpackedNoChecksum,
              cacheConf.isInMemory(), cacheOnly);
          }
        });
        if (unpacked != hfileBlock) {
          // End of life here if hfileBlock is an independent block.
          hfileBlock.release();
        }
        if (updateCacheMetrics && hfileBlock.getBlockType().isData()) {
          HFile.DATABLOCK_READ_COUNT.increment();
        }

        return unpackedNoChecksum;
      }
    } finally {
      // Only set when the second, locked pass through the loop was taken.
      if (lockEntry != null) {
        offsetLock.releaseLockEntry(lockEntry);
      }
    }
  }
1428
1429  @Override
1430  public boolean hasMVCCInfo() {
1431    return fileInfo.shouldIncludeMemStoreTS() && fileInfo.isDecodeMemstoreTS();
1432  }
1433
1434  /**
1435   * Compares the actual type of a block retrieved from cache or disk with its expected type and
1436   * throws an exception in case of a mismatch. Expected block type of {@link BlockType#DATA} is
1437   * considered to match the actual block type [@link {@link BlockType#ENCODED_DATA} as well.
1438   * @param block             a block retrieved from cache or disk
1439   * @param expectedBlockType the expected block type, or null to skip the check
1440   */
1441  private void validateBlockType(HFileBlock block, BlockType expectedBlockType) throws IOException {
1442    if (expectedBlockType == null) {
1443      return;
1444    }
1445    BlockType actualBlockType = block.getBlockType();
1446    if (expectedBlockType.isData() && actualBlockType.isData()) {
1447      // We consider DATA to match ENCODED_DATA for the purpose of this
1448      // verification.
1449      return;
1450    }
1451    if (actualBlockType != expectedBlockType) {
1452      throw new IOException("Expected block type " + expectedBlockType + ", " + "but got "
1453        + actualBlockType + ": " + block + ", path=" + path);
1454    }
1455  }
1456
1457  /**
1458   * @return Last key as cell in the file. May be null if file has no entries. Note that this is not
1459   *         the last row key, but it is the Cell representation of the last key
1460   */
1461  @Override
1462  public Optional<ExtendedCell> getLastKey() {
1463    return dataBlockIndexReader.isEmpty()
1464      ? Optional.empty()
1465      : Optional.of(fileInfo.getLastKeyCell());
1466  }
1467
1468  /**
1469   * @return Midkey for this file. We work with block boundaries only so returned midkey is an
1470   *         approximation only.
1471   */
1472  @Override
1473  public Optional<ExtendedCell> midKey() throws IOException {
1474    return Optional.ofNullable(dataBlockIndexReader.midkey(this));
1475  }
1476
1477  @Override
1478  public void close() throws IOException {
1479    close(cacheConf.shouldEvictOnClose());
1480  }
1481
1482  @Override
1483  public DataBlockEncoding getEffectiveEncodingInCache(boolean isCompaction) {
1484    return dataBlockEncoder.getEffectiveEncodingInCache(isCompaction);
1485  }
1486
1487  /** For testing */
1488  @Override
1489  public HFileBlock.FSReader getUncachedBlockReader() {
1490    return fsBlockReader;
1491  }
1492
1493  /**
1494   * Scanner that operates on encoded data blocks.
1495   */
1496  protected static class EncodedScanner extends HFileScannerImpl {
1497    private final HFileBlockDecodingContext decodingCtx;
1498    private final DataBlockEncoder.EncodedSeeker seeker;
1499    private final DataBlockEncoder dataBlockEncoder;
1500
1501    public EncodedScanner(HFile.Reader reader, boolean cacheBlocks, boolean pread,
1502      boolean isCompaction, HFileContext meta, Configuration conf) {
1503      super(reader, cacheBlocks, pread, isCompaction);
1504      DataBlockEncoding encoding = reader.getDataBlockEncoding();
1505      dataBlockEncoder = encoding.getEncoder();
1506      decodingCtx = dataBlockEncoder.newDataBlockDecodingContext(conf, meta);
1507      seeker = dataBlockEncoder.createSeeker(decodingCtx);
1508    }
1509
1510    @Override
1511    public boolean isSeeked() {
1512      return curBlock != null;
1513    }
1514
1515    @Override
1516    public void setNonSeekedState() {
1517      reset();
1518    }
1519
1520    /**
1521     * Updates the current block to be the given {@link HFileBlock}. Seeks to the the first
1522     * key/value pair.
1523     * @param newBlock the block to make current, and read by {@link HFileReaderImpl#readBlock},
1524     *                 it's a totally new block with new allocated {@link ByteBuff}, so if no
1525     *                 further reference to this block, we should release it carefully.
1526     */
1527    @Override
1528    protected void updateCurrentBlock(HFileBlock newBlock) throws CorruptHFileException {
1529      try {
1530        // sanity checks
1531        if (newBlock.getBlockType() != BlockType.ENCODED_DATA) {
1532          throw new IllegalStateException("EncodedScanner works only on encoded data blocks");
1533        }
1534        short dataBlockEncoderId = newBlock.getDataBlockEncodingId();
1535        if (!DataBlockEncoding.isCorrectEncoder(dataBlockEncoder, dataBlockEncoderId)) {
1536          String encoderCls = dataBlockEncoder.getClass().getName();
1537          throw new CorruptHFileException(
1538            "Encoder " + encoderCls + " doesn't support data block encoding "
1539              + DataBlockEncoding.getNameFromId(dataBlockEncoderId) + ",path=" + reader.getPath());
1540        }
1541        updateCurrBlockRef(newBlock);
1542        ByteBuff encodedBuffer = getEncodedBuffer(newBlock);
1543        seeker.setCurrentBuffer(encodedBuffer);
1544      } finally {
1545        releaseIfNotCurBlock(newBlock);
1546      }
1547      // Reset the next indexed key
1548      this.nextIndexedKey = null;
1549    }
1550
1551    private ByteBuff getEncodedBuffer(HFileBlock newBlock) {
1552      ByteBuff origBlock = newBlock.getBufferReadOnly();
1553      int pos = newBlock.headerSize() + DataBlockEncoding.ID_SIZE;
1554      origBlock.position(pos);
1555      origBlock
1556        .limit(pos + newBlock.getUncompressedSizeWithoutHeader() - DataBlockEncoding.ID_SIZE);
1557      return origBlock.slice();
1558    }
1559
1560    @Override
1561    protected boolean processFirstDataBlock() throws IOException {
1562      seeker.rewind();
1563      return true;
1564    }
1565
1566    @Override
1567    public boolean next() throws IOException {
1568      boolean isValid = seeker.next();
1569      if (!isValid) {
1570        HFileBlock newBlock = readNextDataBlock();
1571        isValid = newBlock != null;
1572        if (isValid) {
1573          updateCurrentBlock(newBlock);
1574        } else {
1575          setNonSeekedState();
1576        }
1577      }
1578      return isValid;
1579    }
1580
1581    @Override
1582    public ExtendedCell getKey() {
1583      assertValidSeek();
1584      return seeker.getKey();
1585    }
1586
1587    @Override
1588    public ByteBuffer getValue() {
1589      assertValidSeek();
1590      return seeker.getValueShallowCopy();
1591    }
1592
1593    @Override
1594    public ExtendedCell getCell() {
1595      if (this.curBlock == null) {
1596        return null;
1597      }
1598      return seeker.getCell();
1599    }
1600
1601    private void assertValidSeek() {
1602      if (this.curBlock == null) {
1603        throw new NotSeekedException(reader.getPath());
1604      }
1605    }
1606
1607    @Override
1608    protected ExtendedCell getFirstKeyCellInBlock(HFileBlock curBlock) {
1609      return dataBlockEncoder.getFirstKeyCellInBlock(getEncodedBuffer(curBlock));
1610    }
1611
1612    @Override
1613    protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, ExtendedCell nextIndexedKey,
1614      boolean rewind, ExtendedCell key, boolean seekBefore) throws IOException {
1615      if (this.curBlock == null || this.curBlock.getOffset() != seekToBlock.getOffset()) {
1616        updateCurrentBlock(seekToBlock);
1617      } else if (rewind) {
1618        seeker.rewind();
1619      }
1620      this.nextIndexedKey = nextIndexedKey;
1621      return seeker.seekToKeyInBlock(key, seekBefore);
1622    }
1623
1624    @Override
1625    public int compareKey(CellComparator comparator, ExtendedCell key) {
1626      return seeker.compareKey(comparator, key);
1627    }
1628  }
1629
1630  /**
1631   * Returns a buffer with the Bloom filter metadata. The caller takes ownership of the buffer.
1632   */
1633  @Override
1634  public DataInput getGeneralBloomFilterMetadata() throws IOException {
1635    return this.getBloomFilterMetadata(BlockType.GENERAL_BLOOM_META);
1636  }
1637
1638  @Override
1639  public DataInput getDeleteBloomFilterMetadata() throws IOException {
1640    return this.getBloomFilterMetadata(BlockType.DELETE_FAMILY_BLOOM_META);
1641  }
1642
1643  private DataInput getBloomFilterMetadata(BlockType blockType) throws IOException {
1644    if (
1645      blockType != BlockType.GENERAL_BLOOM_META && blockType != BlockType.DELETE_FAMILY_BLOOM_META
1646    ) {
1647      throw new RuntimeException(
1648        "Block Type: " + blockType.toString() + " is not supported, path=" + path);
1649    }
1650
1651    for (HFileBlock b : fileInfo.getLoadOnOpenBlocks()) {
1652      if (b.getBlockType() == blockType) {
1653        return b.getByteStream();
1654      }
1655    }
1656    return null;
1657  }
1658
1659  public boolean isFileInfoLoaded() {
1660    return true; // We load file info in constructor in version 2.
1661  }
1662
1663  @Override
1664  public HFileContext getFileContext() {
1665    return hfileContext;
1666  }
1667
1668  /**
1669   * Returns false if block prefetching was requested for this file and has not completed, true
1670   * otherwise
1671   */
1672  @Override
1673  public boolean prefetchComplete() {
1674    return PrefetchExecutor.isCompleted(path);
1675  }
1676
1677  /**
1678   * Returns true if block prefetching was started after waiting for specified delay, false
1679   * otherwise
1680   */
1681  @Override
1682  public boolean prefetchStarted() {
1683    return PrefetchExecutor.isPrefetchStarted();
1684  }
1685
1686  /**
1687   * Create a Scanner on this file. No seeks or reads are done on creation. Call
1688   * {@link HFileScanner#seekTo(ExtendedCell)} to position an start the read. There is nothing to
1689   * clean up in a Scanner. Letting go of your references to the scanner is sufficient. NOTE: Do not
1690   * use this overload of getScanner for compactions. See
1691   * {@link #getScanner(Configuration, boolean, boolean, boolean)}
1692   * @param conf        Store configuration.
1693   * @param cacheBlocks True if we should cache blocks read in by this scanner.
1694   * @param pread       Use positional read rather than seek+read if true (pread is better for
1695   *                    random reads, seek+read is better scanning).
1696   * @return Scanner on this file.
1697   */
1698  @Override
1699  public HFileScanner getScanner(Configuration conf, boolean cacheBlocks, final boolean pread) {
1700    return getScanner(conf, cacheBlocks, pread, false);
1701  }
1702
1703  /**
1704   * Create a Scanner on this file. No seeks or reads are done on creation. Call
1705   * {@link HFileScanner#seekTo(ExtendedCell)} to position an start the read. There is nothing to
1706   * clean up in a Scanner. Letting go of your references to the scanner is sufficient.
1707   * @param conf         Store configuration.
1708   * @param cacheBlocks  True if we should cache blocks read in by this scanner.
1709   * @param pread        Use positional read rather than seek+read if true (pread is better for
1710   *                     random reads, seek+read is better scanning).
1711   * @param isCompaction is scanner being used for a compaction?
1712   * @return Scanner on this file.
1713   */
1714  @Override
1715  public HFileScanner getScanner(Configuration conf, boolean cacheBlocks, final boolean pread,
1716    final boolean isCompaction) {
1717    if (dataBlockEncoder.useEncodedScanner()) {
1718      return new EncodedScanner(this, cacheBlocks, pread, isCompaction, this.hfileContext, conf);
1719    }
1720    return new HFileScannerImpl(this, cacheBlocks, pread, isCompaction);
1721  }
1722
1723  public int getMajorVersion() {
1724    return 3;
1725  }
1726
1727  @Override
1728  public void unbufferStream() {
1729    fsBlockReader.unbufferStream();
1730  }
1731}