Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.io.hfile;
019
020import static org.apache.hadoop.hbase.trace.HBaseSemanticAttributes.BLOCK_CACHE_KEY_KEY;
021
022import io.opentelemetry.api.common.Attributes;
023import io.opentelemetry.api.trace.Span;
024import java.io.DataInput;
025import java.io.IOException;
026import java.nio.ByteBuffer;
027import java.util.ArrayList;
028import java.util.Optional;
029import java.util.function.IntConsumer;
030import org.apache.hadoop.conf.Configurable;
031import org.apache.hadoop.conf.Configuration;
032import org.apache.hadoop.fs.Path;
033import org.apache.hadoop.hbase.ByteBufferKeyOnlyKeyValue;
034import org.apache.hadoop.hbase.Cell;
035import org.apache.hadoop.hbase.CellComparator;
036import org.apache.hadoop.hbase.CellUtil;
037import org.apache.hadoop.hbase.ExtendedCell;
038import org.apache.hadoop.hbase.HConstants;
039import org.apache.hadoop.hbase.KeyValue;
040import org.apache.hadoop.hbase.PrivateCellUtil;
041import org.apache.hadoop.hbase.SizeCachedByteBufferKeyValue;
042import org.apache.hadoop.hbase.SizeCachedKeyValue;
043import org.apache.hadoop.hbase.SizeCachedNoTagsByteBufferKeyValue;
044import org.apache.hadoop.hbase.SizeCachedNoTagsKeyValue;
045import org.apache.hadoop.hbase.io.compress.Compression;
046import org.apache.hadoop.hbase.io.encoding.DataBlockEncoder;
047import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
048import org.apache.hadoop.hbase.io.encoding.HFileBlockDecodingContext;
049import org.apache.hadoop.hbase.nio.ByteBuff;
050import org.apache.hadoop.hbase.regionserver.KeyValueScanner;
051import org.apache.hadoop.hbase.util.ByteBufferUtils;
052import org.apache.hadoop.hbase.util.Bytes;
053import org.apache.hadoop.hbase.util.IdLock;
054import org.apache.hadoop.hbase.util.ObjectIntPair;
055import org.apache.hadoop.io.WritableUtils;
056import org.apache.yetus.audience.InterfaceAudience;
057import org.slf4j.Logger;
058import org.slf4j.LoggerFactory;
059
060/**
061 * Implementation that can handle all hfile versions of {@link HFile.Reader}.
062 */
063@InterfaceAudience.Private
064@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "URF_UNREAD_PUBLIC_OR_PROTECTED_FIELD")
065public abstract class HFileReaderImpl implements HFile.Reader, Configurable {
  // This class is HFileReaderV3 + HFileReaderV2 + AbstractHFileReader all squashed together into
  // one file, and likewise for all the HFileReader.ScannerV? implementations. The layering ran up
  // against the MaxInlineLevel limit because too many tiers were involved in reading from an
  // hfile, and the source was hard to navigate with so many classes participating in a read.
  private static final Logger LOG = LoggerFactory.getLogger(HFileReaderImpl.class);

  /** Data block index reader keeping the root data index in memory */
  protected HFileBlockIndex.CellBasedKeyBlockIndexReader dataBlockIndexReader;

  /** Meta block index reader -- always single level */
  protected HFileBlockIndex.ByteArrayKeyBlockIndexReader metaBlockIndexReader;

  /** File trailer; taken from the {@link HFileInfo} in the constructor. */
  protected FixedFileTrailer trailer;

  /** Primary-replica flag copied from the {@link ReaderContext} in the constructor. */
  private final boolean primaryReplicaReader;

  /**
   * What kind of data block encoding should be used while reading, writing, and handling cache.
   */
  protected HFileDataBlockEncoder dataBlockEncoder = NoOpDataBlockEncoder.INSTANCE;

  /** Block cache configuration. */
  protected final CacheConfig cacheConf;

  /** Reader context this reader was constructed with; source of the path and the file size. */
  protected ReaderContext context;

  /** HFile info this reader was constructed with. */
  protected final HFileInfo fileInfo;

  /** Path of file */
  protected final Path path;

  /** File name to be used for block names */
  protected final String name;

  /** Configuration passed to the constructor; also handed to the block reader with the encoder. */
  private Configuration conf;

  /** File context; taken from the {@link HFileInfo} in the constructor. */
  protected HFileContext hfileContext;

  /** Filesystem-level block reader. */
  protected HFileBlock.FSReader fsBlockReader;

  /**
   * A "sparse lock" implementation allowing to lock on a particular block identified by offset. The
   * purpose of this is to avoid two clients loading the same block, and have all but one client
   * wait to get the block from the cache.
   */
  private IdLock offsetLock = new IdLock();

  /** Minimum minor version supported by this HFile format */
  static final int MIN_MINOR_VERSION = 0;

  /** Maximum minor version supported by this HFile format */
  // We went to version 2 when we moved to pb'ing fileinfo and the trailer on
  // the file. This version can read Writables version 1.
  static final int MAX_MINOR_VERSION = 3;

  /** Minor versions starting with this number have faked index key */
  static final int MINOR_VERSION_WITH_FAKED_KEY = 3;
124
125  /**
126   * Opens a HFile.
127   * @param context   Reader context info
128   * @param fileInfo  HFile info
129   * @param cacheConf Cache configuration.
130   * @param conf      Configuration
131   */
132  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "URF_UNREAD_PUBLIC_OR_PROTECTED_FIELD")
133  public HFileReaderImpl(ReaderContext context, HFileInfo fileInfo, CacheConfig cacheConf,
134    Configuration conf) throws IOException {
135    this.cacheConf = cacheConf;
136    this.context = context;
137    this.path = context.getFilePath();
138    this.name = path.getName();
139    this.conf = conf;
140    this.primaryReplicaReader = context.isPrimaryReplicaReader();
141    this.fileInfo = fileInfo;
142    this.trailer = fileInfo.getTrailer();
143    this.hfileContext = fileInfo.getHFileContext();
144    this.fsBlockReader =
145      new HFileBlock.FSReaderImpl(context, hfileContext, cacheConf.getByteBuffAllocator(), conf);
146    this.dataBlockEncoder = HFileDataBlockEncoderImpl.createFromFileInfo(fileInfo);
147    fsBlockReader.setDataBlockEncoder(dataBlockEncoder, conf);
148    dataBlockIndexReader = fileInfo.getDataBlockIndexReader();
149    metaBlockIndexReader = fileInfo.getMetaBlockIndexReader();
150  }
151
152  @SuppressWarnings("serial")
153  public static class BlockIndexNotLoadedException extends IllegalStateException {
154    public BlockIndexNotLoadedException(Path path) {
155      // Add a message in case anyone relies on it as opposed to class name.
156      super(path + " block index not loaded");
157    }
158  }
159
160  public CacheConfig getCacheConf() {
161    return cacheConf;
162  }
163
164  private Optional<String> toStringFirstKey() {
165    return getFirstKey().map(CellUtil::getCellKeyAsString);
166  }
167
168  private Optional<String> toStringLastKey() {
169    return getLastKey().map(CellUtil::getCellKeyAsString);
170  }
171
172  @Override
173  public String toString() {
174    return "reader=" + path.toString()
175      + (!isFileInfoLoaded()
176        ? ""
177        : ", compression=" + trailer.getCompressionCodec().getName() + ", cacheConf=" + cacheConf
178          + ", firstKey=" + toStringFirstKey() + ", lastKey=" + toStringLastKey())
179      + ", avgKeyLen=" + fileInfo.getAvgKeyLen() + ", avgValueLen=" + fileInfo.getAvgValueLen()
180      + ", entries=" + trailer.getEntryCount() + ", length=" + context.getFileSize();
181  }
182
183  @Override
184  public long length() {
185    return context.getFileSize();
186  }
187
188  /**
189   * @return the first key in the file. May be null if file has no entries. Note that this is not
190   *         the first row key, but rather the byte form of the first KeyValue.
191   */
192  @Override
193  public Optional<ExtendedCell> getFirstKey() {
194    if (dataBlockIndexReader == null) {
195      throw new BlockIndexNotLoadedException(path);
196    }
197    return dataBlockIndexReader.isEmpty()
198      ? Optional.empty()
199      : Optional.of(dataBlockIndexReader.getRootBlockKey(0));
200  }
201
202  /**
203   * TODO left from {@link HFile} version 1: move this to StoreFile after Ryan's patch goes in to
204   * eliminate {@link KeyValue} here.
205   * @return the first row key, or null if the file is empty.
206   */
207  @Override
208  public Optional<byte[]> getFirstRowKey() {
209    // We have to copy the row part to form the row key alone
210    return getFirstKey().map(CellUtil::cloneRow);
211  }
212
213  /**
214   * TODO left from {@link HFile} version 1: move this to StoreFile after Ryan's patch goes in to
215   * eliminate {@link KeyValue} here.
216   * @return the last row key, or null if the file is empty.
217   */
218  @Override
219  public Optional<byte[]> getLastRowKey() {
220    // We have to copy the row part to form the row key alone
221    return getLastKey().map(CellUtil::cloneRow);
222  }
223
224  /** Returns number of KV entries in this HFile */
225  @Override
226  public long getEntries() {
227    return trailer.getEntryCount();
228  }
229
230  /** Returns comparator */
231  @Override
232  public CellComparator getComparator() {
233    return this.hfileContext.getCellComparator();
234  }
235
236  public Compression.Algorithm getCompressionAlgorithm() {
237    return trailer.getCompressionCodec();
238  }
239
240  /**
241   * @return the total heap size of data and meta block indexes in bytes. Does not take into account
242   *         non-root blocks of a multilevel data index.
243   */
244  @Override
245  public long indexSize() {
246    return (dataBlockIndexReader != null ? dataBlockIndexReader.heapSize() : 0)
247      + ((metaBlockIndexReader != null) ? metaBlockIndexReader.heapSize() : 0);
248  }
249
250  @Override
251  public String getName() {
252    return name;
253  }
254
255  @Override
256  public void setDataBlockEncoder(HFileDataBlockEncoder dataBlockEncoder) {
257    this.dataBlockEncoder = dataBlockEncoder;
258    this.fsBlockReader.setDataBlockEncoder(dataBlockEncoder, conf);
259  }
260
261  @Override
262  public void setDataBlockIndexReader(HFileBlockIndex.CellBasedKeyBlockIndexReader reader) {
263    this.dataBlockIndexReader = reader;
264  }
265
266  @Override
267  public HFileBlockIndex.CellBasedKeyBlockIndexReader getDataBlockIndexReader() {
268    return dataBlockIndexReader;
269  }
270
271  @Override
272  public void setMetaBlockIndexReader(HFileBlockIndex.ByteArrayKeyBlockIndexReader reader) {
273    this.metaBlockIndexReader = reader;
274  }
275
276  @Override
277  public HFileBlockIndex.ByteArrayKeyBlockIndexReader getMetaBlockIndexReader() {
278    return metaBlockIndexReader;
279  }
280
281  @Override
282  public FixedFileTrailer getTrailer() {
283    return trailer;
284  }
285
286  @Override
287  public ReaderContext getContext() {
288    return this.context;
289  }
290
291  @Override
292  public HFileInfo getHFileInfo() {
293    return this.fileInfo;
294  }
295
296  @Override
297  public boolean isPrimaryReplicaReader() {
298    return primaryReplicaReader;
299  }
300
301  /**
302   * An exception thrown when an operation requiring a scanner to be seeked is invoked on a scanner
303   * that is not seeked.
304   */
305  @SuppressWarnings("serial")
306  public static class NotSeekedException extends IllegalStateException {
307    public NotSeekedException(Path path) {
308      super(path + " not seeked to a key/value");
309    }
310  }
311
312  public static class HFileScannerImpl implements HFileScanner {
    // Buffer of the block the scanner is positioned in. isSeeked() reports true exactly when this
    // is non-null.
    private ByteBuff blockBuffer;
    // Read options fixed at construction and passed along on every block load.
    protected final boolean cacheBlocks;
    protected final boolean pread;
    protected final boolean isCompaction;
    // Lengths and mvcc of the cell at the current position; filled in by readKeyValueLen(),
    // readMvccVersion() and blockSeek().
    private int currKeyLen;
    private int currValueLen;
    private int currMemstoreTSLen;
    private long currMemstoreTS;
    // The reader this scanner reads through.
    protected final HFile.Reader reader;
    private int currTagsLen;
    private short rowLen;
    // Buffer-backed key-only KeyValue, re-pointed at each candidate key during blockSeek()
    private ByteBufferKeyOnlyKeyValue bufBackedKeyOnlyKv = new ByteBufferKeyOnlyKeyValue();
    // A pair reused in blockSeek() so that we don't create a lot of garbage objects
    final ObjectIntPair<ByteBuffer> pair = new ObjectIntPair<>();

    /**
     * The next indexed key is to keep track of the indexed key of the next data block. If the
     * nextIndexedKey is KeyValueScanner.NO_NEXT_INDEXED_KEY, it means that the current data block
     * is the last data block. If the nextIndexedKey is null, it means the nextIndexedKey has not
     * been loaded yet.
     */
    protected ExtendedCell nextIndexedKey;

    // Current block being used. NOTICE: DON'T release curBlock separately except in shipped() or
    // close() methods. Because the shipped() or close() will do the release finally, even if any
    // exception occurs the curBlock will be released by the close() method (see
    // RegionScannerImpl#handleException). Call releaseIfNotCurBlock() to release an
    // unreferenced block please.
    protected HFileBlock curBlock;
    // Whether we returned a result for curBlock's size in recordBlockSize().
    // Set back to false by updateCurrBlockRef() when it installs a different block.
    private boolean providedCurrentBlockSize = false;
346
347    public HFileBlock getCurBlock() {
348      return curBlock;
349    }
350
    // Previous blocks that were used in the course of the read. Shared-memory blocks are parked
    // here by updateCurrBlockRef() and reset(), and released together by returnBlocks().
    protected final ArrayList<HFileBlock> prevBlocks = new ArrayList<>();
353
354    public HFileScannerImpl(final HFile.Reader reader, final boolean cacheBlocks,
355      final boolean pread, final boolean isCompaction) {
356      this.reader = reader;
357      this.cacheBlocks = cacheBlocks;
358      this.pread = pread;
359      this.isCompaction = isCompaction;
360    }
361
362    void updateCurrBlockRef(HFileBlock block) {
363      if (block != null && curBlock != null && block.getOffset() == curBlock.getOffset()) {
364        return;
365      }
366      if (this.curBlock != null && this.curBlock.isSharedMem()) {
367        prevBlocks.add(this.curBlock);
368      }
369      this.curBlock = block;
370      this.providedCurrentBlockSize = false;
371    }
372
373    void reset() {
374      // We don't have to keep ref to heap block
375      if (this.curBlock != null && this.curBlock.isSharedMem()) {
376        this.prevBlocks.add(this.curBlock);
377      }
378      this.curBlock = null;
379    }
380
381    private void returnBlocks(boolean returnAll) {
382      this.prevBlocks.forEach(HFileBlock::release);
383      this.prevBlocks.clear();
384      if (returnAll && this.curBlock != null) {
385        this.curBlock.release();
386        this.curBlock = null;
387      }
388    }
389
390    @Override
391    public boolean isSeeked() {
392      return blockBuffer != null;
393    }
394
395    @Override
396    public String toString() {
397      return "HFileScanner for reader " + String.valueOf(getReader());
398    }
399
400    protected void assertSeeked() {
401      if (!isSeeked()) {
402        throw new NotSeekedException(reader.getPath());
403      }
404    }
405
406    @Override
407    public HFile.Reader getReader() {
408      return reader;
409    }
410
411    // From non encoded HFiles, we always read back KeyValue or its descendant.(Note: When HFile
412    // block is in DBB, it will be OffheapKV). So all parts of the Cell is in a contiguous
413    // array/buffer. How many bytes we should wrap to make the KV is what this method returns.
414    private int getKVBufSize() {
415      int kvBufSize = KEY_VALUE_LEN_SIZE + currKeyLen + currValueLen;
416      if (currTagsLen > 0) {
417        kvBufSize += Bytes.SIZEOF_SHORT + currTagsLen;
418      }
419      return kvBufSize;
420    }
421
422    @Override
423    public void close() {
424      if (!pread) {
425        // For seek + pread stream socket should be closed when the scanner is closed. HBASE-9393
426        reader.unbufferStream();
427      }
428      this.returnBlocks(true);
429    }
430
431    @Override
432    public void recordBlockSize(IntConsumer blockSizeConsumer) {
433      if (!providedCurrentBlockSize && curBlock != null) {
434        providedCurrentBlockSize = true;
435        blockSizeConsumer.accept(curBlock.getUncompressedSizeWithoutHeader());
436      }
437    }
438
439    // Returns the #bytes in HFile for the current cell. Used to skip these many bytes in current
440    // HFile block's buffer so as to position to the next cell.
441    private int getCurCellSerializedSize() {
442      int curCellSize = KEY_VALUE_LEN_SIZE + currKeyLen + currValueLen + currMemstoreTSLen;
443      if (this.reader.getFileContext().isIncludesTags()) {
444        curCellSize += Bytes.SIZEOF_SHORT + currTagsLen;
445      }
446      return curCellSize;
447    }
448
    /**
     * Decodes the header of the cell at the current buffer position into the scanner's fields:
     * key length, value length, row length, tags length (only when the file includes tags) and
     * the mvcc version. All reads are made at offsets relative to the current position.
     */
    protected void readKeyValueLen() {
      // This is a hot method. We go out of our way to make this method short so it can be
      // inlined and is not too big to compile. We also manage position in ByteBuffer ourselves
      // because it is faster than going via range-checked ByteBuffer methods or going through a
      // byte buffer array a byte at a time.
      // Get a long at a time rather than read two individual ints. In micro-benchmarking, even
      // with the extra bit-fiddling, this is order-of-magnitude faster than getting two ints.
      // Trying to imitate what was done - need to profile if this is better or
      // earlier way is better by doing mark and reset?
      // But ensure that you read long instead of two ints
      long ll = blockBuffer.getLongAfterPosition(0);
      // Read top half as an int of key length and bottom int as value length
      this.currKeyLen = (int) (ll >> Integer.SIZE);
      // The (int) cast keeps only the low 32 bits of the XOR result.
      // NOTE(review): presumably the mask has zeros in its low half so the XOR leaves those bits
      // untouched - confirm against Bytes.MASK_FOR_LOWER_INT_IN_LONG.
      this.currValueLen = (int) (Bytes.MASK_FOR_LOWER_INT_IN_LONG ^ ll);
      checkKeyValueLen();
      // The row length is the short that follows the two length ints.
      this.rowLen = blockBuffer.getShortAfterPosition(Bytes.SIZEOF_LONG);
      // p is the offset from the current position of whatever follows the key and value
      int p = (Bytes.SIZEOF_LONG + currKeyLen + currValueLen);
      if (reader.getFileContext().isIncludesTags()) {
        // Tags length is a short.
        this.currTagsLen = blockBuffer.getShortAfterPosition(p);
        checkTagsLen();
        p += (Bytes.SIZEOF_SHORT + currTagsLen);
      }
      // p now points at the mvcc version, if the file carries one.
      readMvccVersion(p);
    }
475
476    private final void checkTagsLen() {
477      if (checkLen(this.currTagsLen)) {
478        throw new IllegalStateException(
479          "Invalid currTagsLen " + this.currTagsLen + ". Block offset: " + curBlock.getOffset()
480            + ", block length: " + this.blockBuffer.limit() + ", position: "
481            + this.blockBuffer.position() + " (without header)." + " path=" + reader.getPath());
482      }
483    }
484
485    /**
486     * Read mvcc. Does checks to see if we even need to read the mvcc at all.
487     */
488    protected void readMvccVersion(final int offsetFromPos) {
489      // See if we even need to decode mvcc.
490      if (!this.reader.getHFileInfo().shouldIncludeMemStoreTS()) {
491        return;
492      }
493      if (!this.reader.getHFileInfo().isDecodeMemstoreTS()) {
494        currMemstoreTS = 0;
495        currMemstoreTSLen = 1;
496        return;
497      }
498      _readMvccVersion(offsetFromPos);
499    }
500
    /**
     * Actually do the mvcc read. Does no checks. Decodes a variable-length integer starting at
     * {@code offsetFromPos} and stores the value in {@code currMemstoreTS} and the number of
     * bytes it occupies in {@code currMemstoreTSLen}.
     * @param offsetFromPos offset of the first vint byte relative to the current buffer position
     */
    private void _readMvccVersion(int offsetFromPos) {
      // This is Bytes#bytesToVint inlined so can save a few instructions in this hot method; i.e.
      // previous if one-byte vint, we'd redo the vint call to find int size.
      // Also the method is kept small so can be inlined.
      byte firstByte = blockBuffer.getByteAfterPosition(offsetFromPos);
      int len = WritableUtils.decodeVIntSize(firstByte);
      if (len == 1) {
        // Single-byte vint: the first byte is the value itself.
        this.currMemstoreTS = firstByte;
      } else {
        // Multi-byte vint: the payload follows the first byte. It is consumed as an int, then a
        // short, then single bytes, each shifted in below what has been read so far.
        int remaining = len - 1;
        long i = 0;
        offsetFromPos++;
        if (remaining >= Bytes.SIZEOF_INT) {
          // The int read has to be converted to an unsigned long, hence the & op
          i = (blockBuffer.getIntAfterPosition(offsetFromPos) & 0x00000000ffffffffL);
          remaining -= Bytes.SIZEOF_INT;
          offsetFromPos += Bytes.SIZEOF_INT;
        }
        if (remaining >= Bytes.SIZEOF_SHORT) {
          short s = blockBuffer.getShortAfterPosition(offsetFromPos);
          i = i << 16;
          // Mask so the short is treated as unsigned when merged in.
          i = i | (s & 0xFFFF);
          remaining -= Bytes.SIZEOF_SHORT;
          offsetFromPos += Bytes.SIZEOF_SHORT;
        }
        for (int idx = 0; idx < remaining; idx++) {
          byte b = blockBuffer.getByteAfterPosition(offsetFromPos + idx);
          i = i << 8;
          // Mask so the byte is treated as unsigned when merged in.
          i = i | (b & 0xFF);
        }
        // A negative vint is stored as the bitwise complement of its value.
        currMemstoreTS = (WritableUtils.isNegativeVInt(firstByte) ? ~i : i);
      }
      this.currMemstoreTSLen = len;
    }
538
    /**
     * Within a loaded block, seek looking for the last key that is smaller than (or equal to?) the
     * key we are interested in. A note on the seekBefore: if you have seekBefore = true, AND the
     * first key in the block = key, then you'll get thrown exceptions. The caller has to check for
     * that case and load the previous block as appropriate.
     * <p>
     * The scan walks the block cell by cell from the current buffer position, comparing the given
     * key with each cell's key (mvcc ignored), and leaves the buffer positioned on the cell it
     * settles on with the scanner's length fields describing that cell.
     * @param key        the key to find
     * @param seekBefore find the key before the given key in case of exact match.
     * @return 0 in case of an exact key match, 1 in case of an inexact match,
     *         {@code HConstants.INDEX_KEY_MAGIC} (documented as -2) in case of an inexact match
     *         and furthermore, the input key less than the first key of current block (e.g. using
     *         a faked index key)
     * @throws IllegalStateException if a key, value or tags length fails its check, or if
     *                               seekBefore is set and the very first cell examined matches
     */
    protected int blockSeek(Cell key, boolean seekBefore) {
      int klen, vlen, tlen = 0;
      // Serialized size of the previously examined cell; -1 while no cell has been passed yet.
      int lastKeyValueSize = -1;
      int offsetFromPos;
      do {
        offsetFromPos = 0;
        // Better to ensure that we use the BB Utils here
        // Key length and value length are read together as one long: key length in the high
        // half, value length in the low half.
        long ll = blockBuffer.getLongAfterPosition(offsetFromPos);
        klen = (int) (ll >> Integer.SIZE);
        vlen = (int) (Bytes.MASK_FOR_LOWER_INT_IN_LONG ^ ll);
        if (checkKeyLen(klen) || checkLen(vlen)) {
          throw new IllegalStateException(
            "Invalid klen " + klen + " or vlen " + vlen + ". Block offset: " + curBlock.getOffset()
              + ", block length: " + blockBuffer.limit() + ", position: " + blockBuffer.position()
              + " (without header)." + " path=" + reader.getPath());
        }
        offsetFromPos += Bytes.SIZEOF_LONG;
        this.rowLen = blockBuffer.getShortAfterPosition(offsetFromPos);
        // Point the reusable key-only cell at this cell's key bytes, without copying them.
        blockBuffer.asSubByteBuffer(blockBuffer.position() + offsetFromPos, klen, pair);
        bufBackedKeyOnlyKv.setKey(pair.getFirst(), pair.getSecond(), klen, rowLen);
        // comp compares the searched key (left) against the cell under examination (right).
        int comp =
          PrivateCellUtil.compareKeyIgnoresMvcc(reader.getComparator(), key, bufBackedKeyOnlyKv);
        offsetFromPos += klen + vlen;
        if (this.reader.getFileContext().isIncludesTags()) {
          // Read short as unsigned, high byte first
          tlen = ((blockBuffer.getByteAfterPosition(offsetFromPos) & 0xff) << 8)
            ^ (blockBuffer.getByteAfterPosition(offsetFromPos + 1) & 0xff);
          if (checkLen(tlen)) {
            throw new IllegalStateException("Invalid tlen " + tlen + ". Block offset: "
              + curBlock.getOffset() + ", block length: " + blockBuffer.limit() + ", position: "
              + blockBuffer.position() + " (without header)." + " path=" + reader.getPath());
          }
          // add the two bytes read for the tags.
          offsetFromPos += tlen + (Bytes.SIZEOF_SHORT);
        }
        if (this.reader.getHFileInfo().shouldIncludeMemStoreTS()) {
          // Directly read the mvcc based on current position
          // This must run before lastKeyValueSize is computed below, which uses the mvcc length.
          readMvccVersion(offsetFromPos);
        }
        if (comp == 0) {
          if (seekBefore) {
            if (lastKeyValueSize < 0) {
              // Exact match on the first cell examined: there is nothing before it to return.
              throw new IllegalStateException("blockSeek with seekBefore "
                + "at the first key of the block: key=" + CellUtil.getCellKeyAsString(key)
                + ", blockOffset=" + curBlock.getOffset() + ", onDiskSize="
                + curBlock.getOnDiskSizeWithHeader() + ", path=" + reader.getPath());
            }
            // Step back onto the previous cell and reload the scanner's fields from it.
            blockBuffer.moveBack(lastKeyValueSize);
            readKeyValueLen();
            return 1; // non exact match.
          }
          // The buffer already sits on the matching cell; just publish its lengths.
          currKeyLen = klen;
          currValueLen = vlen;
          currTagsLen = tlen;
          return 0; // indicate exact match
        } else if (comp < 0) {
          // Went past the searched key: settle on the previous cell if one was passed, otherwise
          // stay on this one.
          if (lastKeyValueSize > 0) {
            blockBuffer.moveBack(lastKeyValueSize);
          }
          readKeyValueLen();
          if (lastKeyValueSize == -1 && blockBuffer.position() == 0) {
            // No cell was passed and we are at the start of the buffer.
            return HConstants.INDEX_KEY_MAGIC;
          }
          return 1;
        }
        // The size of this key/value tuple, including key/value length fields.
        lastKeyValueSize = klen + vlen + currMemstoreTSLen + KEY_VALUE_LEN_SIZE;
        // include tag length also if tags included with KV
        if (reader.getFileContext().isIncludesTags()) {
          lastKeyValueSize += tlen + Bytes.SIZEOF_SHORT;
        }
        blockBuffer.skip(lastKeyValueSize);
      } while (blockBuffer.hasRemaining());

      // Seek to the last key we successfully read. This will happen if this is
      // the last key/value pair in the file, in which case the following call
      // to next() has to return false.
      blockBuffer.moveBack(lastKeyValueSize);
      readKeyValueLen();
      return 1; // didn't exactly find it.
    }
630
631    @Override
632    public ExtendedCell getNextIndexedKey() {
633      return nextIndexedKey;
634    }
635
636    @Override
637    public int seekTo(ExtendedCell key) throws IOException {
638      return seekTo(key, true);
639    }
640
641    @Override
642    public int reseekTo(ExtendedCell key) throws IOException {
643      int compared;
644      if (isSeeked()) {
645        compared = compareKey(reader.getComparator(), key);
646        if (compared < 1) {
647          // If the required key is less than or equal to current key, then
648          // don't do anything.
649          return compared;
650        } else {
651          // The comparison with no_next_index_key has to be checked
652          if (
653            this.nextIndexedKey != null && (this.nextIndexedKey
654                == KeyValueScanner.NO_NEXT_INDEXED_KEY
655              || PrivateCellUtil.compareKeyIgnoresMvcc(reader.getComparator(), key, nextIndexedKey)
656                  < 0)
657          ) {
658            // The reader shall continue to scan the current data block instead
659            // of querying the
660            // block index as long as it knows the target key is strictly
661            // smaller than
662            // the next indexed key or the current data block is the last data
663            // block.
664            return loadBlockAndSeekToKey(this.curBlock, nextIndexedKey, false, key, false);
665          }
666        }
667      }
668      // Don't rewind on a reseek operation, because reseek implies that we are
669      // always going forward in the file.
670      return seekTo(key, false);
671    }
672
673    /**
674     * An internal API function. Seek to the given key, optionally rewinding to the first key of the
675     * block before doing the seek.
676     * @param key    - a cell representing the key that we need to fetch
677     * @param rewind whether to rewind to the first key of the block before doing the seek. If this
678     *               is false, we are assuming we never go back, otherwise the result is undefined.
679     * @return -1 if the key is earlier than the first key of the file, 0 if we are at the given
680     *         key, 1 if we are past the given key -2 if the key is earlier than the first key of
681     *         the file while using a faked index key
682     */
683    public int seekTo(ExtendedCell key, boolean rewind) throws IOException {
684      HFileBlockIndex.BlockIndexReader indexReader = reader.getDataBlockIndexReader();
685      BlockWithScanInfo blockWithScanInfo = indexReader.loadDataBlockWithScanInfo(key, curBlock,
686        cacheBlocks, pread, isCompaction, getEffectiveDataBlockEncoding(), reader);
687      if (blockWithScanInfo == null || blockWithScanInfo.getHFileBlock() == null) {
688        // This happens if the key e.g. falls before the beginning of the file.
689        return -1;
690      }
691      return loadBlockAndSeekToKey(blockWithScanInfo.getHFileBlock(),
692        blockWithScanInfo.getNextIndexedKey(), rewind, key, false);
693    }
694
695    @Override
696    public boolean seekBefore(ExtendedCell key) throws IOException {
697      HFileBlock seekToBlock = reader.getDataBlockIndexReader().seekToDataBlock(key, curBlock,
698        cacheBlocks, pread, isCompaction, reader.getEffectiveEncodingInCache(isCompaction), reader);
699      if (seekToBlock == null) {
700        return false;
701      }
702      ExtendedCell firstKey = getFirstKeyCellInBlock(seekToBlock);
703      if (PrivateCellUtil.compareKeyIgnoresMvcc(reader.getComparator(), firstKey, key) >= 0) {
704        long previousBlockOffset = seekToBlock.getPrevBlockOffset();
705        // The key we are interested in
706        if (previousBlockOffset == -1) {
707          // we have a 'problem', the key we want is the first of the file.
708          releaseIfNotCurBlock(seekToBlock);
709          return false;
710        }
711
712        // The first key in the current block 'seekToBlock' is greater than the given
713        // seekBefore key. We will go ahead by reading the next block that satisfies the
714        // given key. Return the current block before reading the next one.
715        releaseIfNotCurBlock(seekToBlock);
716        // It is important that we compute and pass onDiskSize to the block
717        // reader so that it does not have to read the header separately to
718        // figure out the size. Currently, we do not have a way to do this
719        // correctly in the general case however.
720        // TODO: See https://issues.apache.org/jira/browse/HBASE-14576
721        int prevBlockSize = -1;
722        seekToBlock = reader.readBlock(previousBlockOffset, prevBlockSize, cacheBlocks, pread,
723          isCompaction, true, BlockType.DATA, getEffectiveDataBlockEncoding());
724        // TODO shortcut: seek forward in this block to the last key of the
725        // block.
726      }
727      loadBlockAndSeekToKey(seekToBlock, firstKey, true, key, true);
728      return true;
729    }
730
731    /**
732     * The curBlock will be released by shipping or close method, so only need to consider releasing
733     * the block, which was read from HFile before and not referenced by curBlock.
734     */
735    protected void releaseIfNotCurBlock(HFileBlock block) {
736      if (curBlock != block) {
737        block.release();
738      }
739    }
740
741    /**
742     * Scans blocks in the "scanned" section of the {@link HFile} until the next data block is
743     * found.
744     * @return the next block, or null if there are no more data blocks
745     */
746    @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NP_NULL_ON_SOME_PATH",
747        justification = "Yeah, unnecessary null check; could do w/ clean up")
748    protected HFileBlock readNextDataBlock() throws IOException {
749      long lastDataBlockOffset = reader.getTrailer().getLastDataBlockOffset();
750      if (curBlock == null) {
751        return null;
752      }
753      HFileBlock block = this.curBlock;
754      do {
755        if (block.getOffset() >= lastDataBlockOffset) {
756          releaseIfNotCurBlock(block);
757          return null;
758        }
759        if (block.getOffset() < 0) {
760          releaseIfNotCurBlock(block);
761          throw new IOException("Invalid block offset: " + block + ", path=" + reader.getPath());
762        }
763        // We are reading the next block without block type validation, because
764        // it might turn out to be a non-data block.
765        block = reader.readBlock(block.getOffset() + block.getOnDiskSizeWithHeader(),
766          block.getNextBlockOnDiskSize(), cacheBlocks, pread, isCompaction, true, null,
767          getEffectiveDataBlockEncoding());
768        if (block != null && !block.getBlockType().isData()) {
769          // Whatever block we read we will be returning it unless
770          // it is a datablock. Just in case the blocks are non data blocks
771          block.release();
772        }
773      } while (!block.getBlockType().isData());
774      return block;
775    }
776
777    public DataBlockEncoding getEffectiveDataBlockEncoding() {
778      return this.reader.getEffectiveEncodingInCache(isCompaction);
779    }
780
781    @Override
782    public ExtendedCell getCell() {
783      if (!isSeeked()) {
784        return null;
785      }
786
787      ExtendedCell ret;
788      int cellBufSize = getKVBufSize();
789      long seqId = 0L;
790      if (this.reader.getHFileInfo().shouldIncludeMemStoreTS()) {
791        seqId = currMemstoreTS;
792      }
793      if (blockBuffer.hasArray()) {
794        // TODO : reduce the varieties of KV here. Check if based on a boolean
795        // we can handle the 'no tags' case.
796        if (currTagsLen > 0) {
797          ret = new SizeCachedKeyValue(blockBuffer.array(),
798            blockBuffer.arrayOffset() + blockBuffer.position(), cellBufSize, seqId, currKeyLen,
799            rowLen);
800        } else {
801          ret = new SizeCachedNoTagsKeyValue(blockBuffer.array(),
802            blockBuffer.arrayOffset() + blockBuffer.position(), cellBufSize, seqId, currKeyLen,
803            rowLen);
804        }
805      } else {
806        ByteBuffer buf = blockBuffer.asSubByteBuffer(cellBufSize);
807        if (buf.isDirect()) {
808          ret = currTagsLen > 0
809            ? new SizeCachedByteBufferKeyValue(buf, buf.position(), cellBufSize, seqId, currKeyLen,
810              rowLen)
811            : new SizeCachedNoTagsByteBufferKeyValue(buf, buf.position(), cellBufSize, seqId,
812              currKeyLen, rowLen);
813        } else {
814          if (currTagsLen > 0) {
815            ret = new SizeCachedKeyValue(buf.array(), buf.arrayOffset() + buf.position(),
816              cellBufSize, seqId, currKeyLen, rowLen);
817          } else {
818            ret = new SizeCachedNoTagsKeyValue(buf.array(), buf.arrayOffset() + buf.position(),
819              cellBufSize, seqId, currKeyLen, rowLen);
820          }
821        }
822      }
823      return ret;
824    }
825
826    @Override
827    public ExtendedCell getKey() {
828      assertSeeked();
829      // Create a new object so that this getKey is cached as firstKey, lastKey
830      ObjectIntPair<ByteBuffer> keyPair = new ObjectIntPair<>();
831      blockBuffer.asSubByteBuffer(blockBuffer.position() + KEY_VALUE_LEN_SIZE, currKeyLen, keyPair);
832      ByteBuffer keyBuf = keyPair.getFirst();
833      if (keyBuf.hasArray()) {
834        return new KeyValue.KeyOnlyKeyValue(keyBuf.array(),
835          keyBuf.arrayOffset() + keyPair.getSecond(), currKeyLen);
836      } else {
837        // Better to do a copy here instead of holding on to this BB so that
838        // we could release the blocks referring to this key. This key is specifically used
839        // in HalfStoreFileReader to get the firstkey and lastkey by creating a new scanner
840        // every time. So holding onto the BB (incase of DBB) is not advised here.
841        byte[] key = new byte[currKeyLen];
842        ByteBufferUtils.copyFromBufferToArray(key, keyBuf, keyPair.getSecond(), 0, currKeyLen);
843        return new KeyValue.KeyOnlyKeyValue(key, 0, currKeyLen);
844      }
845    }
846
847    @Override
848    public ByteBuffer getValue() {
849      assertSeeked();
850      // Okie to create new Pair. Not used in hot path
851      ObjectIntPair<ByteBuffer> valuePair = new ObjectIntPair<>();
852      this.blockBuffer.asSubByteBuffer(blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen,
853        currValueLen, valuePair);
854      ByteBuffer valBuf = valuePair.getFirst().duplicate();
855      valBuf.position(valuePair.getSecond());
856      valBuf.limit(currValueLen + valuePair.getSecond());
857      return valBuf.slice();
858    }
859
860    protected void setNonSeekedState() {
861      reset();
862      blockBuffer = null;
863      currKeyLen = 0;
864      currValueLen = 0;
865      currMemstoreTS = 0;
866      currMemstoreTSLen = 0;
867      currTagsLen = 0;
868    }
869
870    /**
871     * Set the position on current backing blockBuffer.
872     */
873    private void positionThisBlockBuffer() {
874      try {
875        blockBuffer.skip(getCurCellSerializedSize());
876      } catch (IllegalArgumentException e) {
877        LOG.error("Current pos = " + blockBuffer.position() + "; currKeyLen = " + currKeyLen
878          + "; currValLen = " + currValueLen + "; block limit = " + blockBuffer.limit()
879          + "; currBlock currBlockOffset = " + this.curBlock.getOffset() + "; path="
880          + reader.getPath());
881        throw e;
882      }
883    }
884
885    /**
886     * Set our selves up for the next 'next' invocation, set up next block.
887     * @return True is more to read else false if at the end.
888     */
889    private boolean positionForNextBlock() throws IOException {
890      // Methods are small so they get inlined because they are 'hot'.
891      long lastDataBlockOffset = reader.getTrailer().getLastDataBlockOffset();
892      if (this.curBlock.getOffset() >= lastDataBlockOffset) {
893        setNonSeekedState();
894        return false;
895      }
896      return isNextBlock();
897    }
898
899    private boolean isNextBlock() throws IOException {
900      // Methods are small so they get inlined because they are 'hot'.
901      HFileBlock nextBlock = readNextDataBlock();
902      if (nextBlock == null) {
903        setNonSeekedState();
904        return false;
905      }
906      updateCurrentBlock(nextBlock);
907      return true;
908    }
909
910    private final boolean _next() throws IOException {
911      // Small method so can be inlined. It is a hot one.
912      if (blockBuffer.remaining() <= 0) {
913        return positionForNextBlock();
914      }
915
916      // We are still in the same block.
917      readKeyValueLen();
918      return true;
919    }
920
921    /**
922     * Go to the next key/value in the block section. Loads the next block if necessary. If
923     * successful, {@link #getKey()} and {@link #getValue()} can be called.
924     * @return true if successfully navigated to the next key/value
925     */
926    @Override
927    public boolean next() throws IOException {
928      // This is a hot method so extreme measures taken to ensure it is small and inlineable.
929      // Checked by setting: -XX:+UnlockDiagnosticVMOptions -XX:+PrintInlining -XX:+PrintCompilation
930      assertSeeked();
931      positionThisBlockBuffer();
932      return _next();
933    }
934
935    /**
936     * Positions this scanner at the start of the file.
937     * @return false if empty file; i.e. a call to next would return false and the current key and
938     *         value are undefined.
939     */
940    @Override
941    public boolean seekTo() throws IOException {
942      if (reader == null) {
943        return false;
944      }
945
946      if (reader.getTrailer().getEntryCount() == 0) {
947        // No data blocks.
948        return false;
949      }
950
951      long firstDataBlockOffset = reader.getTrailer().getFirstDataBlockOffset();
952      if (curBlock != null && curBlock.getOffset() == firstDataBlockOffset) {
953        return processFirstDataBlock();
954      }
955
956      readAndUpdateNewBlock(firstDataBlockOffset);
957      return true;
958    }
959
960    protected boolean processFirstDataBlock() throws IOException {
961      blockBuffer.rewind();
962      readKeyValueLen();
963      return true;
964    }
965
966    protected void readAndUpdateNewBlock(long firstDataBlockOffset) throws IOException {
967      HFileBlock newBlock = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
968        isCompaction, true, BlockType.DATA, getEffectiveDataBlockEncoding());
969      if (newBlock.getOffset() < 0) {
970        releaseIfNotCurBlock(newBlock);
971        throw new IOException(
972          "Invalid offset=" + newBlock.getOffset() + ", path=" + reader.getPath());
973      }
974      updateCurrentBlock(newBlock);
975    }
976
977    protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, ExtendedCell nextIndexedKey,
978      boolean rewind, ExtendedCell key, boolean seekBefore) throws IOException {
979      if (this.curBlock == null || this.curBlock.getOffset() != seekToBlock.getOffset()) {
980        updateCurrentBlock(seekToBlock);
981      } else if (rewind) {
982        blockBuffer.rewind();
983      }
984      // Update the nextIndexedKey
985      this.nextIndexedKey = nextIndexedKey;
986      return blockSeek(key, seekBefore);
987    }
988
989    /** Returns True if v &lt;= 0 or v &gt; current block buffer limit. */
990    protected final boolean checkKeyLen(final int v) {
991      return v <= 0 || v > this.blockBuffer.limit();
992    }
993
994    /** Returns True if v &lt; 0 or v &gt; current block buffer limit. */
995    protected final boolean checkLen(final int v) {
996      return v < 0 || v > this.blockBuffer.limit();
997    }
998
999    /**
1000     * Check key and value lengths are wholesome.
1001     */
1002    protected final void checkKeyValueLen() {
1003      if (checkKeyLen(this.currKeyLen) || checkLen(this.currValueLen)) {
1004        throw new IllegalStateException("Invalid currKeyLen " + this.currKeyLen
1005          + " or currValueLen " + this.currValueLen + ". Block offset: " + this.curBlock.getOffset()
1006          + ", block length: " + this.blockBuffer.limit() + ", position: "
1007          + this.blockBuffer.position() + " (without header)." + ", path=" + reader.getPath());
1008      }
1009    }
1010
1011    /**
1012     * Updates the current block to be the given {@link HFileBlock}. Seeks to the the first
1013     * key/value pair.
1014     * @param newBlock the block read by {@link HFileReaderImpl#readBlock}, it's a totally new block
1015     *                 with new allocated {@link ByteBuff}, so if no further reference to this
1016     *                 block, we should release it carefully.
1017     */
1018    protected void updateCurrentBlock(HFileBlock newBlock) throws IOException {
1019      try {
1020        if (newBlock.getBlockType() != BlockType.DATA) {
1021          throw new IllegalStateException(
1022            "ScannerV2 works only on data blocks, got " + newBlock.getBlockType() + "; "
1023              + "HFileName=" + reader.getPath() + ", " + "dataBlockEncoder="
1024              + reader.getDataBlockEncoding() + ", " + "isCompaction=" + isCompaction);
1025        }
1026        updateCurrBlockRef(newBlock);
1027        blockBuffer = newBlock.getBufferWithoutHeader();
1028        readKeyValueLen();
1029      } finally {
1030        releaseIfNotCurBlock(newBlock);
1031      }
1032      // Reset the next indexed key
1033      this.nextIndexedKey = null;
1034    }
1035
1036    protected ExtendedCell getFirstKeyCellInBlock(HFileBlock curBlock) {
1037      ByteBuff buffer = curBlock.getBufferWithoutHeader();
1038      // It is safe to manipulate this buffer because we own the buffer object.
1039      buffer.rewind();
1040      int klen = buffer.getInt();
1041      buffer.skip(Bytes.SIZEOF_INT);// Skip value len part
1042      ByteBuffer keyBuff = buffer.asSubByteBuffer(klen);
1043      if (keyBuff.hasArray()) {
1044        return new KeyValue.KeyOnlyKeyValue(keyBuff.array(),
1045          keyBuff.arrayOffset() + keyBuff.position(), klen);
1046      } else {
1047        return new ByteBufferKeyOnlyKeyValue(keyBuff, keyBuff.position(), klen);
1048      }
1049    }
1050
1051    public int compareKey(CellComparator comparator, ExtendedCell key) {
1052      blockBuffer.asSubByteBuffer(blockBuffer.position() + KEY_VALUE_LEN_SIZE, currKeyLen, pair);
1053      this.bufBackedKeyOnlyKv.setKey(pair.getFirst(), pair.getSecond(), currKeyLen, rowLen);
1054      return PrivateCellUtil.compareKeyIgnoresMvcc(comparator, key, this.bufBackedKeyOnlyKv);
1055    }
1056
1057    @Override
1058    public void shipped() throws IOException {
1059      this.returnBlocks(false);
1060    }
1061  }
1062
1063  @Override
1064  public Path getPath() {
1065    return path;
1066  }
1067
1068  @Override
1069  public DataBlockEncoding getDataBlockEncoding() {
1070    return dataBlockEncoder.getDataBlockEncoding();
1071  }
1072
1073  @Override
1074  public Configuration getConf() {
1075    return conf;
1076  }
1077
1078  @Override
1079  public void setConf(Configuration conf) {
1080    this.conf = conf;
1081  }
1082
1083  /** Minor versions in HFile starting with this number have hbase checksums */
1084  public static final int MINOR_VERSION_WITH_CHECKSUM = 1;
1085  /** In HFile minor version that does not support checksums */
1086  public static final int MINOR_VERSION_NO_CHECKSUM = 0;
1087
1088  /** HFile minor version that introduced pbuf filetrailer */
1089  public static final int PBUF_TRAILER_MINOR_VERSION = 2;
1090
1091  /**
1092   * The size of a (key length, value length) tuple that prefixes each entry in a data block.
1093   */
1094  public final static int KEY_VALUE_LEN_SIZE = 2 * Bytes.SIZEOF_INT;
1095
1096  /**
1097   * Retrieve block from cache. Validates the retrieved block's type vs {@code expectedBlockType}
1098   * and its encoding vs. {@code expectedDataBlockEncoding}. Unpacks the block as necessary.
1099   */
1100  private HFileBlock getCachedBlock(BlockCacheKey cacheKey, boolean cacheBlock, boolean useLock,
1101    boolean updateCacheMetrics, BlockType expectedBlockType,
1102    DataBlockEncoding expectedDataBlockEncoding) throws IOException {
1103    // Check cache for block. If found return.
1104    BlockCache cache = cacheConf.getBlockCache().orElse(null);
1105    if (cache != null) {
1106      HFileBlock cachedBlock = (HFileBlock) cache.getBlock(cacheKey, cacheBlock, useLock,
1107        updateCacheMetrics, expectedBlockType);
1108      if (cachedBlock != null) {
1109        if (cacheConf.shouldCacheCompressed(cachedBlock.getBlockType().getCategory())) {
1110          HFileBlock compressedBlock = cachedBlock;
1111          cachedBlock = compressedBlock.unpack(hfileContext, fsBlockReader);
1112          // In case of compressed block after unpacking we can release the compressed block
1113          if (compressedBlock != cachedBlock) {
1114            compressedBlock.release();
1115          }
1116        }
1117        try {
1118          validateBlockType(cachedBlock, expectedBlockType);
1119        } catch (IOException e) {
1120          returnAndEvictBlock(cache, cacheKey, cachedBlock);
1121          throw e;
1122        }
1123
1124        if (expectedDataBlockEncoding == null) {
1125          return cachedBlock;
1126        }
1127        DataBlockEncoding actualDataBlockEncoding = cachedBlock.getDataBlockEncoding();
1128        // Block types other than data blocks always have
1129        // DataBlockEncoding.NONE. To avoid false negative cache misses, only
1130        // perform this check if cached block is a data block.
1131        if (
1132          cachedBlock.getBlockType().isData()
1133            && !actualDataBlockEncoding.equals(expectedDataBlockEncoding)
1134        ) {
1135          // This mismatch may happen if a Scanner, which is used for say a
1136          // compaction, tries to read an encoded block from the block cache.
1137          // The reverse might happen when an EncodedScanner tries to read
1138          // un-encoded blocks which were cached earlier.
1139          //
1140          // Because returning a data block with an implicit BlockType mismatch
1141          // will cause the requesting scanner to throw a disk read should be
1142          // forced here. This will potentially cause a significant number of
1143          // cache misses, so update so we should keep track of this as it might
1144          // justify the work on a CompoundScanner.
1145          if (
1146            !expectedDataBlockEncoding.equals(DataBlockEncoding.NONE)
1147              && !actualDataBlockEncoding.equals(DataBlockEncoding.NONE)
1148          ) {
1149            // If the block is encoded but the encoding does not match the
1150            // expected encoding it is likely the encoding was changed but the
1151            // block was not yet evicted. Evictions on file close happen async
1152            // so blocks with the old encoding still linger in cache for some
1153            // period of time. This event should be rare as it only happens on
1154            // schema definition change.
1155            LOG.info(
1156              "Evicting cached block with key {} because data block encoding mismatch; "
1157                + "expected {}, actual {}, path={}",
1158              cacheKey, actualDataBlockEncoding, expectedDataBlockEncoding, path);
1159            // This is an error scenario. so here we need to release the block.
1160            returnAndEvictBlock(cache, cacheKey, cachedBlock);
1161          }
1162          return null;
1163        }
1164        return cachedBlock;
1165      }
1166    }
1167    return null;
1168  }
1169
1170  private void returnAndEvictBlock(BlockCache cache, BlockCacheKey cacheKey, Cacheable block) {
1171    block.release();
1172    cache.evictBlock(cacheKey);
1173  }
1174
1175  /**
1176   * @param cacheBlock Add block to cache, if found
1177   * @return block wrapped in a ByteBuffer, with header skipped
1178   */
1179  @Override
1180  public HFileBlock getMetaBlock(String metaBlockName, boolean cacheBlock) throws IOException {
1181    if (trailer.getMetaIndexCount() == 0) {
1182      return null; // there are no meta blocks
1183    }
1184    if (metaBlockIndexReader == null) {
1185      throw new IOException(path + " meta index not loaded");
1186    }
1187
1188    byte[] mbname = Bytes.toBytes(metaBlockName);
1189    int block = metaBlockIndexReader.rootBlockContainingKey(mbname, 0, mbname.length);
1190    if (block == -1) {
1191      return null;
1192    }
1193    long blockSize = metaBlockIndexReader.getRootBlockDataSize(block);
1194
1195    // Per meta key from any given file, synchronize reads for said block. This
1196    // is OK to do for meta blocks because the meta block index is always
1197    // single-level.
1198    synchronized (metaBlockIndexReader.getRootBlockKey(block)) {
1199      // Check cache for block. If found return.
1200      long metaBlockOffset = metaBlockIndexReader.getRootBlockOffset(block);
1201      BlockCacheKey cacheKey =
1202        new BlockCacheKey(name, metaBlockOffset, this.isPrimaryReplicaReader(), BlockType.META);
1203
1204      cacheBlock &= cacheConf.shouldCacheBlockOnRead(BlockType.META.getCategory());
1205      HFileBlock cachedBlock =
1206        getCachedBlock(cacheKey, cacheBlock, false, true, BlockType.META, null);
1207      if (cachedBlock != null) {
1208        assert cachedBlock.isUnpacked() : "Packed block leak.";
1209        // Return a distinct 'shallow copy' of the block,
1210        // so pos does not get messed by the scanner
1211        return cachedBlock;
1212      }
1213      // Cache Miss, please load.
1214
1215      HFileBlock compressedBlock =
1216        fsBlockReader.readBlockData(metaBlockOffset, blockSize, true, false, true);
1217      HFileBlock uncompressedBlock = compressedBlock.unpack(hfileContext, fsBlockReader);
1218      if (compressedBlock != uncompressedBlock) {
1219        compressedBlock.release();
1220      }
1221
1222      // Cache the block
1223      if (cacheBlock) {
1224        cacheConf.getBlockCache().ifPresent(
1225          cache -> cache.cacheBlock(cacheKey, uncompressedBlock, cacheConf.isInMemory()));
1226      }
1227      return uncompressedBlock;
1228    }
1229  }
1230
1231  /**
1232   * Whether we use heap or not depends on our intent to cache the block. We want to avoid
1233   * allocating to off-heap if we intend to cache into the on-heap L1 cache. Otherwise, it's more
1234   * efficient to allocate to off-heap since we can control GC ourselves for those. So our decision
1235   * here breaks down as follows: <br>
1236   * If block cache is disabled, don't use heap. If we're not using the CombinedBlockCache, use heap
1237   * unless caching is disabled for the request. Otherwise, only use heap if caching is enabled and
1238   * the expected block type is not DATA (which goes to off-heap L2 in combined cache).
1239   * @see org.apache.hadoop.hbase.io.hfile.HFileBlock.FSReader#readBlockData(long, long, boolean,
1240   *      boolean, boolean)
1241   */
1242  private boolean shouldUseHeap(BlockType expectedBlockType, boolean cacheBlock) {
1243    if (!cacheConf.getBlockCache().isPresent()) {
1244      return false;
1245    }
1246
1247    // we only cache a block if cacheBlock is true and caching-on-read is enabled in CacheConfig
1248    // we can really only check for that if have an expectedBlockType
1249    if (expectedBlockType != null) {
1250      cacheBlock &= cacheConf.shouldCacheBlockOnRead(expectedBlockType.getCategory());
1251    }
1252
1253    if (!cacheConf.isCombinedBlockCache()) {
1254      // Block to cache in LruBlockCache must be an heap one, if caching enabled. So just allocate
1255      // block memory from heap for saving an extra off-heap to heap copying in that case.
1256      return cacheBlock;
1257    }
1258
1259    return cacheBlock && expectedBlockType != null && !expectedBlockType.isData();
1260  }
1261
1262  @Override
1263  public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize, final boolean cacheBlock,
1264    boolean pread, final boolean isCompaction, boolean updateCacheMetrics,
1265    BlockType expectedBlockType, DataBlockEncoding expectedDataBlockEncoding) throws IOException {
1266    return readBlock(dataBlockOffset, onDiskBlockSize, cacheBlock, pread, isCompaction,
1267      updateCacheMetrics, expectedBlockType, expectedDataBlockEncoding, false);
1268  }
1269
1270  @Override
1271  public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize, final boolean cacheBlock,
1272    boolean pread, final boolean isCompaction, boolean updateCacheMetrics,
1273    BlockType expectedBlockType, DataBlockEncoding expectedDataBlockEncoding, boolean cacheOnly)
1274    throws IOException {
1275    if (dataBlockIndexReader == null) {
1276      throw new IOException(path + " block index not loaded");
1277    }
1278    long trailerOffset = trailer.getLoadOnOpenDataOffset();
1279    if (dataBlockOffset < 0 || dataBlockOffset >= trailerOffset) {
1280      throw new IOException("Requested block is out of range: " + dataBlockOffset
1281        + ", lastDataBlockOffset: " + trailer.getLastDataBlockOffset()
1282        + ", trailer.getLoadOnOpenDataOffset: " + trailerOffset + ", path=" + path);
1283    }
1284    // For any given block from any given file, synchronize reads for said
1285    // block.
1286    // Without a cache, this synchronizing is needless overhead, but really
1287    // the other choice is to duplicate work (which the cache would prevent you
1288    // from doing).
1289
1290    BlockCacheKey cacheKey =
1291      new BlockCacheKey(path, dataBlockOffset, this.isPrimaryReplicaReader(), expectedBlockType);
1292
1293    boolean useLock = false;
1294    IdLock.Entry lockEntry = null;
1295    final Span span = Span.current();
1296    // BlockCacheKey#toString() is quite expensive to call, so if tracing isn't enabled, don't
1297    // record
1298    Attributes attributes = span.isRecording()
1299      ? Attributes.of(BLOCK_CACHE_KEY_KEY, cacheKey.toString())
1300      : Attributes.empty();
1301    try {
1302      while (true) {
1303        // Check cache for block. If found return.
1304        if (cacheConf.shouldReadBlockFromCache(expectedBlockType) && !cacheOnly) {
1305          if (useLock) {
1306            lockEntry = offsetLock.getLockEntry(dataBlockOffset);
1307          }
1308          // Try and get the block from the block cache. If the useLock variable is true then this
1309          // is the second time through the loop and it should not be counted as a block cache miss.
1310          HFileBlock cachedBlock = getCachedBlock(cacheKey, cacheBlock, useLock, updateCacheMetrics,
1311            expectedBlockType, expectedDataBlockEncoding);
1312          if (cachedBlock != null) {
1313            if (LOG.isTraceEnabled()) {
1314              LOG.trace("Block for file {} is coming from Cache {}",
1315                Bytes.toString(cachedBlock.getHFileContext().getTableName()), cachedBlock);
1316            }
1317            span.addEvent("block cache hit", attributes);
1318            assert cachedBlock.isUnpacked() : "Packed block leak.";
1319            if (cachedBlock.getBlockType().isData()) {
1320              if (updateCacheMetrics) {
1321                HFile.DATABLOCK_READ_COUNT.increment();
1322              }
1323              // Validate encoding type for data blocks. We include encoding
1324              // type in the cache key, and we expect it to match on a cache hit.
1325              if (cachedBlock.getDataBlockEncoding() != dataBlockEncoder.getDataBlockEncoding()) {
1326                // Remember to release the block when in exceptional path.
1327                cacheConf.getBlockCache().ifPresent(cache -> {
1328                  returnAndEvictBlock(cache, cacheKey, cachedBlock);
1329                });
1330                throw new IOException("Cached block under key " + cacheKey + " "
1331                  + "has wrong encoding: " + cachedBlock.getDataBlockEncoding() + " (expected: "
1332                  + dataBlockEncoder.getDataBlockEncoding() + "), path=" + path);
1333              }
1334            }
1335            // Cache-hit. Return!
1336            return cachedBlock;
1337          }
1338
1339          if (!useLock && cacheBlock && cacheConf.shouldLockOnCacheMiss(expectedBlockType)) {
1340            // check cache again with lock
1341            useLock = true;
1342            continue;
1343          }
1344          // Carry on, please load.
1345        }
1346
1347        span.addEvent("block cache miss", attributes);
1348        // Load block from filesystem.
1349        HFileBlock hfileBlock = fsBlockReader.readBlockData(dataBlockOffset, onDiskBlockSize, pread,
1350          !isCompaction, shouldUseHeap(expectedBlockType, cacheBlock));
1351        try {
1352          validateBlockType(hfileBlock, expectedBlockType);
1353        } catch (IOException e) {
1354          hfileBlock.release();
1355          throw e;
1356        }
1357        BlockType.BlockCategory category = hfileBlock.getBlockType().getCategory();
1358        final boolean cacheCompressed = cacheConf.shouldCacheCompressed(category);
1359        final boolean cacheOnRead = cacheConf.shouldCacheBlockOnRead(category);
1360
1361        // Don't need the unpacked block back and we're storing the block in the cache compressed
1362        if (cacheOnly && cacheCompressed && cacheOnRead) {
1363          HFileBlock blockNoChecksum = BlockCacheUtil.getBlockForCaching(cacheConf, hfileBlock);
1364          cacheConf.getBlockCache().ifPresent(cache -> {
1365            LOG.debug("Skipping decompression of block {} in prefetch", cacheKey);
1366            // Cache the block if necessary
1367            if (cacheBlock && cacheConf.shouldCacheBlockOnRead(category)) {
1368              cache.cacheBlock(cacheKey, blockNoChecksum, cacheConf.isInMemory(), cacheOnly);
1369            }
1370          });
1371
1372          if (updateCacheMetrics && hfileBlock.getBlockType().isData()) {
1373            HFile.DATABLOCK_READ_COUNT.increment();
1374          }
1375          return blockNoChecksum;
1376        }
1377        HFileBlock unpacked = hfileBlock.unpack(hfileContext, fsBlockReader);
1378        HFileBlock unpackedNoChecksum = BlockCacheUtil.getBlockForCaching(cacheConf, unpacked);
1379        // Cache the block if necessary
1380        cacheConf.getBlockCache().ifPresent(cache -> {
1381          if (cacheBlock && cacheConf.shouldCacheBlockOnRead(category)) {
1382            // Using the wait on cache during compaction and prefetching.
1383            cache.cacheBlock(cacheKey,
1384              cacheCompressed
1385                ? BlockCacheUtil.getBlockForCaching(cacheConf, hfileBlock)
1386                : unpackedNoChecksum,
1387              cacheConf.isInMemory(), cacheOnly);
1388          }
1389        });
1390        if (unpacked != hfileBlock) {
1391          // End of life here if hfileBlock is an independent block.
1392          hfileBlock.release();
1393        }
1394        if (updateCacheMetrics && hfileBlock.getBlockType().isData()) {
1395          HFile.DATABLOCK_READ_COUNT.increment();
1396        }
1397
1398        return unpackedNoChecksum;
1399      }
1400    } finally {
1401      if (lockEntry != null) {
1402        offsetLock.releaseLockEntry(lockEntry);
1403      }
1404    }
1405  }
1406
1407  @Override
1408  public boolean hasMVCCInfo() {
1409    return fileInfo.shouldIncludeMemStoreTS() && fileInfo.isDecodeMemstoreTS();
1410  }
1411
1412  /**
1413   * Compares the actual type of a block retrieved from cache or disk with its expected type and
1414   * throws an exception in case of a mismatch. Expected block type of {@link BlockType#DATA} is
1415   * considered to match the actual block type [@link {@link BlockType#ENCODED_DATA} as well.
1416   * @param block             a block retrieved from cache or disk
1417   * @param expectedBlockType the expected block type, or null to skip the check
1418   */
1419  private void validateBlockType(HFileBlock block, BlockType expectedBlockType) throws IOException {
1420    if (expectedBlockType == null) {
1421      return;
1422    }
1423    BlockType actualBlockType = block.getBlockType();
1424    if (expectedBlockType.isData() && actualBlockType.isData()) {
1425      // We consider DATA to match ENCODED_DATA for the purpose of this
1426      // verification.
1427      return;
1428    }
1429    if (actualBlockType != expectedBlockType) {
1430      throw new IOException("Expected block type " + expectedBlockType + ", " + "but got "
1431        + actualBlockType + ": " + block + ", path=" + path);
1432    }
1433  }
1434
1435  /**
1436   * @return Last key as cell in the file. May be null if file has no entries. Note that this is not
1437   *         the last row key, but it is the Cell representation of the last key
1438   */
1439  @Override
1440  public Optional<ExtendedCell> getLastKey() {
1441    return dataBlockIndexReader.isEmpty()
1442      ? Optional.empty()
1443      : Optional.of(fileInfo.getLastKeyCell());
1444  }
1445
1446  /**
1447   * @return Midkey for this file. We work with block boundaries only so returned midkey is an
1448   *         approximation only.
1449   */
1450  @Override
1451  public Optional<ExtendedCell> midKey() throws IOException {
1452    return Optional.ofNullable(dataBlockIndexReader.midkey(this));
1453  }
1454
1455  @Override
1456  public void close() throws IOException {
1457    close(cacheConf.shouldEvictOnClose());
1458  }
1459
1460  @Override
1461  public DataBlockEncoding getEffectiveEncodingInCache(boolean isCompaction) {
1462    return dataBlockEncoder.getEffectiveEncodingInCache(isCompaction);
1463  }
1464
1465  /** For testing */
1466  @Override
1467  public HFileBlock.FSReader getUncachedBlockReader() {
1468    return fsBlockReader;
1469  }
1470
1471  /**
1472   * Scanner that operates on encoded data blocks.
1473   */
1474  protected static class EncodedScanner extends HFileScannerImpl {
1475    private final HFileBlockDecodingContext decodingCtx;
1476    private final DataBlockEncoder.EncodedSeeker seeker;
1477    private final DataBlockEncoder dataBlockEncoder;
1478
1479    public EncodedScanner(HFile.Reader reader, boolean cacheBlocks, boolean pread,
1480      boolean isCompaction, HFileContext meta, Configuration conf) {
1481      super(reader, cacheBlocks, pread, isCompaction);
1482      DataBlockEncoding encoding = reader.getDataBlockEncoding();
1483      dataBlockEncoder = encoding.getEncoder();
1484      decodingCtx = dataBlockEncoder.newDataBlockDecodingContext(conf, meta);
1485      seeker = dataBlockEncoder.createSeeker(decodingCtx);
1486    }
1487
1488    @Override
1489    public boolean isSeeked() {
1490      return curBlock != null;
1491    }
1492
1493    @Override
1494    public void setNonSeekedState() {
1495      reset();
1496    }
1497
1498    /**
1499     * Updates the current block to be the given {@link HFileBlock}. Seeks to the the first
1500     * key/value pair.
1501     * @param newBlock the block to make current, and read by {@link HFileReaderImpl#readBlock},
1502     *                 it's a totally new block with new allocated {@link ByteBuff}, so if no
1503     *                 further reference to this block, we should release it carefully.
1504     */
1505    @Override
1506    protected void updateCurrentBlock(HFileBlock newBlock) throws CorruptHFileException {
1507      try {
1508        // sanity checks
1509        if (newBlock.getBlockType() != BlockType.ENCODED_DATA) {
1510          throw new IllegalStateException("EncodedScanner works only on encoded data blocks");
1511        }
1512        short dataBlockEncoderId = newBlock.getDataBlockEncodingId();
1513        if (!DataBlockEncoding.isCorrectEncoder(dataBlockEncoder, dataBlockEncoderId)) {
1514          String encoderCls = dataBlockEncoder.getClass().getName();
1515          throw new CorruptHFileException(
1516            "Encoder " + encoderCls + " doesn't support data block encoding "
1517              + DataBlockEncoding.getNameFromId(dataBlockEncoderId) + ",path=" + reader.getPath());
1518        }
1519        updateCurrBlockRef(newBlock);
1520        ByteBuff encodedBuffer = getEncodedBuffer(newBlock);
1521        seeker.setCurrentBuffer(encodedBuffer);
1522      } finally {
1523        releaseIfNotCurBlock(newBlock);
1524      }
1525      // Reset the next indexed key
1526      this.nextIndexedKey = null;
1527    }
1528
1529    private ByteBuff getEncodedBuffer(HFileBlock newBlock) {
1530      ByteBuff origBlock = newBlock.getBufferReadOnly();
1531      int pos = newBlock.headerSize() + DataBlockEncoding.ID_SIZE;
1532      origBlock.position(pos);
1533      origBlock
1534        .limit(pos + newBlock.getUncompressedSizeWithoutHeader() - DataBlockEncoding.ID_SIZE);
1535      return origBlock.slice();
1536    }
1537
1538    @Override
1539    protected boolean processFirstDataBlock() throws IOException {
1540      seeker.rewind();
1541      return true;
1542    }
1543
1544    @Override
1545    public boolean next() throws IOException {
1546      boolean isValid = seeker.next();
1547      if (!isValid) {
1548        HFileBlock newBlock = readNextDataBlock();
1549        isValid = newBlock != null;
1550        if (isValid) {
1551          updateCurrentBlock(newBlock);
1552        } else {
1553          setNonSeekedState();
1554        }
1555      }
1556      return isValid;
1557    }
1558
1559    @Override
1560    public ExtendedCell getKey() {
1561      assertValidSeek();
1562      return seeker.getKey();
1563    }
1564
1565    @Override
1566    public ByteBuffer getValue() {
1567      assertValidSeek();
1568      return seeker.getValueShallowCopy();
1569    }
1570
1571    @Override
1572    public ExtendedCell getCell() {
1573      if (this.curBlock == null) {
1574        return null;
1575      }
1576      return seeker.getCell();
1577    }
1578
1579    private void assertValidSeek() {
1580      if (this.curBlock == null) {
1581        throw new NotSeekedException(reader.getPath());
1582      }
1583    }
1584
1585    @Override
1586    protected ExtendedCell getFirstKeyCellInBlock(HFileBlock curBlock) {
1587      return dataBlockEncoder.getFirstKeyCellInBlock(getEncodedBuffer(curBlock));
1588    }
1589
1590    @Override
1591    protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, ExtendedCell nextIndexedKey,
1592      boolean rewind, ExtendedCell key, boolean seekBefore) throws IOException {
1593      if (this.curBlock == null || this.curBlock.getOffset() != seekToBlock.getOffset()) {
1594        updateCurrentBlock(seekToBlock);
1595      } else if (rewind) {
1596        seeker.rewind();
1597      }
1598      this.nextIndexedKey = nextIndexedKey;
1599      return seeker.seekToKeyInBlock(key, seekBefore);
1600    }
1601
1602    @Override
1603    public int compareKey(CellComparator comparator, ExtendedCell key) {
1604      return seeker.compareKey(comparator, key);
1605    }
1606  }
1607
1608  /**
1609   * Returns a buffer with the Bloom filter metadata. The caller takes ownership of the buffer.
1610   */
1611  @Override
1612  public DataInput getGeneralBloomFilterMetadata() throws IOException {
1613    return this.getBloomFilterMetadata(BlockType.GENERAL_BLOOM_META);
1614  }
1615
1616  @Override
1617  public DataInput getDeleteBloomFilterMetadata() throws IOException {
1618    return this.getBloomFilterMetadata(BlockType.DELETE_FAMILY_BLOOM_META);
1619  }
1620
1621  private DataInput getBloomFilterMetadata(BlockType blockType) throws IOException {
1622    if (
1623      blockType != BlockType.GENERAL_BLOOM_META && blockType != BlockType.DELETE_FAMILY_BLOOM_META
1624    ) {
1625      throw new RuntimeException(
1626        "Block Type: " + blockType.toString() + " is not supported, path=" + path);
1627    }
1628
1629    for (HFileBlock b : fileInfo.getLoadOnOpenBlocks()) {
1630      if (b.getBlockType() == blockType) {
1631        return b.getByteStream();
1632      }
1633    }
1634    return null;
1635  }
1636
1637  public boolean isFileInfoLoaded() {
1638    return true; // We load file info in constructor in version 2.
1639  }
1640
1641  @Override
1642  public HFileContext getFileContext() {
1643    return hfileContext;
1644  }
1645
1646  /**
1647   * Returns false if block prefetching was requested for this file and has not completed, true
1648   * otherwise
1649   */
1650  @Override
1651  public boolean prefetchComplete() {
1652    return PrefetchExecutor.isCompleted(path);
1653  }
1654
1655  /**
1656   * Returns true if block prefetching was started after waiting for specified delay, false
1657   * otherwise
1658   */
1659  @Override
1660  public boolean prefetchStarted() {
1661    return PrefetchExecutor.isPrefetchStarted();
1662  }
1663
1664  /**
1665   * Create a Scanner on this file. No seeks or reads are done on creation. Call
1666   * {@link HFileScanner#seekTo(ExtendedCell)} to position an start the read. There is nothing to
1667   * clean up in a Scanner. Letting go of your references to the scanner is sufficient. NOTE: Do not
1668   * use this overload of getScanner for compactions. See
1669   * {@link #getScanner(Configuration, boolean, boolean, boolean)}
1670   * @param conf        Store configuration.
1671   * @param cacheBlocks True if we should cache blocks read in by this scanner.
1672   * @param pread       Use positional read rather than seek+read if true (pread is better for
1673   *                    random reads, seek+read is better scanning).
1674   * @return Scanner on this file.
1675   */
1676  @Override
1677  public HFileScanner getScanner(Configuration conf, boolean cacheBlocks, final boolean pread) {
1678    return getScanner(conf, cacheBlocks, pread, false);
1679  }
1680
1681  /**
1682   * Create a Scanner on this file. No seeks or reads are done on creation. Call
1683   * {@link HFileScanner#seekTo(ExtendedCell)} to position an start the read. There is nothing to
1684   * clean up in a Scanner. Letting go of your references to the scanner is sufficient.
1685   * @param conf         Store configuration.
1686   * @param cacheBlocks  True if we should cache blocks read in by this scanner.
1687   * @param pread        Use positional read rather than seek+read if true (pread is better for
1688   *                     random reads, seek+read is better scanning).
1689   * @param isCompaction is scanner being used for a compaction?
1690   * @return Scanner on this file.
1691   */
1692  @Override
1693  public HFileScanner getScanner(Configuration conf, boolean cacheBlocks, final boolean pread,
1694    final boolean isCompaction) {
1695    if (dataBlockEncoder.useEncodedScanner()) {
1696      return new EncodedScanner(this, cacheBlocks, pread, isCompaction, this.hfileContext, conf);
1697    }
1698    return new HFileScannerImpl(this, cacheBlocks, pread, isCompaction);
1699  }
1700
1701  public int getMajorVersion() {
1702    return 3;
1703  }
1704
1705  @Override
1706  public void unbufferStream() {
1707    fsBlockReader.unbufferStream();
1708  }
1709}