/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.io.hfile;

import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.hbase.ByteBufferKeyOnlyKeyValue;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.KeyValue.KeyOnlyKeyValue;
import org.apache.hadoop.hbase.PrivateCellUtil;
import org.apache.hadoop.hbase.io.HeapSize;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.hfile.HFile.CachingBlockReader;
import org.apache.hadoop.hbase.nio.ByteBuff;
import org.apache.hadoop.hbase.regionserver.KeyValueScanner;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.ClassSize;
import org.apache.hadoop.hbase.util.ObjectIntPair;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.util.StringUtils;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Provides functionality to write ({@link BlockIndexWriter}) and read ({@link BlockIndexReader})
 * single-level and multi-level block indexes. Examples of how to use the block index writer can
 * be found in {@link org.apache.hadoop.hbase.io.hfile.CompoundBloomFilterWriter} and
 * {@link HFileWriterImpl}. Examples of how to use the reader can be found in
 * {@link HFileReaderImpl} and org.apache.hadoop.hbase.io.hfile.TestHFileBlockIndex.
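 * <p>
 * A minimal sketch of the writer side ({@code blockWriter}, {@code cacheConf}, {@code out},
 * {@code firstKeyBytes}, {@code blockOffset} and {@code onDiskBlockSize} are hypothetical
 * placeholders; inline-block flushing between entries is elided):
 *
 * <pre>{@code
 * HFileBlockIndex.BlockIndexWriter indexWriter =
 *   new HFileBlockIndex.BlockIndexWriter(blockWriter, cacheConf, "example-hfile", null);
 * // One entry per data block: first key, file offset, on-disk size.
 * indexWriter.addEntry(firstKeyBytes, blockOffset, onDiskBlockSize);
 * // When the file is closed, write intermediate levels and the root level.
 * long rootIndexOffset = indexWriter.writeIndexBlocks(out);
 * }</pre>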
 */
@InterfaceAudience.Private
public class HFileBlockIndex {

  private static final Logger LOG = LoggerFactory.getLogger(HFileBlockIndex.class);

  static final int DEFAULT_MAX_CHUNK_SIZE = 128 * 1024;

  /**
   * The maximum size guideline for index blocks (leaf, intermediate, and root). If not specified,
   * <code>DEFAULT_MAX_CHUNK_SIZE</code> is used.
   */
  public static final String MAX_CHUNK_SIZE_KEY = "hfile.index.block.max.size";

  /**
   * Minimum number of entries in a single index block. Even if we are above the
   * hfile.index.block.max.size we will keep writing to the same block unless we have that many
   * entries. We should have at least a few entries so that we don't have too many levels in the
   * multi-level index. This should be at least 2 to make sure there is no infinite recursion.
   */
  public static final String MIN_INDEX_NUM_ENTRIES_KEY = "hfile.index.block.min.entries";

  static final int DEFAULT_MIN_INDEX_NUM_ENTRIES = 16;
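
  // A configuration sketch (values are illustrative only): both keys above are meant to be
  // picked up from the HFile writer's Hadoop Configuration, e.g.
  //
  //   Configuration conf = ...;
  //   conf.setInt(HFileBlockIndex.MAX_CHUNK_SIZE_KEY, 64 * 1024);
  //   conf.setInt(HFileBlockIndex.MIN_INDEX_NUM_ENTRIES_KEY, 4);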

  /**
   * The number of bytes stored in each "secondary index" entry in addition to key bytes in the
   * non-root index block format. The first long is the file offset of the deeper-level block the
   * entry points to, and the int that follows is that block's on-disk size without including
   * header.
   */
  static final int SECONDARY_INDEX_ENTRY_OVERHEAD = Bytes.SIZEOF_INT + Bytes.SIZEOF_LONG;

  /**
   * Error message when trying to use inline block API in single-level mode.
   */
  private static final String INLINE_BLOCKS_NOT_ALLOWED =
    "Inline blocks are not allowed in the single-level-only mode";

  /**
   * The size of a meta-data record used for finding the mid-key in a multi-level index. Consists of
   * the middle leaf-level index block offset (long), its on-disk size without header included
   * (int), and the mid-key entry's zero-based index in that leaf index block.
   */
  protected static final int MID_KEY_METADATA_SIZE = Bytes.SIZEOF_LONG + 2 * Bytes.SIZEOF_INT;

  /**
   * An implementation of the BlockIndexReader that deals with block keys which are plain byte[],
   * like the meta block index or the Bloom block index for ROW blooms. Does not need a
   * CellComparator; it can work with Bytes.BYTES_RAWCOMPARATOR.
   */
  static class ByteArrayKeyBlockIndexReader extends BlockIndexReader {

    private byte[][] blockKeys;

    public ByteArrayKeyBlockIndexReader(final int treeLevel) {
      searchTreeLevel = treeLevel;
    }

    @Override
    protected long calculateHeapSizeForBlockKeys(long heapSize) {
      // Calculating the size of blockKeys
      if (blockKeys != null) {
        heapSize += ClassSize.REFERENCE;
        // Adding array + references overhead
        heapSize += ClassSize.align(ClassSize.ARRAY + blockKeys.length * ClassSize.REFERENCE);

        // Adding bytes
        for (byte[] key : blockKeys) {
          heapSize += ClassSize.align(ClassSize.ARRAY + key.length);
        }
      }
      return heapSize;
    }

    @Override
    public boolean isEmpty() {
      return blockKeys.length == 0;
    }

    /**
     * @param i from 0 to {@link #getRootBlockCount()} - 1
     */
    public byte[] getRootBlockKey(int i) {
      return blockKeys[i];
    }

    @Override
    public BlockWithScanInfo loadDataBlockWithScanInfo(Cell key, HFileBlock currentBlock,
      boolean cacheBlocks, boolean pread, boolean isCompaction,
      DataBlockEncoding expectedDataBlockEncoding, CachingBlockReader cachingBlockReader)
      throws IOException {
      // Not needed for this implementation
      return null;
    }

    @Override
    public Cell midkey(CachingBlockReader cachingBlockReader) throws IOException {
      // Not needed here
      return null;
    }

    @Override
    protected void initialize(int numEntries) {
      blockKeys = new byte[numEntries][];
    }

    @Override
    protected void add(final byte[] key, final long offset, final int dataSize) {
      blockOffsets[rootCount] = offset;
      blockKeys[rootCount] = key;
      blockDataSizes[rootCount] = dataSize;
      rootCount++;
    }

    @Override
    public int rootBlockContainingKey(byte[] key, int offset, int length, CellComparator comp) {
      int pos = Bytes.binarySearch(blockKeys, key, offset, length);
      // pos is between -(blockKeys.length + 1) to blockKeys.length - 1, see
      // binarySearch's javadoc.

      if (pos >= 0) {
        // This means this is an exact match with an element of blockKeys.
        assert pos < blockKeys.length;
        return pos;
      }

      // Otherwise, pos = -(i + 1), where blockKeys[i - 1] < key < blockKeys[i],
      // and i is in [0, blockKeys.length]. We are returning j = i - 1 such that
      // blockKeys[j] <= key < blockKeys[j + 1]. In particular, j = -1 if
      // key < blockKeys[0], meaning the file does not contain the given key.

      int i = -pos - 1;
      assert 0 <= i && i <= blockKeys.length;
      return i - 1;
    }
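
    /*
     * Worked example for rootBlockContainingKey above: with blockKeys = {B, D}, searching for C
     * yields pos = -2, so i = 1 and we return 0 (the block starting at B); searching for A yields
     * pos = -1, so i = 0 and we return -1 (the key is before the first block in the file).
     */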

    @Override
    public int rootBlockContainingKey(Cell key) {
      // Not supported: this reader deals only with plain byte[] keys.
      throw new UnsupportedOperationException(
        "Cannot search for a key that is of Cell type. Only plain byte array keys "
          + "can be searched for");
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("size=" + rootCount).append("\n");
      for (int i = 0; i < rootCount; i++) {
        sb.append("key=").append(KeyValue.keyToString(blockKeys[i])).append("\n  offset=")
          .append(blockOffsets[i]).append(", dataSize=" + blockDataSizes[i]).append("\n");
      }
      return sb.toString();
    }
  }

  /**
   * An implementation of the BlockIndexReader that deals with block keys which are the key part of
   * a cell, like the data block index or the ROW_COL Bloom blocks. This needs a comparator to work
   * with the Cells.
   */
  static class CellBasedKeyBlockIndexReader extends BlockIndexReader {

    private Cell[] blockKeys;
    /** Pre-computed mid-key */
    private AtomicReference<Cell> midKey = new AtomicReference<>();
    /** Needed when doing lookup on blocks. */
    protected CellComparator comparator;

    public CellBasedKeyBlockIndexReader(final CellComparator c, final int treeLevel) {
      // Can be null for METAINDEX block
      comparator = c;
      searchTreeLevel = treeLevel;
    }

    @Override
    protected long calculateHeapSizeForBlockKeys(long heapSize) {
      if (blockKeys != null) {
        heapSize += ClassSize.REFERENCE;
        // Adding array + references overhead
        heapSize += ClassSize.align(ClassSize.ARRAY + blockKeys.length * ClassSize.REFERENCE);

        // Adding blockKeys
        for (Cell key : blockKeys) {
          heapSize += ClassSize.align(key.heapSize());
        }
      }
      // Add comparator and the midkey atomicreference
      heapSize += 2 * ClassSize.REFERENCE;
      return heapSize;
    }

    @Override
    public boolean isEmpty() {
      return blockKeys.length == 0;
    }

    /**
     * @param i from 0 to {@link #getRootBlockCount()} - 1
     */
    public Cell getRootBlockKey(int i) {
      return blockKeys[i];
    }

    @Override
    public BlockWithScanInfo loadDataBlockWithScanInfo(Cell key, HFileBlock currentBlock,
      boolean cacheBlocks, boolean pread, boolean isCompaction,
      DataBlockEncoding expectedDataBlockEncoding, CachingBlockReader cachingBlockReader)
      throws IOException {
      int rootLevelIndex = rootBlockContainingKey(key);
      if (rootLevelIndex < 0 || rootLevelIndex >= blockOffsets.length) {
        return null;
      }

      // the next indexed key
      Cell nextIndexedKey = null;

      // Read the next-level (intermediate or leaf) index block.
      long currentOffset = blockOffsets[rootLevelIndex];
      int currentOnDiskSize = blockDataSizes[rootLevelIndex];

      if (rootLevelIndex < blockKeys.length - 1) {
        nextIndexedKey = blockKeys[rootLevelIndex + 1];
      } else {
        nextIndexedKey = KeyValueScanner.NO_NEXT_INDEXED_KEY;
      }

      int lookupLevel = 1; // How many levels deep we are in our lookup.
      int index = -1;

      HFileBlock block = null;
      KeyOnlyKeyValue tmpNextIndexKV = new KeyValue.KeyOnlyKeyValue();
      while (true) {
        try {
          // Must initialize it to null here, because if we don't and an exception happens in
          // readBlock, we would release the previously assigned block twice in the finally
          // block (see HBASE-22422).
          block = null;
          if (currentBlock != null && currentBlock.getOffset() == currentOffset) {
            // Avoid reading the same block again, even with caching turned off.
            // This is crucial for compaction-type workload which might have
            // caching turned off. This is like a one-block cache inside the
            // scanner.
            block = currentBlock;
          } else {
            // Call HFile's caching block reader API. We always cache index
            // blocks, otherwise we might get terrible performance.
            boolean shouldCache = cacheBlocks || (lookupLevel < searchTreeLevel);
            BlockType expectedBlockType;
            if (lookupLevel < searchTreeLevel - 1) {
              expectedBlockType = BlockType.INTERMEDIATE_INDEX;
            } else if (lookupLevel == searchTreeLevel - 1) {
              expectedBlockType = BlockType.LEAF_INDEX;
            } else {
              // this also accounts for ENCODED_DATA
              expectedBlockType = BlockType.DATA;
            }
            block = cachingBlockReader.readBlock(currentOffset, currentOnDiskSize, shouldCache,
              pread, isCompaction, true, expectedBlockType, expectedDataBlockEncoding);
          }

          if (block == null) {
            throw new IOException("Failed to read block at offset " + currentOffset
              + ", onDiskSize=" + currentOnDiskSize);
          }

          // Found a data block, break the loop and check our level in the tree.
          if (block.getBlockType().isData()) {
            break;
          }

          // Not a data block. This must be a leaf-level or intermediate-level
          // index block. We don't allow going deeper than searchTreeLevel.
          if (++lookupLevel > searchTreeLevel) {
            throw new IOException("Search Tree Level overflow: lookupLevel=" + lookupLevel
              + ", searchTreeLevel=" + searchTreeLevel);
          }

          // Locate the entry corresponding to the given key in the non-root
          // (leaf or intermediate-level) index block.
          ByteBuff buffer = block.getBufferWithoutHeader();
          index = locateNonRootIndexEntry(buffer, key, comparator);
          if (index == -1) {
            // The key is before the first key of this non-root index block, which should not
            // happen if the root-level lookup was correct.
            throw new IOException("The key " + CellUtil.getCellKeyAsString(key) + " is before the"
              + " first key of the non-root index block " + block);
          }

          currentOffset = buffer.getLong();
          currentOnDiskSize = buffer.getInt();

          // Only update next indexed key if there is a next indexed key in the current level
          byte[] nonRootIndexedKey = getNonRootIndexedKey(buffer, index + 1);
          if (nonRootIndexedKey != null) {
            tmpNextIndexKV.setKey(nonRootIndexedKey, 0, nonRootIndexedKey.length);
            nextIndexedKey = tmpNextIndexKV;
          }
        } finally {
          if (block != null && !block.getBlockType().isData()) {
            // Release the block immediately if it is not the data block
            block.release();
          }
        }
      }

      if (lookupLevel != searchTreeLevel) {
        assert block.getBlockType().isData();
        // We reached a data block before descending to the expected level; release the block so
        // its reference count is decremented before we throw.
        if (block != null) {
          block.release();
        }
        throw new IOException("Reached a data block at level " + lookupLevel
          + " but the number of levels is " + searchTreeLevel);
      }

      // set the next indexed key for the current block.
      return new BlockWithScanInfo(block, nextIndexedKey);
    }

    @Override
    public Cell midkey(CachingBlockReader cachingBlockReader) throws IOException {
      if (rootCount == 0) throw new IOException("HFile empty");

      Cell targetMidKey = this.midKey.get();
      if (targetMidKey != null) {
        return targetMidKey;
      }

      if (midLeafBlockOffset >= 0) {
        if (cachingBlockReader == null) {
          throw new IOException(
            "Have to read the middle leaf block but no block reader available");
        }

        // Caching, using pread, assuming this is not a compaction.
        HFileBlock midLeafBlock = cachingBlockReader.readBlock(midLeafBlockOffset,
          midLeafBlockOnDiskSize, true, true, false, true, BlockType.LEAF_INDEX, null);
        try {
          byte[] bytes = getNonRootIndexedKey(midLeafBlock.getBufferWithoutHeader(), midKeyEntry);
          assert bytes != null;
          targetMidKey = new KeyValue.KeyOnlyKeyValue(bytes, 0, bytes.length);
        } finally {
          midLeafBlock.release();
        }
      } else {
        // The middle of the root-level index.
        targetMidKey = blockKeys[rootCount / 2];
      }

      this.midKey.set(targetMidKey);
      return targetMidKey;
    }

    @Override
    protected void initialize(int numEntries) {
      blockKeys = new Cell[numEntries];
    }

    /**
     * Adds a new entry in the root block index. Only used when reading.
     * @param key      Last key in the block
     * @param offset   file offset where the block is stored
     * @param dataSize the on-disk data size (version 2) or the uncompressed data size (version 1)
     */
    @Override
    protected void add(final byte[] key, final long offset, final int dataSize) {
      blockOffsets[rootCount] = offset;
      // Create the blockKeys as Cells once when the reader is opened
      blockKeys[rootCount] = new KeyValue.KeyOnlyKeyValue(key, 0, key.length);
      blockDataSizes[rootCount] = dataSize;
      rootCount++;
    }

    @Override
    public int rootBlockContainingKey(final byte[] key, int offset, int length,
      CellComparator comp) {
      // This should always be called with a Cell, not with a byte[] key.
      throw new UnsupportedOperationException("Cannot search for a plain byte array key. "
        + "Only Cell-based keys can be searched for");
    }

    @Override
    public int rootBlockContainingKey(Cell key) {
      // The comparator must not be null here, since this search happens at the root level.
      int pos = Bytes.binarySearch(blockKeys, key, comparator);
      // pos is between -(blockKeys.length + 1) to blockKeys.length - 1, see
      // binarySearch's javadoc.

      if (pos >= 0) {
        // This means this is an exact match with an element of blockKeys.
        assert pos < blockKeys.length;
        return pos;
      }

      // Otherwise, pos = -(i + 1), where blockKeys[i - 1] < key < blockKeys[i],
      // and i is in [0, blockKeys.length]. We are returning j = i - 1 such that
      // blockKeys[j] <= key < blockKeys[j + 1]. In particular, j = -1 if
      // key < blockKeys[0], meaning the file does not contain the given key.

      int i = -pos - 1;
      assert 0 <= i && i <= blockKeys.length;
      return i - 1;
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("size=" + rootCount).append("\n");
      for (int i = 0; i < rootCount; i++) {
        sb.append("key=").append((blockKeys[i])).append("\n  offset=").append(blockOffsets[i])
          .append(", dataSize=" + blockDataSizes[i]).append("\n");
      }
      return sb.toString();
    }
  }

  static class CellBasedKeyBlockIndexReaderV2 extends CellBasedKeyBlockIndexReader {

    private HFileIndexBlockEncoder indexBlockEncoder;

    private HFileIndexBlockEncoder.EncodedSeeker seeker;

    public CellBasedKeyBlockIndexReaderV2(final CellComparator c, final int treeLevel) {
      this(c, treeLevel, null);
    }

    public CellBasedKeyBlockIndexReaderV2(final CellComparator c, final int treeLevel,
      HFileIndexBlockEncoder indexBlockEncoder) {
      super(c, treeLevel);
      // Can be null, in which case we fall back to the no-op encoder.
      this.indexBlockEncoder =
        indexBlockEncoder != null ? indexBlockEncoder : NoOpIndexBlockEncoder.INSTANCE;
    }

    @Override
    public boolean isEmpty() {
      return seeker.isEmpty();
    }

    @Override
    public BlockWithScanInfo loadDataBlockWithScanInfo(Cell key, HFileBlock currentBlock,
      boolean cacheBlocks, boolean pread, boolean isCompaction,
      DataBlockEncoding expectedDataBlockEncoding, CachingBlockReader cachingBlockReader)
      throws IOException {
      return seeker.loadDataBlockWithScanInfo(key, currentBlock, cacheBlocks, pread, isCompaction,
        expectedDataBlockEncoding, cachingBlockReader);
    }

    @Override
    public Cell midkey(CachingBlockReader cachingBlockReader) throws IOException {
      return seeker.midkey(cachingBlockReader);
    }

    /**
     * @param i from 0 to {@link #getRootBlockCount()} - 1
     */
    public Cell getRootBlockKey(int i) {
      return seeker.getRootBlockKey(i);
    }

    @Override
    public int getRootBlockCount() {
      return seeker.getRootBlockCount();
    }

    @Override
    public int rootBlockContainingKey(Cell key) {
      return seeker.rootBlockContainingKey(key);
    }

    @Override
    protected long calculateHeapSizeForBlockKeys(long heapSize) {
      heapSize = super.calculateHeapSizeForBlockKeys(heapSize);
      if (seeker != null) {
        heapSize += ClassSize.REFERENCE;
        heapSize += ClassSize.align(seeker.heapSize());
      }
      return heapSize;
    }

    @Override
    public void readMultiLevelIndexRoot(HFileBlock blk, final int numEntries) throws IOException {
      seeker = indexBlockEncoder.createSeeker();
      seeker.initRootIndex(blk, numEntries, comparator, searchTreeLevel);
    }

    @Override
    public String toString() {
      return seeker.toString();
    }
  }

  /**
   * The reader will always hold the root level index in memory. Index blocks at all other levels
   * will be cached in the LRU cache in practice, although this API does not enforce that.
   * <p>
   * All non-root (leaf and intermediate) index blocks contain what we call a "secondary index": an
   * array of offsets to the entries within the block. This allows us to do binary search for the
   * entry corresponding to the given key without having to deserialize the block.
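   * <p>
   * For reference, the non-root block layout assumed by the secondary-index arithmetic in this
   * class is (a sketch derived from {@link #getNonRootIndexedKey(ByteBuff, int)}):
   *
   * <pre>
   * int   numEntries
   * int[] secondaryIndex           // numEntries + 1 offsets, relative to the end of this array
   * numEntries x entry, each being:
   *   long   blockOffset           // file offset of the referenced deeper-level block
   *   int    onDiskSize            // on-disk size of that block, excluding header
   *   byte[] key                   // the entry's first key
   * </pre>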
   */
  static abstract class BlockIndexReader implements HeapSize {

    protected long[] blockOffsets;
    protected int[] blockDataSizes;
    protected int rootCount = 0;

    // Mid-key metadata.
    protected long midLeafBlockOffset = -1;
    protected int midLeafBlockOnDiskSize = -1;
    protected int midKeyEntry = -1;

    /**
     * The number of levels in the block index tree. One if there is only the root level, two for
     * root and leaf levels, etc.
     */
    protected int searchTreeLevel;

    /** Returns true if the block index is empty. */
    public abstract boolean isEmpty();

    /**
     * Verifies that the block index is non-empty and throws an {@link IllegalStateException}
     * otherwise.
     */
    public void ensureNonEmpty() {
      if (isEmpty()) {
        throw new IllegalStateException("Block index is empty or not loaded");
      }
    }

    /**
     * Return the data block which contains this key. This function will only be called when the
     * HFile version is larger than 1.
     * @param key                       the key we are looking for
     * @param currentBlock              the current block, to avoid re-reading the same block
     * @param expectedDataBlockEncoding the data block encoding the caller is expecting the data
     *                                  block to be in, or null to not perform this check and return
     *                                  the block irrespective of the encoding
     * @return the data block containing the given key, or null if it cannot be found
     */
    public HFileBlock seekToDataBlock(final Cell key, HFileBlock currentBlock, boolean cacheBlocks,
      boolean pread, boolean isCompaction, DataBlockEncoding expectedDataBlockEncoding,
      CachingBlockReader cachingBlockReader) throws IOException {
      BlockWithScanInfo blockWithScanInfo = loadDataBlockWithScanInfo(key, currentBlock,
        cacheBlocks, pread, isCompaction, expectedDataBlockEncoding, cachingBlockReader);
      if (blockWithScanInfo == null) {
        return null;
      } else {
        return blockWithScanInfo.getHFileBlock();
      }
    }

    /**
     * Return the BlockWithScanInfo, a data structure which contains the Data HFileBlock with other
     * scan info such as the key that starts the next HFileBlock. This function will only be called
     * when the HFile version is larger than 1.
     * @param key                       the key we are looking for
     * @param currentBlock              the current block, to avoid re-reading the same block
     * @param expectedDataBlockEncoding the data block encoding the caller is expecting the data
     *                                  block to be in, or null to not perform this check and return
     *                                  the block irrespective of the encoding.
     * @return the BlockWithScanInfo which contains the DataBlock with other scan info such as
     *         nextIndexedKey.
     */
    public abstract BlockWithScanInfo loadDataBlockWithScanInfo(Cell key, HFileBlock currentBlock,
      boolean cacheBlocks, boolean pread, boolean isCompaction,
      DataBlockEncoding expectedDataBlockEncoding, CachingBlockReader cachingBlockReader)
      throws IOException;

    /**
     * An approximation to the {@link HFile}'s mid-key. Operates on block boundaries, and does not
     * go inside blocks. In other words, returns the first key of the middle block of the file.
     * @return the first key of the middle block
     */
    public abstract Cell midkey(CachingBlockReader cachingBlockReader) throws IOException;

    /**
     * @param i from 0 to {@link #getRootBlockCount()} - 1
     */
    public long getRootBlockOffset(int i) {
      return blockOffsets[i];
    }

    /**
     * @param i zero-based index of a root-level block
     * @return the on-disk size of the root-level block for version 2, or the uncompressed size for
     *         version 1
     */
    public int getRootBlockDataSize(int i) {
      return blockDataSizes[i];
    }

    /** Returns the number of root-level blocks in this block index */
    public int getRootBlockCount() {
      return rootCount;
    }

    /**
     * Finds the root-level index block containing the given key.
     * @param key    the key to find
     * @param offset the offset of the key in its byte array
     * @param length the length of the key
     * @param comp   the comparator to be used
     * @return Index of the root-level block containing <code>key</code> (between 0 and the number
     *         of blocks - 1) or -1 if this file does not contain the request.
     */
    // When we want to find the meta index block or bloom block for ROW bloom
    // type Bytes.BYTES_RAWCOMPARATOR would be enough. For the ROW_COL bloom case we need the
    // CellComparator.
    public abstract int rootBlockContainingKey(final byte[] key, int offset, int length,
      CellComparator comp);

    /**
     * Finds the root-level index block containing the given key.
     * @param key    the key to find
     * @param offset the offset of the key in its byte array
     * @param length the length of the key
     * @return Index of the root-level block containing <code>key</code> (between 0 and the number
     *         of blocks - 1) or -1 if this file does not contain the request.
     */
    // When we want to find the meta index block or bloom block for ROW bloom
    // type Bytes.BYTES_RAWCOMPARATOR would be enough. For the ROW_COL bloom case we
    // need the CellComparator.
    public int rootBlockContainingKey(final byte[] key, int offset, int length) {
      return rootBlockContainingKey(key, offset, length, null);
    }

    /**
     * Finds the root-level index block containing the given key.
     * @param key the key to find
     */
    public abstract int rootBlockContainingKey(final Cell key);

    /**
     * The indexed key at the ith position in the nonRootIndex. The position starts at 0.
     * @param nonRootIndex the non-root index block buffer
     * @param i            the ith position
     * @return The indexed key at the ith position in the nonRootIndex, or null if i is out of
     *         range.
     */
    static byte[] getNonRootIndexedKey(ByteBuff nonRootIndex, int i) {
      int numEntries = nonRootIndex.getInt(0);
      if (i < 0 || i >= numEntries) {
        return null;
      }

      // Entries start after the number of entries and the secondary index.
      // The secondary index takes numEntries + 1 ints.
      int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2);
      // Targetkey's offset relative to the end of secondary index
      int targetKeyRelOffset = nonRootIndex.getInt(Bytes.SIZEOF_INT * (i + 1));

      // The offset of the target key in the blockIndex buffer
      int targetKeyOffset = entriesOffset // Skip secondary index
        + targetKeyRelOffset // Skip all entries until the target
        + SECONDARY_INDEX_ENTRY_OVERHEAD; // Skip offset and on-disk-size

      // We subtract the two consecutive secondary index elements, which
      // gives us the size of the whole (offset, onDiskSize, key) tuple. We
      // then need to subtract the overhead of offset and onDiskSize.
      int targetKeyLength = nonRootIndex.getInt(Bytes.SIZEOF_INT * (i + 2)) - targetKeyRelOffset
        - SECONDARY_INDEX_ENTRY_OVERHEAD;

      // TODO check whether we can make BB backed Cell here? So can avoid bytes copy.
      return nonRootIndex.toBytes(targetKeyOffset, targetKeyLength);
    }
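
    /*
     * Worked example of the arithmetic above, for numEntries = 2: the int at byte 0 holds
     * numEntries; the secondary index occupies the 3 ints (numEntries + 1) at bytes 4-15; entries
     * begin at byte 16 (= Bytes.SIZEOF_INT * (numEntries + 2)); and the key of entry i starts at
     * 16 + secondaryIndex[i] + SECONDARY_INDEX_ENTRY_OVERHEAD (12 bytes of offset + onDiskSize).
     */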

    /**
     * Performs a binary search over a non-root level index block. Utilizes the secondary index,
     * which records the offsets of (offset, onDiskSize, firstKey) tuples of all entries.
     * @param key          the key we are searching for
     * @param nonRootIndex the non-root index block buffer, starting with the secondary index. The
     *                     position is ignored.
     * @param comparator   the comparator used to compare the keys
     * @return the index i in [0, numEntries - 1] such that keys[i] <= key < keys[i + 1], if keys is
     *         the array of all keys being searched, or -1 otherwise
     */
    static int binarySearchNonRootIndex(Cell key, ByteBuff nonRootIndex,
      CellComparator comparator) {

      int numEntries = nonRootIndex.getIntAfterPosition(0);
      int low = 0;
      int high = numEntries - 1;
      int mid = 0;

      // Entries start after the number of entries and the secondary index.
      // The secondary index takes numEntries + 1 ints.
      int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2);

      // If we imagine that keys[-1] = -Infinity and
      // keys[numEntries] = Infinity, then we are maintaining an invariant that
      // keys[low - 1] < key < keys[high + 1] while narrowing down the range.
      ByteBufferKeyOnlyKeyValue nonRootIndexkeyOnlyKV = new ByteBufferKeyOnlyKeyValue();
      ObjectIntPair<ByteBuffer> pair = new ObjectIntPair<>();
      while (low <= high) {
        mid = low + ((high - low) >> 1);

        // Midkey's offset relative to the end of secondary index
        int midKeyRelOffset = nonRootIndex.getIntAfterPosition(Bytes.SIZEOF_INT * (mid + 1));

        // The offset of the middle key in the blockIndex buffer
        int midKeyOffset = entriesOffset // Skip secondary index
          + midKeyRelOffset // Skip all entries until mid
          + SECONDARY_INDEX_ENTRY_OVERHEAD; // Skip offset and on-disk-size

        // We subtract the two consecutive secondary index elements, which
        // gives us the size of the whole (offset, onDiskSize, key) tuple. We
        // then need to subtract the overhead of offset and onDiskSize.
        int midLength = nonRootIndex.getIntAfterPosition(Bytes.SIZEOF_INT * (mid + 2))
          - midKeyRelOffset - SECONDARY_INDEX_ENTRY_OVERHEAD;

        // we have to compare in this order, because the comparator order
        // has special logic when the 'left side' is a special key.
        // TODO make KeyOnlyKeyValue to be Buffer backed and avoid array() call. This has to be
        // done after HBASE-12224 & HBASE-12282
        // TODO avoid array call.
        nonRootIndex.asSubByteBuffer(midKeyOffset, midLength, pair);
        nonRootIndexkeyOnlyKV.setKey(pair.getFirst(), pair.getSecond(), midLength);
        int cmp = PrivateCellUtil.compareKeyIgnoresMvcc(comparator, key, nonRootIndexkeyOnlyKV);

        // key lives above the midpoint
        if (cmp > 0) low = mid + 1; // Maintain the invariant that keys[low - 1] < key
        // key lives below the midpoint
        else if (cmp < 0) high = mid - 1; // Maintain the invariant that key < keys[high + 1]
        else return mid; // exact match
      }

      // As per our invariant, keys[low - 1] < key < keys[high + 1], meaning
      // that low - 1 < high + 1 and (low - high) <= 1. As per the loop break
      // condition, low >= high + 1. Therefore, low = high + 1.

      if (low != high + 1) {
        throw new IllegalStateException(
          "Binary search broken: low=" + low + " instead of " + (high + 1));
      }

      // OK, our invariant says that keys[low - 1] < key < keys[low]. We need to
      // return i such that keys[i] <= key < keys[i + 1]. Therefore i = low - 1.
      int i = low - 1;

      // Some extra validation on the result.
      if (i < -1 || i >= numEntries) {
        throw new IllegalStateException("Binary search broken: result is " + i
          + " but expected to be between -1 and (numEntries - 1) = " + (numEntries - 1));
      }

      return i;
    }

    /**
     * Search for one key using the secondary index in a non-root block. In case of success,
     * positions the provided buffer at the entry of interest, where the file offset and the
     * on-disk-size can be read.
     * @param nonRootBlock a non-root block without header. Initial position does not matter.
     * @param key          the key to search for
     * @return the index position where the given key was found, or -1 if the given key is before
     *         the first key.
     */
    static int locateNonRootIndexEntry(ByteBuff nonRootBlock, Cell key, CellComparator comparator) {
      int entryIndex = binarySearchNonRootIndex(key, nonRootBlock, comparator);

      if (entryIndex != -1) {
        int numEntries = nonRootBlock.getIntAfterPosition(0);

        // The end of secondary index and the beginning of entries themselves.
        int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2);

        // The offset of the entry we are interested in relative to the end of
        // the secondary index.
        int entryRelOffset = nonRootBlock.getIntAfterPosition(Bytes.SIZEOF_INT * (1 + entryIndex));

        nonRootBlock.position(entriesOffset + entryRelOffset);
      }

      return entryIndex;
    }

    /**
     * Read in the root-level index from the given input stream. Must match what was written into
     * the root level by {@link BlockIndexWriter#writeIndexBlocks(FSDataOutputStream)} at the offset
     * that function returned.
     * @param in         the buffered input stream or wrapped byte input stream
     * @param numEntries the number of root-level index entries
     */
    public void readRootIndex(DataInput in, final int numEntries) throws IOException {
      blockOffsets = new long[numEntries];
      initialize(numEntries);
      blockDataSizes = new int[numEntries];

      // If index size is zero, no index was written.
      if (numEntries > 0) {
        for (int i = 0; i < numEntries; ++i) {
          long offset = in.readLong();
          int dataSize = in.readInt();
          byte[] key = Bytes.readByteArray(in);
          add(key, offset, dataSize);
        }
      }
    }
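
    /*
     * For reference, each root-level entry as consumed above is expected to be serialized as (a
     * sketch of the matching writer side; `out` is a hypothetical DataOutput):
     *
     *   out.writeLong(offset);            // file offset of the block
     *   out.writeInt(dataSize);           // on-disk (v2) or uncompressed (v1) size
     *   Bytes.writeByteArray(out, key);   // vint key length followed by the key bytes
     */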
852
853    protected abstract void initialize(int numEntries);
854
855    protected abstract void add(final byte[] key, final long offset, final int dataSize);
856
857    /**
858     * Read in the root-level index from the given input stream. Must match what was written into
859     * the root level by {@link BlockIndexWriter#writeIndexBlocks(FSDataOutputStream)} at the offset
860     * that function returned.
861     * @param blk        the HFile block
862     * @param numEntries the number of root-level index entries
863     * @return the buffered input stream or wrapped byte input stream
864     */
865    public DataInputStream readRootIndex(HFileBlock blk, final int numEntries) throws IOException {
866      DataInputStream in = blk.getByteStream();
867      readRootIndex(in, numEntries);
868      return in;
869    }
870
871    /**
872     * Read the root-level metadata of a multi-level block index. Based on
873     * {@link #readRootIndex(DataInput, int)}, but also reads metadata necessary to compute the
874     * mid-key in a multi-level index.
875     * @param blk        the HFile block
876     * @param numEntries the number of root-level index entries
877     */
878    public void readMultiLevelIndexRoot(HFileBlock blk, final int numEntries) throws IOException {
879      DataInputStream in = readRootIndex(blk, numEntries);
880      // HFileBlock.getByteStream() returns a byte stream for reading the data(excluding checksum)
881      // of root index block, so after reading the root index there is no need to subtract the
882      // checksum bytes.
883      if (in.available() < MID_KEY_METADATA_SIZE) {
884        // No mid-key metadata available.
885        return;
886      }
887      midLeafBlockOffset = in.readLong();
888      midLeafBlockOnDiskSize = in.readInt();
889      midKeyEntry = in.readInt();
890    }
891
892    @Override
893    public long heapSize() {
894      // The BlockIndexReader does not have the blockKey, comparator and the midkey atomic reference
895      long heapSize =
896        ClassSize.align(3 * ClassSize.REFERENCE + 2 * Bytes.SIZEOF_INT + ClassSize.OBJECT);
897
898      // Mid-key metadata.
899      heapSize += MID_KEY_METADATA_SIZE;
900
901      heapSize = calculateHeapSizeForBlockKeys(heapSize);
902
903      if (blockOffsets != null) {
904        heapSize += ClassSize.align(ClassSize.ARRAY + blockOffsets.length * Bytes.SIZEOF_LONG);
905      }
906
907      if (blockDataSizes != null) {
908        heapSize += ClassSize.align(ClassSize.ARRAY + blockDataSizes.length * Bytes.SIZEOF_INT);
909      }
910
911      return ClassSize.align(heapSize);
912    }
913
914    protected abstract long calculateHeapSizeForBlockKeys(long heapSize);
915  }
916
917  /**
918   * Writes the block index into the output stream. Generate the tree from bottom up. The leaf level
919   * is written to disk as a sequence of inline blocks, if it is larger than a certain number of
920   * bytes. If the leaf level is not large enough, we write all entries to the root level instead.
921   * After all leaf blocks have been written, we end up with an index referencing the resulting leaf
922   * index blocks. If that index is larger than the allowed root index size, the writer will break
923   * it up into reasonable-size intermediate-level index block chunks write those chunks out, and
924   * create another index referencing those chunks. This will be repeated until the remaining index
925   * is small enough to become the root index. However, in most practical cases we will only have
926   * leaf-level blocks and the root index, or just the root index.
927   */
928  public static class BlockIndexWriter implements InlineBlockWriter {
929    /**
930     * While the index is being written, this represents the current block index referencing all
931     * leaf blocks, with one exception. If the file is being closed and there are not enough blocks
932     * to complete even a single leaf block, no leaf blocks get written and this contains the entire
933     * block index. After all levels of the index were written by
934     * {@link #writeIndexBlocks(FSDataOutputStream)}, this contains the final root-level index.
935     */
936    private BlockIndexChunk rootChunk = new BlockIndexChunkImpl();
937
938    /**
939     * Current leaf-level chunk. New entries referencing data blocks get added to this chunk until
940     * it grows large enough to be written to disk.
941     */
942    private BlockIndexChunk curInlineChunk = new BlockIndexChunkImpl();
943
944    /**
945     * The number of block index levels. This is one if there is only root level (even empty), two
946     * if there a leaf level and root level, and is higher if there are intermediate levels. This is
947     * only final after {@link #writeIndexBlocks(FSDataOutputStream)} has been called. The initial
948     * value accounts for the root level, and will be increased to two as soon as we find out there
949     * is a leaf-level in {@link #blockWritten(long, int, int)}.
950     */
951    private int numLevels = 1;
952
953    private HFileBlock.Writer blockWriter;
954    private byte[] firstKey = null;
955
956    /**
957     * The total number of leaf-level entries, i.e. entries referenced by leaf-level blocks. For the
958     * data block index this is equal to the number of data blocks.
959     */
960    private long totalNumEntries;
961
962    /** Total compressed size of all index blocks. */
963    private long totalBlockOnDiskSize;
964
965    /** Total uncompressed size of all index blocks. */
966    private long totalBlockUncompressedSize;
967
968    /** The maximum size guideline of all multi-level index blocks. */
969    private int maxChunkSize;
970
971    /** The maximum level of multi-level index blocks */
972    private int minIndexNumEntries;
973
974    /** Whether we require this block index to always be single-level. */
975    private boolean singleLevelOnly;
976
977    /** CacheConfig, or null if cache-on-write is disabled */
978    private CacheConfig cacheConf;
979
980    /** Name to use for computing cache keys */
981    private String nameForCaching;
982
983    /** Type of encoding used for index blocks in HFile */
984    private HFileIndexBlockEncoder indexBlockEncoder;
985
986    /** Creates a single-level block index writer */
987    public BlockIndexWriter() {
988      this(null, null, null, null);
989      singleLevelOnly = true;
990    }
991
992    /**
993     * Creates a multi-level block index writer.
994     * @param blockWriter the block writer to use to write index blocks
995     * @param cacheConf   used to determine when and how a block should be cached-on-write.
996     */
997    public BlockIndexWriter(HFileBlock.Writer blockWriter, CacheConfig cacheConf,
998      String nameForCaching, HFileIndexBlockEncoder indexBlockEncoder) {
999      if ((cacheConf == null) != (nameForCaching == null)) {
1000        throw new IllegalArgumentException(
1001          "Block cache and file name for " + "caching must be both specified or both null");
1002      }
1003
1004      this.blockWriter = blockWriter;
1005      this.cacheConf = cacheConf;
1006      this.nameForCaching = nameForCaching;
1007      this.maxChunkSize = HFileBlockIndex.DEFAULT_MAX_CHUNK_SIZE;
1008      this.minIndexNumEntries = HFileBlockIndex.DEFAULT_MIN_INDEX_NUM_ENTRIES;
1009      this.indexBlockEncoder =
1010        indexBlockEncoder != null ? indexBlockEncoder : NoOpIndexBlockEncoder.INSTANCE;
1011    }
1012
1013    public void setMaxChunkSize(int maxChunkSize) {
1014      if (maxChunkSize <= 0) {
1015        throw new IllegalArgumentException("Invalid maximum index block size");
1016      }
1017      this.maxChunkSize = maxChunkSize;
1018    }
1019
1020    public void setMinIndexNumEntries(int minIndexNumEntries) {
1021      if (minIndexNumEntries <= 1) {
1022        throw new IllegalArgumentException("Invalid maximum index level, should be >= 2");
1023      }
1024      this.minIndexNumEntries = minIndexNumEntries;
1025    }
1026
1027    /**
1028     * Writes the root level and intermediate levels of the block index into the output stream,
1029     * generating the tree from bottom up. Assumes that the leaf level has been inline-written to
1030     * the disk if there is enough data for more than one leaf block. We iterate by breaking the
1031     * current level of the block index, starting with the index of all leaf-level blocks, into
1032     * chunks small enough to be written to disk, and generate its parent level, until we end up
1033     * with a level small enough to become the root level. If the leaf level is not large enough,
1034     * there is no inline block index anymore, so we only write that level of block index to disk as
1035     * the root level.
1036     * @param out FSDataOutputStream
1037     * @return position at which we entered the root-level index.
1038     */
1039    public long writeIndexBlocks(FSDataOutputStream out) throws IOException {
1040      if (curInlineChunk != null && curInlineChunk.getNumEntries() != 0) {
1041        throw new IOException("Trying to write a multi-level block index, " + "but are "
1042          + curInlineChunk.getNumEntries() + " entries in the " + "last inline chunk.");
1043      }
1044
1045      // We need to get mid-key metadata before we create intermediate
1046      // indexes and overwrite the root chunk.
1047      byte[] midKeyMetadata = numLevels > 1 ? rootChunk.getMidKeyMetadata() : null;
1048
1049      if (curInlineChunk != null) {
1050        while (
1051          rootChunk.getRootSize() > maxChunkSize
1052            // HBASE-16288: if firstKey is larger than maxChunkSize we will loop indefinitely
1053            && rootChunk.getNumEntries() > minIndexNumEntries
1054            // Sanity check. We will not hit this (minIndexNumEntries ^ 16) blocks can be addressed
1055            && numLevels < 16
1056        ) {
1057          rootChunk = writeIntermediateLevel(out, rootChunk);
1058          numLevels += 1;
1059        }
1060      }
1061
1062      // write the root level
1063      long rootLevelIndexPos = out.getPos();
1064
1065      {
1066        DataOutput blockStream = blockWriter.startWriting(BlockType.ROOT_INDEX);
1067        indexBlockEncoder.encode(rootChunk, true, blockStream);
1068        if (midKeyMetadata != null) blockStream.write(midKeyMetadata);
1069        blockWriter.writeHeaderAndData(out);
1070        if (cacheConf != null) {
1071          cacheConf.getBlockCache().ifPresent(cache -> {
1072            HFileBlock blockForCaching = blockWriter.getBlockForCaching(cacheConf);
1073            cache.cacheBlock(new BlockCacheKey(nameForCaching, rootLevelIndexPos, true,
1074              blockForCaching.getBlockType()), blockForCaching);
1075          });
1076        }
1077      }
1078
1079      // Add root index block size
1080      totalBlockOnDiskSize += blockWriter.getOnDiskSizeWithoutHeader();
1081      totalBlockUncompressedSize += blockWriter.getUncompressedSizeWithoutHeader();
1082
1083      if (LOG.isTraceEnabled()) {
1084        LOG.trace("Wrote a " + numLevels + "-level index with root level at pos "
1085          + rootLevelIndexPos + ", " + rootChunk.getNumEntries() + " root-level entries, "
1086          + totalNumEntries + " total entries, "
1087          + StringUtils.humanReadableInt(this.totalBlockOnDiskSize) + " on-disk size, "
1088          + StringUtils.humanReadableInt(totalBlockUncompressedSize) + " total uncompressed size.");
1089      }
1090      return rootLevelIndexPos;
1091    }
1092
1093    /**
1094     * Writes the block index data as a single level only. Does not do any block framing.
1095     * @param out         the buffered output stream to write the index to. Typically a stream
1096     *                    writing into an {@link HFile} block.
1097     * @param description a short description of the index being written. Used in a log message.
1098     */
1099    public void writeSingleLevelIndex(DataOutput out, String description) throws IOException {
1100      expectNumLevels(1);
1101
1102      if (!singleLevelOnly) throw new IOException("Single-level mode is turned off");
1103
1104      if (rootChunk.getNumEntries() > 0)
1105        throw new IOException("Root-level entries already added in " + "single-level mode");
1106
1107      rootChunk = curInlineChunk;
1108      curInlineChunk = new BlockIndexChunkImpl();
1109
1110      if (LOG.isTraceEnabled()) {
1111        LOG.trace("Wrote a single-level " + description + " index with " + rootChunk.getNumEntries()
1112          + " entries, " + rootChunk.getRootSize() + " bytes");
1113      }
1114      indexBlockEncoder.encode(rootChunk, true, out);
1115    }
1116
1117    /**
1118     * Split the current level of the block index into intermediate index blocks of permitted size
1119     * and write those blocks to disk. Return the next level of the block index referencing those
1120     * intermediate-level blocks.
1121     * @param currentLevel the current level of the block index, such as the a chunk referencing all
1122     *                     leaf-level index blocks
1123     * @return the parent level block index, which becomes the root index after a few (usually zero)
1124     *         iterations
1125     */
1126    private BlockIndexChunk writeIntermediateLevel(FSDataOutputStream out,
1127      BlockIndexChunk currentLevel) throws IOException {
1128      // Entries referencing intermediate-level blocks we are about to create.
1129      BlockIndexChunk parent = new BlockIndexChunkImpl();
1130
1131      // The current intermediate-level block index chunk.
1132      BlockIndexChunk curChunk = new BlockIndexChunkImpl();
1133
1134      for (int i = 0; i < currentLevel.getNumEntries(); ++i) {
1135        curChunk.add(currentLevel.getBlockKey(i), currentLevel.getBlockOffset(i),
1136          currentLevel.getOnDiskDataSize(i));
1137
1138        // HBASE-16288: We have to have at least minIndexNumEntries(16) items in the index so that
1139        // we won't end up with too-many levels for a index with very large rowKeys. Also, if the
1140        // first key is larger than maxChunkSize this will cause infinite recursion.
1141        if (i >= minIndexNumEntries && curChunk.getRootSize() >= maxChunkSize) {
1142          writeIntermediateBlock(out, parent, curChunk);
1143        }
1144      }
1145
1146      if (curChunk.getNumEntries() > 0) {
1147        writeIntermediateBlock(out, parent, curChunk);
1148      }
1149
1150      return parent;
1151    }
1152
1153    private void writeIntermediateBlock(FSDataOutputStream out, BlockIndexChunk parent,
1154      BlockIndexChunk curChunk) throws IOException {
1155      long beginOffset = out.getPos();
1156      DataOutputStream dos = blockWriter.startWriting(BlockType.INTERMEDIATE_INDEX);
1157      indexBlockEncoder.encode(curChunk, false, dos);
1158      byte[] curFirstKey = curChunk.getBlockKey(0);
1159      blockWriter.writeHeaderAndData(out);
1160
1161      if (getCacheOnWrite()) {
1162        cacheConf.getBlockCache().ifPresent(cache -> {
1163          HFileBlock blockForCaching = blockWriter.getBlockForCaching(cacheConf);
1164          cache.cacheBlock(
1165            new BlockCacheKey(nameForCaching, beginOffset, true, blockForCaching.getBlockType()),
1166            blockForCaching);
1167        });
1168      }
1169
1170      // Add intermediate index block size
1171      totalBlockOnDiskSize += blockWriter.getOnDiskSizeWithoutHeader();
1172      totalBlockUncompressedSize += blockWriter.getUncompressedSizeWithoutHeader();
1173
1174      // OFFSET is the beginning offset the chunk of block index entries.
1175      // SIZE is the total byte size of the chunk of block index entries
1176      // + the secondary index size
1177      // FIRST_KEY is the first key in the chunk of block index
1178      // entries.
1179      parent.add(curFirstKey, beginOffset, blockWriter.getOnDiskSizeWithHeader());
1180
1181      // clear current block index chunk
1182      curChunk.clear();
1183      curFirstKey = null;
1184    }
1185
1186    /** Returns how many block index entries there are in the root level */
1187    public final int getNumRootEntries() {
1188      return rootChunk.getNumEntries();
1189    }
1190
1191    /** Returns the number of levels in this block index. */
1192    public int getNumLevels() {
1193      return numLevels;
1194    }
1195
1196    private void expectNumLevels(int expectedNumLevels) {
1197      if (numLevels != expectedNumLevels) {
1198        throw new IllegalStateException("Number of block index levels is " + numLevels
1199          + "but is expected to be " + expectedNumLevels);
1200      }
1201    }
1202
1203    /**
1204     * Whether there is an inline block ready to be written. In general, we write an leaf-level
1205     * index block as an inline block as soon as its size as serialized in the non-root format
1206     * reaches a certain threshold.
1207     */
1208    @Override
1209    public boolean shouldWriteBlock(boolean closing) {
1210      if (singleLevelOnly) {
1211        throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED);
1212      }
1213
1214      if (curInlineChunk == null) {
1215        throw new IllegalStateException("curInlineChunk is null; has shouldWriteBlock been "
1216          + "called with closing=true and then called again?");
1217      }
1218
1219      if (curInlineChunk.getNumEntries() == 0) {
1220        return false;
1221      }
1222
1223      // We do have some entries in the current inline chunk.
1224      if (closing) {
1225        if (rootChunk.getNumEntries() == 0) {
1226          // We did not add any leaf-level blocks yet. Instead of creating a
1227          // leaf level with one block, move these entries to the root level.
1228
1229          expectNumLevels(1);
1230          rootChunk = curInlineChunk;
1231          curInlineChunk = null; // Disallow adding any more index entries.
1232          return false;
1233        }
1234
1235        return true;
1236      } else {
1237        return curInlineChunk.getNonRootSize() >= maxChunkSize;
1238      }
1239    }
1240
1241    /**
1242     * Write out the current inline index block. Inline blocks are non-root blocks, so the non-root
1243     * index format is used.
1244     */
1245    @Override
1246    public void writeInlineBlock(DataOutput out) throws IOException {
1247      if (singleLevelOnly) throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED);
1248
1249      // Write the inline block index to the output stream in the non-root
1250      // index block format.
1251      indexBlockEncoder.encode(curInlineChunk, false, out);
1252
1253      // Save the first key of the inline block so that we can add it to the
1254      // parent-level index.
1255      firstKey = curInlineChunk.getBlockKey(0);
1256
1257      // Start a new inline index block
1258      curInlineChunk.clear();
1259    }
1260
1261    /**
1262     * Called after an inline block has been written so that we can add an entry referring to that
1263     * block to the parent-level index.
     */
    @Override
    public void blockWritten(long offset, int onDiskSize, int uncompressedSize) {
      // Add leaf index block size
      totalBlockOnDiskSize += onDiskSize;
      totalBlockUncompressedSize += uncompressedSize;

      if (singleLevelOnly) {
        throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED);
      }

      if (firstKey == null) {
        throw new IllegalStateException(
          "Trying to add second-level index entry with offset=" + offset + " and onDiskSize="
            + onDiskSize + " but the first key was not set in writeInlineBlock");
      }

      if (rootChunk.getNumEntries() == 0) {
        // We are writing the first leaf block, so increase index level.
        expectNumLevels(1);
        numLevels = 2;
      }

      // Add another entry to the second-level index. Include the number of
      // entries in all previous leaf-level chunks for mid-key calculation.
      rootChunk.add(firstKey, offset, onDiskSize, totalNumEntries);
      firstKey = null;
    }

    @Override
    public BlockType getInlineBlockType() {
      return BlockType.LEAF_INDEX;
    }

    /**
     * Add one index entry to the current leaf-level block. When the leaf-level block gets large
     * enough, it will be flushed to disk as an inline block.
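     * <p>
     * A brief usage sketch with illustrative names: for each finished data block, the enclosing
     * writer hands the block's serialized first key and location to this index writer.
     *
     * <pre>{@code
     * blockIndexWriter.addEntry(firstKeyBytes, blockBeginOffset, blockOnDiskSize);
     * }</pre>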
     * @param firstKey      the first key of the data block
     * @param blockOffset   the offset of the data block
     * @param blockDataSize the on-disk size of the data block ({@link HFile} format version 2), or
     *                      the uncompressed size of the data block ({@link HFile} format version
     *                      1).
     */
    public void addEntry(byte[] firstKey, long blockOffset, int blockDataSize) {
      curInlineChunk.add(firstKey, blockOffset, blockDataSize);
      ++totalNumEntries;
    }

    /**
     * @throws IOException if we happened to write a multi-level index.
     */
    public void ensureSingleLevel() throws IOException {
      if (numLevels > 1) {
        throw new IOException(
          "Wrote a " + numLevels + "-level index with " + rootChunk.getNumEntries()
            + " root-level entries, but this is expected to be a single-level block index.");
      }
    }

    /**
     * @return true if we are using cache-on-write. This is configured by the caller of the
     *         constructor by passing either a valid cache configuration or null.
     */
    @Override
    public boolean getCacheOnWrite() {
      return cacheConf != null && cacheConf.shouldCacheIndexesOnWrite();
    }

    /**
     * The total uncompressed size of the root index block, intermediate-level index blocks, and
     * leaf-level index blocks.
     * @return the total uncompressed size of all index blocks
     */
    public long getTotalUncompressedSize() {
      return totalBlockUncompressedSize;
    }

  }

  /**
   * A single chunk of the block index in the process of writing. The data in this chunk can become
   * a leaf-level, intermediate-level, or root index block.
   */
  static class BlockIndexChunkImpl implements BlockIndexChunk {

    /** First keys of the key range corresponding to each index entry. */
    private final List<byte[]> blockKeys = new ArrayList<>();

    /** Block offset in backing stream. */
    private final List<Long> blockOffsets = new ArrayList<>();

    /** On-disk data sizes of lower-level data or index blocks. */
    private final List<Integer> onDiskDataSizes = new ArrayList<>();

    /**
     * The cumulative number of sub-entries, i.e. the number of entries in the deeper-level blocks
     * referenced by this chunk's entries. numSubEntriesAt[i] is the total number of sub-entries in
     * the blocks corresponding to this chunk's entries #0 through #i inclusive.
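     * <p>
     * Worked example with illustrative numbers: if the root chunk references three leaf-level
     * chunks holding 3, 5, and 4 entries respectively, then numSubEntriesAt = [3, 8, 12].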
     */
    private final List<Long> numSubEntriesAt = new ArrayList<>();

    /**
     * The offset of the next entry to be added, relative to the end of the "secondary index" in the
     * "non-root" format representation of this index chunk. This is the next value to be added to
     * the secondary index.
     */
    private int curTotalNonRootEntrySize = 0;

    /**
     * The accumulated size of this chunk if stored in the root index format.
     */
    private int curTotalRootSize = 0;

    /**
     * The "secondary index" used for binary search over variable-length records in a "non-root"
     * format block. These offsets are relative to the end of this secondary index.
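     * <p>
     * Worked example with illustrative numbers: for two entries whose keys are 5 and 7 bytes long,
     * the offset marks are [0, SECONDARY_INDEX_ENTRY_OVERHEAD + 5]; each mark is the running total
     * of the serialized sizes of the entries preceding it.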
     */
    private final List<Integer> secondaryIndexOffsetMarks = new ArrayList<>();

    /**
     * Adds a new entry to this block index chunk.
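     * <p>
     * Each call also grows both size counters. A worked example with an illustrative 10-byte key:
     * the non-root size grows by SECONDARY_INDEX_ENTRY_OVERHEAD + 10 bytes, while the root size
     * grows by 8 (long offset) + 4 (int on-disk size) + 1 (vint-encoded key length) + 10 = 23
     * bytes.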
     * @param firstKey              the first key in the block pointed to by this entry
     * @param blockOffset           the offset of the next-level block pointed to by this entry
     * @param onDiskDataSize        the on-disk data size of the block pointed to by this entry,
     *                              including header size
     * @param curTotalNumSubEntries if this chunk is the root index chunk under construction, this
     *                              specifies the current total number of sub-entries in all
     *                              leaf-level chunks, including the one corresponding to the
     *                              second-level entry being added.
     */
    @Override
    public void add(byte[] firstKey, long blockOffset, int onDiskDataSize,
      long curTotalNumSubEntries) {
      // Record the offset for the secondary index
      secondaryIndexOffsetMarks.add(curTotalNonRootEntrySize);
      curTotalNonRootEntrySize += SECONDARY_INDEX_ENTRY_OVERHEAD + firstKey.length;

      curTotalRootSize += Bytes.SIZEOF_LONG + Bytes.SIZEOF_INT
        + WritableUtils.getVIntSize(firstKey.length) + firstKey.length;

      blockKeys.add(firstKey);
      blockOffsets.add(blockOffset);
      onDiskDataSizes.add(onDiskDataSize);

      if (curTotalNumSubEntries != -1) {
        numSubEntriesAt.add(curTotalNumSubEntries);

        // Make sure the parallel arrays are in sync.
        if (numSubEntriesAt.size() != blockKeys.size()) {
          throw new IllegalStateException("Only have key/value count stats for "
            + numSubEntriesAt.size() + " block index entries out of " + blockKeys.size());
        }
      }
    }

    /**
     * The same as {@link #add(byte[], long, int, long)} but does not maintain the cumulative
     * sub-entry count. Used for single-level indexes.
     * @see #add(byte[], long, int, long)
     */
    @Override
    public void add(byte[] firstKey, long blockOffset, int onDiskDataSize) {
      add(firstKey, blockOffset, onDiskDataSize, -1);
    }

    @Override
    public void clear() {
      blockKeys.clear();
      blockOffsets.clear();
      onDiskDataSizes.clear();
      secondaryIndexOffsetMarks.clear();
      numSubEntriesAt.clear();
      curTotalNonRootEntrySize = 0;
      curTotalRootSize = 0;
    }

    /**
     * Finds the entry corresponding to the deeper-level index block containing the given
     * deeper-level entry (a "sub-entry"), assuming a global 0-based ordering of sub-entries.
     * <p>
     * <i> Implementation note. </i> We are looking for i such that numSubEntriesAt[i - 1] <= k <
     * numSubEntriesAt[i], because the deeper-level block #i (0-based) contains sub-entries
     * numSubEntriesAt[i - 1] through numSubEntriesAt[i] - 1, assuming a global 0-based ordering of
     * sub-entries. i is by definition the insertion point of k in numSubEntriesAt.
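     * <p>
     * Worked example with illustrative numbers: for numSubEntriesAt = [3, 8, 12], sub-entry k = 5
     * yields insertion point 1, so entry #1 contains it; k = 8 matches numSubEntriesAt[1] exactly,
     * so sub-entry #8 is the first sub-entry of entry #2.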
     * @param k sub-entry index, from 0 to the total number of sub-entries - 1
     * @return the 0-based index of the entry corresponding to the given sub-entry
     */
    @Override
    public int getEntryBySubEntry(long k) {
      // We define mid-key as the key corresponding to k'th sub-entry
      // (0-based).

      int i = Collections.binarySearch(numSubEntriesAt, k);

      // Exact match: numSubEntriesAt[i] == k. This means chunks #0 through
      // #i contain exactly k sub-entries, and the sub-entry #k (0-based)
      // is in the (i + 1)'th chunk.
      if (i >= 0) {
        return i + 1;
      }

      // Inexact match. Return the insertion point.
      return -i - 1;
    }

    /**
     * Used when writing the root block index of a multi-level block index. Serializes additional
     * information that allows the mid-key to be identified efficiently.
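     * <p>
     * Concretely, the serialized metadata is a fixed-size record: the mid-key entry's block offset
     * (8 bytes), its on-disk size (4 bytes), and the 0-based index of the mid-key sub-entry within
     * that block (4 bytes), i.e. 16 bytes in total.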
     * @return a few serialized fields for finding the mid-key
     * @throws IOException if the mid-key metadata could not be created
     */
    @Override
    public byte[] getMidKeyMetadata() throws IOException {
      ByteArrayOutputStream baos = new ByteArrayOutputStream(MID_KEY_METADATA_SIZE);
      DataOutputStream baosDos = new DataOutputStream(baos);
      long totalNumSubEntries = numSubEntriesAt.get(blockKeys.size() - 1);
      if (totalNumSubEntries == 0) {
        throw new IOException("No leaf-level entries, mid-key unavailable");
      }
      long midKeySubEntry = (totalNumSubEntries - 1) / 2;
      int midKeyEntry = getEntryBySubEntry(midKeySubEntry);

      baosDos.writeLong(blockOffsets.get(midKeyEntry));
      baosDos.writeInt(onDiskDataSizes.get(midKeyEntry));

      long numSubEntriesBefore = midKeyEntry > 0 ? numSubEntriesAt.get(midKeyEntry - 1) : 0;
      long subEntryWithinEntry = midKeySubEntry - numSubEntriesBefore;
      if (subEntryWithinEntry < 0 || subEntryWithinEntry > Integer.MAX_VALUE) {
        throw new IOException("Could not identify mid-key index within the "
          + "leaf-level block containing mid-key: out of range (" + subEntryWithinEntry
          + ", numSubEntriesBefore=" + numSubEntriesBefore + ", midKeySubEntry=" + midKeySubEntry
          + ")");
      }

      baosDos.writeInt((int) subEntryWithinEntry);

      if (baosDos.size() != MID_KEY_METADATA_SIZE) {
        throw new IOException("Could not write mid-key metadata: size=" + baosDos.size()
          + ", correct size: " + MID_KEY_METADATA_SIZE);
      }

      // Close just to be good citizens, although this has no effect.
      baos.close();

      return baos.toByteArray();
    }

    /**
     * Returns the size of this chunk if stored in the non-root index block format.
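     * <p>
     * Worked example with illustrative numbers: a chunk holding two entries whose keys are 5 and 7
     * bytes long serializes to 4 (entry count) + 4 * 3 (secondary index, one slot per entry plus a
     * final total-size slot) + (2 * SECONDARY_INDEX_ENTRY_OVERHEAD + 5 + 7) bytes.
     */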
    @Override
    public int getNonRootSize() {
      return Bytes.SIZEOF_INT // Number of entries
        + Bytes.SIZEOF_INT * (blockKeys.size() + 1) // Secondary index
        + curTotalNonRootEntrySize; // All entries
    }

    @Override
    public int getCurTotalNonRootEntrySize() {
      return curTotalNonRootEntrySize;
    }

    @Override
    public List<byte[]> getBlockKeys() {
      return blockKeys;
    }

    @Override
    public List<Integer> getSecondaryIndexOffsetMarks() {
      return secondaryIndexOffsetMarks;
    }

    /** Returns the size of this chunk if stored in the root index block format */
    @Override
    public int getRootSize() {
      return curTotalRootSize;
    }

    /** Returns the number of entries in this block index chunk */
    public int getNumEntries() {
      return blockKeys.size();
    }

    public byte[] getBlockKey(int i) {
      return blockKeys.get(i);
    }

    public long getBlockOffset(int i) {
      return blockOffsets.get(i);
    }

    public int getOnDiskDataSize(int i) {
      return onDiskDataSizes.get(i);
    }

    public long getCumulativeNumKV(int i) {
      if (i < 0) {
        return 0;
      }
      return numSubEntriesAt.get(i);
    }

  }

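  /**
   * Reads the maximum index-block size guideline from the configuration: the value of
   * <code>MAX_CHUNK_SIZE_KEY</code>, defaulting to <code>DEFAULT_MAX_CHUNK_SIZE</code>. As an
   * illustrative usage, a test might lower it via {@code conf.setInt(MAX_CHUNK_SIZE_KEY, 4096)}.
   */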
  public static int getMaxChunkSize(Configuration conf) {
    return conf.getInt(MAX_CHUNK_SIZE_KEY, DEFAULT_MAX_CHUNK_SIZE);
  }

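  /**
   * Reads the minimum number of entries per index block from the configuration: the value of
   * <code>MIN_INDEX_NUM_ENTRIES_KEY</code>, defaulting to
   * <code>DEFAULT_MIN_INDEX_NUM_ENTRIES</code>.
   */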
  public static int getMinIndexNumEntries(Configuration conf) {
    return conf.getInt(MIN_INDEX_NUM_ENTRIES_KEY, DEFAULT_MIN_INDEX_NUM_ENTRIES);
  }
}