/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.io.hfile;

import static org.apache.hadoop.hbase.io.hfile.HFileBlockIndex.MID_KEY_METADATA_SIZE;
import static org.apache.hadoop.hbase.io.hfile.HFileBlockIndex.SECONDARY_INDEX_ENTRY_OVERHEAD;

import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.encoding.IndexBlockEncoding;
import org.apache.hadoop.hbase.nio.ByteBuff;
import org.apache.hadoop.hbase.regionserver.KeyValueScanner;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.ClassSize;
import org.apache.yetus.audience.InterfaceAudience;

/**
 * Does not perform any kind of encoding/decoding. Index blocks are written in the plain,
 * unencoded HFile format and read back the same way.
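 * <p>
 * A minimal usage sketch (using the names as they appear in this file):
 *
 * <pre>
 * HFileIndexBlockEncoder encoder = NoOpIndexBlockEncoder.INSTANCE;
 * EncodedSeeker seeker = encoder.createSeeker();
 * </pre>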
 */
@InterfaceAudience.Private
public class NoOpIndexBlockEncoder implements HFileIndexBlockEncoder {

  public static final NoOpIndexBlockEncoder INSTANCE = new NoOpIndexBlockEncoder();

  /** Cannot be instantiated. Use {@link #INSTANCE} instead. */
  private NoOpIndexBlockEncoder() {
  }

  @Override
  public void saveMetadata(HFile.Writer writer) {
  }

  @Override
  public void encode(BlockIndexChunk blockIndexChunk, boolean rootIndexBlock, DataOutput out)
    throws IOException {
    if (rootIndexBlock) {
      writeRoot(blockIndexChunk, out);
    } else {
      writeNonRoot(blockIndexChunk, out);
    }
  }

  /**
   * Writes the block index chunk in the non-root index block format. This format contains the
   * number of entries, an index of integer offsets for quick binary search on variable-length
   * records, and tuples of block offset, on-disk block size, and the first key for each entry.
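   * <p>
   * A sketch of the resulting on-disk layout, as produced by the writes below:
   *
   * <pre>
   * int                   numEntries
   * int[numEntries + 1]   secondary index: offset of each entry relative to the end of the
   *                       secondary index; the extra trailing element is the total entry size,
   *                       so entry i spans [index[i], index[i + 1])
   * numEntries times:
   *   long                block offset
   *   int                 on-disk block size
   *   byte[]              first key of the block (no length prefix)
   * </pre>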
   */
  private void writeNonRoot(BlockIndexChunk blockIndexChunk, DataOutput out) throws IOException {
    // The number of entries in the block.
    out.writeInt(blockIndexChunk.getNumEntries());

    if (
      blockIndexChunk.getSecondaryIndexOffsetMarks().size() != blockIndexChunk.getBlockKeys().size()
    ) {
      throw new IOException("Corrupted block index chunk writer: "
        + blockIndexChunk.getBlockKeys().size() + " entries but "
        + blockIndexChunk.getSecondaryIndexOffsetMarks().size() + " secondary index items");
    }

    // For each entry, write a "secondary index" of relative offsets to the
    // entries from the end of the secondary index. This works, because at
    // read time we read the number of entries and know where the secondary
    // index ends.
    for (int currentSecondaryIndex : blockIndexChunk.getSecondaryIndexOffsetMarks()) {
      out.writeInt(currentSecondaryIndex);
    }

    // We include one other element in the secondary index to calculate the
    // size of each entry more easily by subtracting secondary index elements.
    out.writeInt(blockIndexChunk.getCurTotalNonRootEntrySize());

    for (int i = 0; i < blockIndexChunk.getNumEntries(); ++i) {
      out.writeLong(blockIndexChunk.getBlockOffset(i));
      out.writeInt(blockIndexChunk.getOnDiskDataSize(i));
      out.write(blockIndexChunk.getBlockKey(i));
    }
  }


  /**
   * Writes this chunk into the given output stream in the root block index format. This format is
   * similar to the {@link HFile} version 1 block index format, except that we store on-disk size of
   * the block instead of its uncompressed size.
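   * <p>
   * Each entry is laid out as:
   *
   * <pre>
   * long     block offset
   * int      on-disk block size
   * byte[]   first key, written with {@link Bytes#writeByteArray} (vint length prefix)
   * </pre>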
   * @param blockIndexChunk the block index chunk to write
   * @param out             the data output stream to write the block index to. Typically a stream
   *                        writing into an {@link HFile} block.
   */
  private void writeRoot(BlockIndexChunk blockIndexChunk, DataOutput out) throws IOException {
    for (int i = 0; i < blockIndexChunk.getNumEntries(); ++i) {
      out.writeLong(blockIndexChunk.getBlockOffset(i));
      out.writeInt(blockIndexChunk.getOnDiskDataSize(i));
      Bytes.writeByteArray(out, blockIndexChunk.getBlockKey(i));
    }
  }

  @Override
  public IndexBlockEncoding getIndexBlockEncoding() {
    return IndexBlockEncoding.NONE;
  }

  @Override
  public EncodedSeeker createSeeker() {
    return new NoOpEncodedSeeker();
  }

  @Override
  public String toString() {
    return getClass().getSimpleName();
  }

  protected static class NoOpEncodedSeeker implements EncodedSeeker {

    protected long[] blockOffsets;
    protected int[] blockDataSizes;
    protected int rootCount = 0;

    // Mid-key metadata.
    protected long midLeafBlockOffset = -1;
    protected int midLeafBlockOnDiskSize = -1;
    protected int midKeyEntry = -1;

    private Cell[] blockKeys;
    private CellComparator comparator;
    protected int searchTreeLevel;

    /** Pre-computed mid-key */
    private AtomicReference<Cell> midKey = new AtomicReference<>();

    @Override
    public long heapSize() {
      long heapSize = ClassSize.align(ClassSize.OBJECT);

      // Mid-key metadata.
      heapSize += MID_KEY_METADATA_SIZE;

      if (blockOffsets != null) {
        heapSize += ClassSize.align(ClassSize.ARRAY + blockOffsets.length * Bytes.SIZEOF_LONG);
      }

      if (blockDataSizes != null) {
        heapSize += ClassSize.align(ClassSize.ARRAY + blockDataSizes.length * Bytes.SIZEOF_INT);
      }

      if (blockKeys != null) {
        heapSize += ClassSize.REFERENCE;
        // Add the array plus the per-element reference overhead
        heapSize += ClassSize.align(ClassSize.ARRAY + blockKeys.length * ClassSize.REFERENCE);

        // Add the heap size of each block key
        for (Cell key : blockKeys) {
          heapSize += ClassSize.align(key.heapSize());
        }
      }
      // Add the comparator and the midKey AtomicReference
      heapSize += 2 * ClassSize.REFERENCE;
      // Add rootCount and searchTreeLevel
      heapSize += 2 * Bytes.SIZEOF_INT;

      return ClassSize.align(heapSize);
    }

    @Override
    public boolean isEmpty() {
      return blockKeys.length == 0;
    }

    @Override
    public Cell getRootBlockKey(int i) {
      return blockKeys[i];
    }

    @Override
    public int getRootBlockCount() {
      return rootCount;
    }

    @Override
    public void initRootIndex(HFileBlock blk, int numEntries, CellComparator comparator,
      int treeLevel) throws IOException {
      this.comparator = comparator;
      this.searchTreeLevel = treeLevel;
      init(blk, numEntries);
    }

    private void init(HFileBlock blk, int numEntries) throws IOException {
      DataInputStream in = readRootIndex(blk, numEntries);
      // After reading the root index, subtract the checksum bytes to determine
      // whether mid-key metadata (a long offset, an int on-disk size and an int
      // entry index) is present.
      int checkSumBytes = blk.totalChecksumBytes();
      if ((in.available() - checkSumBytes) < MID_KEY_METADATA_SIZE) {
        // No mid-key metadata available.
        return;
      }
      midLeafBlockOffset = in.readLong();
      midLeafBlockOnDiskSize = in.readInt();
      midKeyEntry = in.readInt();
    }

    private DataInputStream readRootIndex(HFileBlock blk, final int numEntries) throws IOException {
      DataInputStream in = blk.getByteStream();
      readRootIndex(in, numEntries);
      return in;
    }

    private void readRootIndex(DataInput in, final int numEntries) throws IOException {
      blockOffsets = new long[numEntries];
      initialize(numEntries);
      blockDataSizes = new int[numEntries];

      // If index size is zero, no index was written.
      if (numEntries > 0) {
        for (int i = 0; i < numEntries; ++i) {
          long offset = in.readLong();
          int dataSize = in.readInt();
          byte[] key = Bytes.readByteArray(in);
          add(key, offset, dataSize);
        }
      }
    }

    private void initialize(int numEntries) {
      blockKeys = new Cell[numEntries];
    }

    private void add(final byte[] key, final long offset, final int dataSize) {
      blockOffsets[rootCount] = offset;
      // Create the blockKeys as Cells once when the reader is opened
      blockKeys[rootCount] = new KeyValue.KeyOnlyKeyValue(key, 0, key.length);
      blockDataSizes[rootCount] = dataSize;
      rootCount++;
    }

252
253    @Override
254    public Cell midkey(HFile.CachingBlockReader cachingBlockReader) throws IOException {
255      if (rootCount == 0) throw new IOException("HFile empty");
256
257      Cell targetMidKey = this.midKey.get();
258      if (targetMidKey != null) {
259        return targetMidKey;
260      }
261
262      if (midLeafBlockOffset >= 0) {
263        if (cachingBlockReader == null) {
264          throw new IOException(
265            "Have to read the middle leaf block but " + "no block reader available");
266        }
267
268        // Caching, using pread, assuming this is not a compaction.
269        HFileBlock midLeafBlock = cachingBlockReader.readBlock(midLeafBlockOffset,
270          midLeafBlockOnDiskSize, true, true, false, true, BlockType.LEAF_INDEX, null);
271        try {
272          ByteBuff b = midLeafBlock.getBufferWithoutHeader();
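          // The leaf index block has the non-root layout written by writeNonRoot: an
          // entry count, a secondary index of relative offsets, then the
          // (offset, onDiskSize, key) tuples. Recover the mid key's offset and length
          // from two consecutive secondary index elements.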
          int numDataBlocks = b.getIntAfterPosition(0);
          int keyRelOffset = b.getIntAfterPosition(Bytes.SIZEOF_INT * (midKeyEntry + 1));
          int keyLen = b.getIntAfterPosition(Bytes.SIZEOF_INT * (midKeyEntry + 2)) - keyRelOffset
            - SECONDARY_INDEX_ENTRY_OVERHEAD;
          int keyOffset =
            Bytes.SIZEOF_INT * (numDataBlocks + 2) + keyRelOffset + SECONDARY_INDEX_ENTRY_OVERHEAD;
          byte[] bytes = b.toBytes(keyOffset, keyLen);
          targetMidKey = new KeyValue.KeyOnlyKeyValue(bytes, 0, bytes.length);
        } finally {
          midLeafBlock.release();
        }
      } else {
        // The middle of the root-level index.
        targetMidKey = blockKeys[rootCount / 2];
      }

      this.midKey.set(targetMidKey);
      return targetMidKey;
    }

    @Override
    public BlockWithScanInfo loadDataBlockWithScanInfo(Cell key, HFileBlock currentBlock,
      boolean cacheBlocks, boolean pread, boolean isCompaction,
      DataBlockEncoding expectedDataBlockEncoding, HFile.CachingBlockReader cachingBlockReader)
      throws IOException {
      int rootLevelIndex = rootBlockContainingKey(key);
      if (rootLevelIndex < 0 || rootLevelIndex >= blockOffsets.length) {
        return null;
      }

      // the next indexed key
      Cell nextIndexedKey = null;

      // Read the next-level (intermediate or leaf) index block.
      long currentOffset = blockOffsets[rootLevelIndex];
      int currentOnDiskSize = blockDataSizes[rootLevelIndex];

      if (rootLevelIndex < blockKeys.length - 1) {
        nextIndexedKey = blockKeys[rootLevelIndex + 1];
      } else {
        nextIndexedKey = KeyValueScanner.NO_NEXT_INDEXED_KEY;
      }

      int lookupLevel = 1; // How many levels deep we are in our lookup.
      int index = -1;

      HFileBlock block = null;
      KeyValue.KeyOnlyKeyValue tmpNextIndexKV = new KeyValue.KeyOnlyKeyValue();
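      // Walk down the index tree. Each iteration reads one block: intermediate and
      // leaf index blocks redirect the search one level deeper, and reaching a data
      // block ends the search.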
      while (true) {
        try {
          // Must reset to null here: otherwise, if readBlock throws, the finally
          // block would release the previously assigned block a second time.
          // (See HBASE-22422)
          block = null;
          if (currentBlock != null && currentBlock.getOffset() == currentOffset) {
            // Avoid reading the same block again, even with caching turned off.
            // This is crucial for compaction-type workloads which might have
            // caching turned off. This is like a one-block cache inside the
            // scanner.
            block = currentBlock;
          } else {
            // Call HFile's caching block reader API. We always cache index
            // blocks, otherwise we might get terrible performance.
            boolean shouldCache = cacheBlocks || (lookupLevel < searchTreeLevel);
            BlockType expectedBlockType;
            if (lookupLevel < searchTreeLevel - 1) {
              expectedBlockType = BlockType.INTERMEDIATE_INDEX;
            } else if (lookupLevel == searchTreeLevel - 1) {
              expectedBlockType = BlockType.LEAF_INDEX;
            } else {
              // this also accounts for ENCODED_DATA
              expectedBlockType = BlockType.DATA;
            }
            block = cachingBlockReader.readBlock(currentOffset, currentOnDiskSize, shouldCache,
              pread, isCompaction, true, expectedBlockType, expectedDataBlockEncoding);
          }

          if (block == null) {
            throw new IOException("Failed to read block at offset " + currentOffset
              + ", onDiskSize=" + currentOnDiskSize);
          }

          // Found a data block, break the loop and check our level in the tree.
          if (block.getBlockType().isData()) {
            break;
          }

          // Not a data block. This must be a leaf-level or intermediate-level
          // index block. We don't allow going deeper than searchTreeLevel.
          if (++lookupLevel > searchTreeLevel) {
            throw new IOException("Search Tree Level overflow: lookupLevel=" + lookupLevel
              + ", searchTreeLevel=" + searchTreeLevel);
          }

          // Locate the entry corresponding to the given key in the non-root
          // (leaf or intermediate-level) index block.
          ByteBuff buffer = block.getBufferWithoutHeader();
          index = HFileBlockIndex.BlockIndexReader.locateNonRootIndexEntry(buffer, key, comparator);
          if (index == -1) {
            // This should not happen: the search at the previous level already
            // placed us in a block whose first key is at or before the given key.
            throw new IOException("The key " + CellUtil.getCellKeyAsString(key) + " is before the"
              + " first key of the non-root index block " + block);
          }

          currentOffset = buffer.getLong();
          currentOnDiskSize = buffer.getInt();

          // Only update next indexed key if there is a next indexed key in the current level
          byte[] nonRootIndexedKey = getNonRootIndexedKey(buffer, index + 1);
          if (nonRootIndexedKey != null) {
            tmpNextIndexKV.setKey(nonRootIndexedKey, 0, nonRootIndexedKey.length);
            nextIndexedKey = tmpNextIndexKV;
          }
        } finally {
          if (block != null && !block.getBlockType().isData()) {
            // Release the block immediately if it is not the data block
            block.release();
          }
        }
      }

      }

      if (lookupLevel != searchTreeLevel) {
        assert block.getBlockType().isData();
        // We found a data block, but not at the expected depth in the tree.
        // Release the block so its ref count is decremented before we throw.
        if (block != null) {
          block.release();
        }
        throw new IOException("Reached a data block at level " + lookupLevel
          + " but the number of levels is " + searchTreeLevel);
      }

      // Return the data block together with the next indexed key for the scanner.
      return new BlockWithScanInfo(block, nextIndexedKey);
    }

    @Override
    public int rootBlockContainingKey(Cell key) {
      // The comparator is not null here, because this is only used for the root-level index
      int pos = Bytes.binarySearch(blockKeys, key, comparator);
      // pos is between -(blockKeys.length + 1) and blockKeys.length - 1, see
      // binarySearch's javadoc.

      if (pos >= 0) {
        // This means this is an exact match with an element of blockKeys.
        assert pos < blockKeys.length;
        return pos;
      }

      // Otherwise, pos = -(i + 1), where blockKeys[i - 1] < key < blockKeys[i],
      // and i is in [0, blockKeys.length]. We are returning j = i - 1 such that
      // blockKeys[j] <= key < blockKeys[j + 1]. In particular, j = -1 if
      // key < blockKeys[0], meaning the file does not contain the given key.
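      // For example, with blockKeys = {"b", "d", "f"}: "a" returns -1, "b" and
      // "c" return 0, "d" and "e" return 1, and "f" and "g" return 2.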

      int i = -pos - 1;
      assert 0 <= i && i <= blockKeys.length;
      return i - 1;
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("size=").append(rootCount).append("\n");
      for (int i = 0; i < rootCount; i++) {
        sb.append("key=").append(blockKeys[i]).append("\n  offset=").append(blockOffsets[i])
          .append(", dataSize=").append(blockDataSizes[i]).append("\n");
      }
      return sb.toString();
    }

    /**
     * The indexed key at the ith position in the nonRootIndex. The position starts at 0.
     * @param nonRootIndex the non-root index block buffer
     * @param i            the ith position
     * @return the indexed key at the ith position, or null if i is out of range
     */
    protected byte[] getNonRootIndexedKey(ByteBuff nonRootIndex, int i) {
      int numEntries = nonRootIndex.getInt(0);
      if (i < 0 || i >= numEntries) {
        return null;
      }

      // Entries start after the number of entries and the secondary index.
      // The secondary index takes numEntries + 1 ints.
      int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2);
      // The target key's offset relative to the end of the secondary index
      int targetKeyRelOffset = nonRootIndex.getInt(Bytes.SIZEOF_INT * (i + 1));

      // The offset of the target key in the blockIndex buffer
      int targetKeyOffset = entriesOffset // Skip secondary index
        + targetKeyRelOffset // Skip the entries before entry i
        + SECONDARY_INDEX_ENTRY_OVERHEAD; // Skip offset and on-disk-size

      // We subtract the two consecutive secondary index elements, which
      // gives us the size of the whole (offset, onDiskSize, key) tuple. We
      // then need to subtract the overhead of offset and onDiskSize.
      int targetKeyLength = nonRootIndex.getInt(Bytes.SIZEOF_INT * (i + 2)) - targetKeyRelOffset
        - SECONDARY_INDEX_ENTRY_OVERHEAD;

      // TODO check whether we can make BB backed Cell here? So can avoid bytes copy.
      return nonRootIndex.toBytes(targetKeyOffset, targetKeyLength);
    }
  }
}