/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.io.hfile;

import static org.apache.hadoop.hbase.io.hfile.HFileBlockIndex.MID_KEY_METADATA_SIZE;
import static org.apache.hadoop.hbase.io.hfile.HFileBlockIndex.SECONDARY_INDEX_ENTRY_OVERHEAD;

import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.encoding.IndexBlockEncoding;
import org.apache.hadoop.hbase.nio.ByteBuff;
import org.apache.hadoop.hbase.regionserver.KeyValueScanner;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.ClassSize;
import org.apache.yetus.audience.InterfaceAudience;

/**
 * Does not perform any kind of encoding/decoding.
 */
@InterfaceAudience.Private
public class NoOpIndexBlockEncoder implements HFileIndexBlockEncoder {

  public static final NoOpIndexBlockEncoder INSTANCE = new NoOpIndexBlockEncoder();

  /** Cannot be instantiated. Use {@link #INSTANCE} instead. */
  private NoOpIndexBlockEncoder() {
  }

  @Override
  public void saveMetadata(HFile.Writer writer) {
  }

  @Override
  public void encode(BlockIndexChunk blockIndexChunk, boolean rootIndexBlock, DataOutput out)
    throws IOException {
    if (rootIndexBlock) {
      writeRoot(blockIndexChunk, out);
    } else {
      writeNonRoot(blockIndexChunk, out);
    }
  }

  /**
   * Writes the block index chunk in the non-root index block format. This format contains the
   * number of entries, an index of integer offsets for quick binary search on variable-length
   * records, and tuples of block offset, on-disk block size, and the first key for each entry.
   */
  private void writeNonRoot(BlockIndexChunk blockIndexChunk, DataOutput out) throws IOException {
    // The number of entries in the block.
    out.writeInt(blockIndexChunk.getNumEntries());

    if (
      blockIndexChunk.getSecondaryIndexOffsetMarks().size() != blockIndexChunk.getBlockKeys().size()
    ) {
      throw new IOException("Corrupted block index chunk writer: "
        + blockIndexChunk.getBlockKeys().size() + " entries but "
        + blockIndexChunk.getSecondaryIndexOffsetMarks().size() + " secondary index items");
    }

    // For each entry, write a "secondary index" of relative offsets to the
    // entries, measured from the end of the secondary index. This works
    // because at read time we read the number of entries and therefore know
    // where the secondary index ends.
    for (int currentSecondaryIndex : blockIndexChunk.getSecondaryIndexOffsetMarks()) {
      out.writeInt(currentSecondaryIndex);
    }

    // We include one extra element in the secondary index so that the size of
    // each entry can be computed by subtracting consecutive secondary index
    // elements.
    out.writeInt(blockIndexChunk.getCurTotalNonRootEntrySize());

    for (int i = 0; i < blockIndexChunk.getNumEntries(); ++i) {
      out.writeLong(blockIndexChunk.getBlockOffset(i));
      out.writeInt(blockIndexChunk.getOnDiskDataSize(i));
      out.write(blockIndexChunk.getBlockKey(i));
    }
  }
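  // For reference, a sketch of the on-disk layout writeNonRoot produces. The
  // sizes are illustrative assumptions (two entries with 3- and 5-byte keys);
  // the field order is exactly what the method above emits.
  // SECONDARY_INDEX_ENTRY_OVERHEAD is the 8-byte offset plus the 4-byte
  // on-disk size that precede each key, i.e. 12 bytes.
  //
  //   int 2                                  // number of entries
  //   int 0                                  // entry 0 starts 0 bytes past the secondary index
  //   int 15                                 // entry 1 starts at 12 + 3 = 15
  //   int 32                                 // one extra element: total entry bytes, 15 + 12 + 5
  //   long offset0, int size0, 3 key bytes   // entry 0
  //   long offset1, int size1, 5 key bytes   // entry 1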
  /**
   * Writes this chunk into the given output stream in the root block index format. This format is
   * similar to the {@link HFile} version 1 block index format, except that we store the on-disk
   * size of the block instead of its uncompressed size.
   * @param out the data output stream to write the block index to. Typically a stream writing into
   *            an {@link HFile} block.
   */
  private void writeRoot(BlockIndexChunk blockIndexChunk, DataOutput out) throws IOException {
    for (int i = 0; i < blockIndexChunk.getNumEntries(); ++i) {
      out.writeLong(blockIndexChunk.getBlockOffset(i));
      out.writeInt(blockIndexChunk.getOnDiskDataSize(i));
      Bytes.writeByteArray(out, blockIndexChunk.getBlockKey(i));
    }
  }

  @Override
  public IndexBlockEncoding getIndexBlockEncoding() {
    return IndexBlockEncoding.NONE;
  }

  @Override
  public EncodedSeeker createSeeker() {
    return new NoOpEncodedSeeker();
  }

  @Override
  public String toString() {
    return getClass().getSimpleName();
  }
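  // A minimal usage sketch, assuming a caller that already has a populated
  // BlockIndexChunk (building the chunk is outside the scope of this class):
  //
  //   DataOutputStream out = new DataOutputStream(someOutputStream);
  //   NoOpIndexBlockEncoder.INSTANCE.encode(chunk, false, out); // non-root format
  //   NoOpIndexBlockEncoder.INSTANCE.encode(chunk, true, out);  // root format
  //   EncodedSeeker seeker = NoOpIndexBlockEncoder.INSTANCE.createSeeker();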
  protected static class NoOpEncodedSeeker implements EncodedSeeker {

    protected long[] blockOffsets;
    protected int[] blockDataSizes;
    protected int rootCount = 0;

    // Mid-key metadata.
    protected long midLeafBlockOffset = -1;
    protected int midLeafBlockOnDiskSize = -1;
    protected int midKeyEntry = -1;

    private Cell[] blockKeys;
    private CellComparator comparator;
    protected int searchTreeLevel;

    /** Pre-computed mid-key */
    private AtomicReference<Cell> midKey = new AtomicReference<>();

    @Override
    public long heapSize() {
      long heapSize = ClassSize.align(ClassSize.OBJECT);

      // Mid-key metadata.
      heapSize += MID_KEY_METADATA_SIZE;

      if (blockOffsets != null) {
        heapSize += ClassSize.align(ClassSize.ARRAY + blockOffsets.length * Bytes.SIZEOF_LONG);
      }

      if (blockDataSizes != null) {
        heapSize += ClassSize.align(ClassSize.ARRAY + blockDataSizes.length * Bytes.SIZEOF_INT);
      }

      if (blockKeys != null) {
        heapSize += ClassSize.REFERENCE;
        // Add the array plus per-element reference overhead.
        heapSize += ClassSize.align(ClassSize.ARRAY + blockKeys.length * ClassSize.REFERENCE);

        // Add the blockKeys themselves.
        for (Cell key : blockKeys) {
          heapSize += ClassSize.align(key.heapSize());
        }
      }
      // Add the comparator and the midKey AtomicReference.
      heapSize += 2 * ClassSize.REFERENCE;
      // Add rootCount and searchTreeLevel.
      heapSize += 2 * Bytes.SIZEOF_INT;

      return ClassSize.align(heapSize);
    }

    @Override
    public boolean isEmpty() {
      return blockKeys.length == 0;
    }

    @Override
    public Cell getRootBlockKey(int i) {
      return blockKeys[i];
    }

    @Override
    public int getRootBlockCount() {
      return rootCount;
    }

    @Override
    public void initRootIndex(HFileBlock blk, int numEntries, CellComparator comparator,
      int treeLevel) throws IOException {
      this.comparator = comparator;
      this.searchTreeLevel = treeLevel;
      init(blk, numEntries);
    }

    private void init(HFileBlock blk, int numEntries) throws IOException {
      DataInputStream in = readRootIndex(blk, numEntries);
      // After reading the root index, the checksum bytes have to be
      // subtracted to determine whether the mid-key metadata is present.
      int checkSumBytes = blk.totalChecksumBytes();
      if ((in.available() - checkSumBytes) < MID_KEY_METADATA_SIZE) {
        // No mid-key metadata available.
        return;
      }
      midLeafBlockOffset = in.readLong();
      midLeafBlockOnDiskSize = in.readInt();
      midKeyEntry = in.readInt();
    }

    private DataInputStream readRootIndex(HFileBlock blk, final int numEntries) throws IOException {
      DataInputStream in = blk.getByteStream();
      readRootIndex(in, numEntries);
      return in;
    }

    private void readRootIndex(DataInput in, final int numEntries) throws IOException {
      blockOffsets = new long[numEntries];
      initialize(numEntries);
      blockDataSizes = new int[numEntries];

      // If the index size is zero, no index was written.
      if (numEntries > 0) {
        for (int i = 0; i < numEntries; ++i) {
          long offset = in.readLong();
          int dataSize = in.readInt();
          byte[] key = Bytes.readByteArray(in);
          add(key, offset, dataSize);
        }
      }
    }

    private void initialize(int numEntries) {
      blockKeys = new Cell[numEntries];
    }

    private void add(final byte[] key, final long offset, final int dataSize) {
      blockOffsets[rootCount] = offset;
      // Create the blockKeys as Cells once when the reader is opened.
      blockKeys[rootCount] = new KeyValue.KeyOnlyKeyValue(key, 0, key.length);
      blockDataSizes[rootCount] = dataSize;
      rootCount++;
    }
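    // For reference, a sketch of what readRootIndex and init consume for a
    // two-entry root block, mirroring what writeRoot emits (key lengths are
    // illustrative; Bytes.writeByteArray prefixes each key with a vint length):
    //
    //   long offset0, int size0, vint keyLen0, keyLen0 key bytes   // entry 0
    //   long offset1, int size1, vint keyLen1, keyLen1 key bytes   // entry 1
    //   [optional mid-key metadata, MID_KEY_METADATA_SIZE bytes:
    //    long midLeafBlockOffset, int midLeafBlockOnDiskSize, int midKeyEntry]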
    @Override
    public Cell midkey(HFile.CachingBlockReader cachingBlockReader) throws IOException {
      if (rootCount == 0) {
        throw new IOException("HFile empty");
      }

      Cell targetMidKey = this.midKey.get();
      if (targetMidKey != null) {
        return targetMidKey;
      }

      if (midLeafBlockOffset >= 0) {
        if (cachingBlockReader == null) {
          throw new IOException(
            "Have to read the middle leaf block but no block reader available");
        }

        // Caching, using pread, assuming this is not a compaction.
        HFileBlock midLeafBlock = cachingBlockReader.readBlock(midLeafBlockOffset,
          midLeafBlockOnDiskSize, true, true, false, true, BlockType.LEAF_INDEX, null);
        try {
          ByteBuff b = midLeafBlock.getBufferWithoutHeader();
          int numDataBlocks = b.getIntAfterPosition(0);
          int keyRelOffset = b.getIntAfterPosition(Bytes.SIZEOF_INT * (midKeyEntry + 1));
          int keyLen = b.getIntAfterPosition(Bytes.SIZEOF_INT * (midKeyEntry + 2)) - keyRelOffset
            - SECONDARY_INDEX_ENTRY_OVERHEAD;
          int keyOffset =
            Bytes.SIZEOF_INT * (numDataBlocks + 2) + keyRelOffset + SECONDARY_INDEX_ENTRY_OVERHEAD;
          byte[] bytes = b.toBytes(keyOffset, keyLen);
          targetMidKey = new KeyValue.KeyOnlyKeyValue(bytes, 0, bytes.length);
        } finally {
          midLeafBlock.release();
        }
      } else {
        // The middle of the root-level index.
        targetMidKey = blockKeys[rootCount / 2];
      }

      this.midKey.set(targetMidKey);
      return targetMidKey;
    }
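    // Worked example (illustrative) of the mid-key arithmetic above, reusing
    // the two-entry non-root layout sketched after writeNonRoot. With
    // numDataBlocks = 2 and midKeyEntry = 1:
    //
    //   keyRelOffset = int at byte 4 * (1 + 1) = 8                      -> 15
    //   keyLen       = int at byte 4 * (1 + 2) = 12, i.e. 32, minus 15,
    //                  minus SECONDARY_INDEX_ENTRY_OVERHEAD (12)        -> 5
    //   keyOffset    = 4 * (2 + 2) + 15 + 12                            -> 43
    //
    // which is exactly where entry 1's 5-byte key starts in that layout.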
    @Override
    public BlockWithScanInfo loadDataBlockWithScanInfo(Cell key, HFileBlock currentBlock,
      boolean cacheBlocks, boolean pread, boolean isCompaction,
      DataBlockEncoding expectedDataBlockEncoding, HFile.CachingBlockReader cachingBlockReader)
      throws IOException {
      int rootLevelIndex = rootBlockContainingKey(key);
      if (rootLevelIndex < 0 || rootLevelIndex >= blockOffsets.length) {
        return null;
      }

      // The next indexed key, if any, at the current level.
      Cell nextIndexedKey = null;

      // Read the next-level (intermediate or leaf) index block.
      long currentOffset = blockOffsets[rootLevelIndex];
      int currentOnDiskSize = blockDataSizes[rootLevelIndex];

      if (rootLevelIndex < blockKeys.length - 1) {
        nextIndexedKey = blockKeys[rootLevelIndex + 1];
      } else {
        nextIndexedKey = KeyValueScanner.NO_NEXT_INDEXED_KEY;
      }

      int lookupLevel = 1; // How many levels deep we are in our lookup.
      int index = -1;

      HFileBlock block = null;
      KeyValue.KeyOnlyKeyValue tmpNextIndexKV = new KeyValue.KeyOnlyKeyValue();
      while (true) {
        try {
          // Must reset to null here: if we don't and an exception happens in
          // readBlock, we would release the previously assigned block twice in
          // the finally block. (See HBASE-22422)
          block = null;
          if (currentBlock != null && currentBlock.getOffset() == currentOffset) {
            // Avoid reading the same block again, even with caching turned off.
            // This is crucial for compaction-type workloads, which might have
            // caching turned off. This acts like a one-block cache inside the
            // scanner.
            block = currentBlock;
          } else {
            // Call HFile's caching block reader API. We always cache index
            // blocks; otherwise we might get terrible performance.
            boolean shouldCache = cacheBlocks || (lookupLevel < searchTreeLevel);
            BlockType expectedBlockType;
            if (lookupLevel < searchTreeLevel - 1) {
              expectedBlockType = BlockType.INTERMEDIATE_INDEX;
            } else if (lookupLevel == searchTreeLevel - 1) {
              expectedBlockType = BlockType.LEAF_INDEX;
            } else {
              // This also accounts for ENCODED_DATA.
              expectedBlockType = BlockType.DATA;
            }
            block = cachingBlockReader.readBlock(currentOffset, currentOnDiskSize, shouldCache,
              pread, isCompaction, true, expectedBlockType, expectedDataBlockEncoding);
          }

          if (block == null) {
            throw new IOException("Failed to read block at offset " + currentOffset
              + ", onDiskSize=" + currentOnDiskSize);
          }

          // Found a data block: break the loop and check our level in the tree.
          if (block.getBlockType().isData()) {
            break;
          }

          // Not a data block. This must be a leaf-level or intermediate-level
          // index block. We don't allow going deeper than searchTreeLevel.
          if (++lookupLevel > searchTreeLevel) {
            throw new IOException("Search Tree Level overflow: lookupLevel=" + lookupLevel
              + ", searchTreeLevel=" + searchTreeLevel);
          }

          // Locate the entry corresponding to the given key in the non-root
          // (leaf or intermediate-level) index block.
          ByteBuff buffer = block.getBufferWithoutHeader();
          index = HFileBlockIndex.BlockIndexReader.locateNonRootIndexEntry(buffer, key, comparator);
          if (index == -1) {
            // TODO: this error handling has to be changed; for now we report
            // the key as a string.
            throw new IOException("The key " + CellUtil.getCellKeyAsString(key) + " is before the"
              + " first key of the non-root index block " + block);
          }

          currentOffset = buffer.getLong();
          currentOnDiskSize = buffer.getInt();

          // Only update the next indexed key if there is one at the current level.
          byte[] nonRootIndexedKey = getNonRootIndexedKey(buffer, index + 1);
          if (nonRootIndexedKey != null) {
            tmpNextIndexKV.setKey(nonRootIndexedKey, 0, nonRootIndexedKey.length);
            nextIndexedKey = tmpNextIndexKV;
          }
        } finally {
          if (block != null && !block.getBlockType().isData()) {
            // Release the block immediately if it is not the data block.
            block.release();
          }
        }
      }

      if (lookupLevel != searchTreeLevel) {
        assert block.getBlockType().isData();
        // We retrieved a data block at the wrong level of the tree. Release it
        // so that its reference count is decremented before we fail.
        if (block != null) {
          block.release();
        }
        throw new IOException("Reached a data block at level " + lookupLevel
          + " but the number of levels is " + searchTreeLevel);
      }

      // Set the next indexed key for the current block.
      return new BlockWithScanInfo(block, nextIndexedKey);
    }
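    // Worked example (illustrative) for rootBlockContainingKey below. With
    // root keys [b, d, f], Bytes.binarySearch returns 1 for an exact match on
    // d, which is returned as-is. For a key c falling between b and d it
    // returns -(1 + 1) = -2, which the method maps to i = -(-2) - 1 = 1 and
    // then returns i - 1 = 0, i.e. the block starting at b. A key before b
    // yields -1: the file does not contain that key.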
    @Override
    public int rootBlockContainingKey(Cell key) {
      // The comparator should not be null here, as this happens for the root-level block.
      int pos = Bytes.binarySearch(blockKeys, key, comparator);
      // pos is between -(blockKeys.length + 1) and blockKeys.length - 1; see
      // binarySearch's javadoc.

      if (pos >= 0) {
        // This means this is an exact match with an element of blockKeys.
        assert pos < blockKeys.length;
        return pos;
      }

      // Otherwise, pos = -(i + 1), where blockKeys[i - 1] < key < blockKeys[i],
      // and i is in [0, blockKeys.length]. We are returning j = i - 1 such that
      // blockKeys[j] <= key < blockKeys[j + 1]. In particular, j = -1 if
      // key < blockKeys[0], meaning the file does not contain the given key.

      int i = -pos - 1;
      assert 0 <= i && i <= blockKeys.length;
      return i - 1;
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("size=").append(rootCount).append("\n");
      for (int i = 0; i < rootCount; i++) {
        sb.append("key=").append(blockKeys[i]).append("\n  offset=").append(blockOffsets[i])
          .append(", dataSize=").append(blockDataSizes[i]).append("\n");
      }
      return sb.toString();
    }

    /**
     * The indexed key at the ith position in the nonRootIndex. The position starts at 0.
     * @param nonRootIndex the buffer of the non-root index block
     * @param i            the ith position
     * @return the indexed key at the ith position in the nonRootIndex, or null if i is out of
     *         range
     */
    protected byte[] getNonRootIndexedKey(ByteBuff nonRootIndex, int i) {
      int numEntries = nonRootIndex.getInt(0);
      if (i < 0 || i >= numEntries) {
        return null;
      }

      // Entries start after the number of entries and the secondary index.
      // The secondary index takes numEntries + 1 ints.
      int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2);
      // The target key's offset relative to the end of the secondary index.
      int targetKeyRelOffset = nonRootIndex.getInt(Bytes.SIZEOF_INT * (i + 1));

      // The offset of the target key in the blockIndex buffer.
      int targetKeyOffset = entriesOffset // Skip secondary index
        + targetKeyRelOffset // Skip all entries until mid
        + SECONDARY_INDEX_ENTRY_OVERHEAD; // Skip offset and on-disk-size

      // We subtract two consecutive secondary index elements, which gives us
      // the size of the whole (offset, onDiskSize, key) tuple. We then need to
      // subtract the overhead of the offset and onDiskSize fields.
      int targetKeyLength = nonRootIndex.getInt(Bytes.SIZEOF_INT * (i + 2)) - targetKeyRelOffset
        - SECONDARY_INDEX_ENTRY_OVERHEAD;

      // TODO: check whether we can make a ByteBuffer-backed Cell here, to avoid the bytes copy.
      return nonRootIndex.toBytes(targetKeyOffset, targetKeyLength);
    }
  }
}