/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.io.hfile;

import static org.apache.hadoop.hbase.io.hfile.HFileBlockIndex.MID_KEY_METADATA_SIZE;

import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.ExtendedCell;
import org.apache.hadoop.hbase.HBaseInterfaceAudience;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.encoding.IndexBlockEncoding;
import org.apache.hadoop.hbase.nio.ByteBuff;
import org.apache.hadoop.hbase.regionserver.KeyValueScanner;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.ClassSize;
import org.apache.yetus.audience.InterfaceAudience;

/**
 * Does not perform any kind of encoding/decoding.
 */
@InterfaceAudience.Private
public class NoOpIndexBlockEncoder implements HFileIndexBlockEncoder {

  public static final NoOpIndexBlockEncoder INSTANCE = new NoOpIndexBlockEncoder();

  /** Cannot be instantiated. Use {@link #INSTANCE} instead. */
  private NoOpIndexBlockEncoder() {
  }

  @Override
  public void saveMetadata(HFile.Writer writer) {
  }

  @Override
  public void encode(BlockIndexChunk blockIndexChunk, boolean rootIndexBlock, DataOutput out)
    throws IOException {
    if (rootIndexBlock) {
      writeRoot(blockIndexChunk, out);
    } else {
      writeNonRoot(blockIndexChunk, out);
    }
  }

  /**
   * Writes the block index chunk in the non-root index block format. This format contains the
   * number of entries, an index of integer offsets for quick binary search on variable-length
   * records, and tuples of block offset, on-disk block size, and the first key for each entry.
   */
  private void writeNonRoot(BlockIndexChunk blockIndexChunk, DataOutput out) throws IOException {
    // The number of entries in the block.
    out.writeInt(blockIndexChunk.getNumEntries());

    if (
      blockIndexChunk.getSecondaryIndexOffsetMarks().size() != blockIndexChunk.getBlockKeys().size()
    ) {
      throw new IOException("Corrupted block index chunk writer: "
        + blockIndexChunk.getBlockKeys().size() + " entries but "
        + blockIndexChunk.getSecondaryIndexOffsetMarks().size() + " secondary index items");
    }

    // For each entry, write a "secondary index" of relative offsets to the
    // entries from the end of the secondary index.
    // This works, because at read time we read the number of entries
    // and know where the secondary index ends.
    for (int currentSecondaryIndex : blockIndexChunk.getSecondaryIndexOffsetMarks())
      out.writeInt(currentSecondaryIndex);

    // We include one other element in the secondary index to calculate the
    // size of each entry more easily by subtracting secondary index elements.
    out.writeInt(blockIndexChunk.getCurTotalNonRootEntrySize());

    for (int i = 0; i < blockIndexChunk.getNumEntries(); ++i) {
      out.writeLong(blockIndexChunk.getBlockOffset(i));
      out.writeInt(blockIndexChunk.getOnDiskDataSize(i));
      out.write(blockIndexChunk.getBlockKey(i));
    }
  }

  /**
   * Writes this chunk into the given output stream in the root block index format. This format is
   * similar to the {@link HFile} version 1 block index format, except that we store the on-disk
   * size of the block instead of its uncompressed size.
   * @param out the data output stream to write the block index to. Typically a stream writing into
   *            an {@link HFile} block.
   */
  private void writeRoot(BlockIndexChunk blockIndexChunk, DataOutput out) throws IOException {
    for (int i = 0; i < blockIndexChunk.getNumEntries(); ++i) {
      out.writeLong(blockIndexChunk.getBlockOffset(i));
      out.writeInt(blockIndexChunk.getOnDiskDataSize(i));
      Bytes.writeByteArray(out, blockIndexChunk.getBlockKey(i));
    }
  }

  @Override
  public IndexBlockEncoding getIndexBlockEncoding() {
    return IndexBlockEncoding.NONE;
  }

  @Override
  public EncodedSeeker createSeeker() {
    return new NoOpEncodedSeeker();
  }

  @Override
  public String toString() {
    return getClass().getSimpleName();
  }

  @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.UNITTEST)
  public static class NoOpEncodedSeeker implements EncodedSeeker {

    protected long[] blockOffsets;
    protected int[] blockDataSizes;
    protected int rootCount = 0;

    // Mid-key metadata.
    protected long midLeafBlockOffset = -1;
    protected int midLeafBlockOnDiskSize = -1;
    protected int midKeyEntry = -1;

    private ExtendedCell[] blockKeys;
    private CellComparator comparator;
    protected int searchTreeLevel;

    /** Pre-computed mid-key */
    private AtomicReference<ExtendedCell> midKey = new AtomicReference<>();

    @Override
    public long heapSize() {
      long heapSize = ClassSize.align(ClassSize.OBJECT);

      // Mid-key metadata.
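      // These are the three fields read in init(): the mid-leaf block offset (long), its
      // on-disk size (int) and the mid-key entry index (int).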
      heapSize += MID_KEY_METADATA_SIZE;

      if (blockOffsets != null) {
        heapSize += ClassSize.align(ClassSize.ARRAY + blockOffsets.length * Bytes.SIZEOF_LONG);
      }

      if (blockDataSizes != null) {
        heapSize += ClassSize.align(ClassSize.ARRAY + blockDataSizes.length * Bytes.SIZEOF_INT);
      }

      if (blockKeys != null) {
        heapSize += ClassSize.REFERENCE;
        // Adding array + references overhead
        heapSize += ClassSize.align(ClassSize.ARRAY + blockKeys.length * ClassSize.REFERENCE);

        // Adding blockKeys
        for (Cell key : blockKeys) {
          heapSize += ClassSize.align(key.heapSize());
        }
      }
      // Add comparator and the midkey atomicreference
      heapSize += 2 * ClassSize.REFERENCE;
      // Add rootCount and searchTreeLevel
      heapSize += 2 * Bytes.SIZEOF_INT;

      return ClassSize.align(heapSize);
    }

    @Override
    public boolean isEmpty() {
      return blockKeys.length == 0;
    }

    @Override
    public ExtendedCell getRootBlockKey(int i) {
      return blockKeys[i];
    }

    @Override
    public int getRootBlockCount() {
      return rootCount;
    }

    @Override
    public void initRootIndex(HFileBlock blk, int numEntries, CellComparator comparator,
      int treeLevel) throws IOException {
      this.comparator = comparator;
      this.searchTreeLevel = treeLevel;
      init(blk, numEntries);
    }

    private void init(HFileBlock blk, int numEntries) throws IOException {
      DataInputStream in = readRootIndex(blk, numEntries);
      // HFileBlock.getByteStream() returns a byte stream for reading the data (excluding checksum)
      // of the root index block, so after reading the root index there is no need to subtract the
      // checksum bytes.
      if (in.available() < MID_KEY_METADATA_SIZE) {
        // No mid-key metadata available.
        return;
      }
      midLeafBlockOffset = in.readLong();
      midLeafBlockOnDiskSize = in.readInt();
      midKeyEntry = in.readInt();
    }

    private DataInputStream readRootIndex(HFileBlock blk, final int numEntries) throws IOException {
      DataInputStream in = blk.getByteStream();
      readRootIndex(in, numEntries);
      return in;
    }

    private void readRootIndex(DataInput in, final int numEntries) throws IOException {
      blockOffsets = new long[numEntries];
      initialize(numEntries);
      blockDataSizes = new int[numEntries];

      // If index size is zero, no index was written.
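      // Each entry below is the block offset (long), its on-disk size (int) and the block's
      // first key as a byte array, mirroring what writeRoot() emits.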
      if (numEntries > 0) {
        for (int i = 0; i < numEntries; ++i) {
          long offset = in.readLong();
          int dataSize = in.readInt();
          byte[] key = Bytes.readByteArray(in);
          add(key, offset, dataSize);
        }
      }
    }

    private void initialize(int numEntries) {
      blockKeys = new ExtendedCell[numEntries];
    }

    private void add(final byte[] key, final long offset, final int dataSize) {
      blockOffsets[rootCount] = offset;
      // Create the blockKeys as Cells once when the reader is opened
      blockKeys[rootCount] = new KeyValue.KeyOnlyKeyValue(key, 0, key.length);
      blockDataSizes[rootCount] = dataSize;
      rootCount++;
    }

    @Override
    public ExtendedCell midkey(HFile.CachingBlockReader cachingBlockReader) throws IOException {
      if (rootCount == 0) {
        throw new IOException("HFile empty");
      }

      ExtendedCell targetMidKey = this.midKey.get();
      if (targetMidKey != null) {
        return targetMidKey;
      }

      if (midLeafBlockOffset >= 0) {
        if (cachingBlockReader == null) {
          throw new IOException(
            "Have to read the middle leaf block but no block reader available");
        }

        // Caching, using pread, assuming this is not a compaction.
        HFileBlock midLeafBlock = cachingBlockReader.readBlock(midLeafBlockOffset,
          midLeafBlockOnDiskSize, true, true, false, true, BlockType.LEAF_INDEX, null);
        try {
          byte[] bytes = HFileBlockIndex.BlockIndexReader
            .getNonRootIndexedKey(midLeafBlock.getBufferWithoutHeader(), midKeyEntry);
          assert bytes != null;
          targetMidKey = new KeyValue.KeyOnlyKeyValue(bytes, 0, bytes.length);
        } finally {
          midLeafBlock.release();
        }
      } else {
        // The middle of the root-level index.
        targetMidKey = blockKeys[rootCount / 2];
      }

      this.midKey.set(targetMidKey);
      return targetMidKey;
    }

    @Override
    public BlockWithScanInfo loadDataBlockWithScanInfo(ExtendedCell key, HFileBlock currentBlock,
      boolean cacheBlocks, boolean pread, boolean isCompaction,
      DataBlockEncoding expectedDataBlockEncoding, HFile.CachingBlockReader cachingBlockReader)
      throws IOException {
      int rootLevelIndex = rootBlockContainingKey(key);
      if (rootLevelIndex < 0 || rootLevelIndex >= blockOffsets.length) {
        return null;
      }

      // the next indexed key
      ExtendedCell nextIndexedKey = null;

      // Read the next-level (intermediate or leaf) index block.
      long currentOffset = blockOffsets[rootLevelIndex];
      int currentOnDiskSize = blockDataSizes[rootLevelIndex];

      if (rootLevelIndex < blockKeys.length - 1) {
        nextIndexedKey = blockKeys[rootLevelIndex + 1];
      } else {
        nextIndexedKey = KeyValueScanner.NO_NEXT_INDEXED_KEY;
      }

      int lookupLevel = 1; // How many levels deep we are in our lookup.
      int index = -1;

      HFileBlock block = null;
      KeyValue.KeyOnlyKeyValue tmpNextIndexKV = new KeyValue.KeyOnlyKeyValue();
      while (true) {
        try {
          // Must initialize it to null here, because if we don't and an exception happens in
          // readBlock, then we'll release the previously assigned block twice in the finally
          // block. (See HBASE-22422)
          block = null;
          if (currentBlock != null && currentBlock.getOffset() == currentOffset) {
            // Avoid reading the same block again, even with caching turned off.
            // This is crucial for compaction-type workloads which might have
            // caching turned off. This is like a one-block cache inside the
            // scanner.
            block = currentBlock;
          } else {
            // Call HFile's caching block reader API. We always cache index
            // blocks, otherwise we might get terrible performance.
            boolean shouldCache = cacheBlocks || (lookupLevel < searchTreeLevel);
            BlockType expectedBlockType;
            if (lookupLevel < searchTreeLevel - 1) {
              expectedBlockType = BlockType.INTERMEDIATE_INDEX;
            } else if (lookupLevel == searchTreeLevel - 1) {
              expectedBlockType = BlockType.LEAF_INDEX;
            } else {
              // this also accounts for ENCODED_DATA
              expectedBlockType = BlockType.DATA;
            }
            block = cachingBlockReader.readBlock(currentOffset, currentOnDiskSize, shouldCache,
              pread, isCompaction, true, expectedBlockType, expectedDataBlockEncoding);
          }

          if (block == null) {
            throw new IOException("Failed to read block at offset " + currentOffset
              + ", onDiskSize=" + currentOnDiskSize);
          }

          // Found a data block, break the loop and check our level in the tree.
          if (block.getBlockType().isData()) {
            break;
          }

          // Not a data block. This must be a leaf-level or intermediate-level
          // index block. We don't allow going deeper than searchTreeLevel.
          if (++lookupLevel > searchTreeLevel) {
            throw new IOException("Search Tree Level overflow: lookupLevel=" + lookupLevel
              + ", searchTreeLevel=" + searchTreeLevel);
          }

          // Locate the entry corresponding to the given key in the non-root
          // (leaf or intermediate-level) index block.
          ByteBuff buffer = block.getBufferWithoutHeader();
          index = HFileBlockIndex.BlockIndexReader.locateNonRootIndexEntry(buffer, key, comparator);
          if (index == -1) {
            // This has to be changed
            // For now change this to key value
            throw new IOException("The key " + CellUtil.getCellKeyAsString(key) + " is before the"
              + " first key of the non-root index block " + block);
          }

          currentOffset = buffer.getLong();
          currentOnDiskSize = buffer.getInt();

          // Only update the next indexed key if there is a next indexed key in the current level
          byte[] nonRootIndexedKey =
            HFileBlockIndex.BlockIndexReader.getNonRootIndexedKey(buffer, index + 1);
          if (nonRootIndexedKey != null) {
            tmpNextIndexKV.setKey(nonRootIndexedKey, 0, nonRootIndexedKey.length);
            nextIndexedKey = tmpNextIndexKV;
          }
        } finally {
          if (block != null && !block.getBlockType().isData()) {
            // Release the block immediately if it is not the data block
            block.release();
          }
        }
      }

      if (lookupLevel != searchTreeLevel) {
        assert block.getBlockType().isData();
        // Though we have retrieved a data block, we have found an issue in the retrieved data
        // block. Hence release the block so that its ref count can be decremented.
        if (block != null) {
          block.release();
        }
        throw new IOException("Reached a data block at level " + lookupLevel
          + " but the number of levels is " + searchTreeLevel);
      }

      // set the next indexed key for the current block.
      return new BlockWithScanInfo(block, nextIndexedKey);
    }

    @Override
    public int rootBlockContainingKey(Cell key) {
      // Here the comparator should not be null as this happens for the root-level block
      int pos = Bytes.binarySearch(blockKeys, key, comparator);
      // pos is between -(blockKeys.length + 1) and blockKeys.length - 1, see
      // binarySearch's javadoc.

      if (pos >= 0) {
        // This means this is an exact match with an element of blockKeys.
        assert pos < blockKeys.length;
        return pos;
      }

      // Otherwise, pos = -(i + 1), where blockKeys[i - 1] < key < blockKeys[i],
      // and i is in [0, blockKeys.length]. We are returning j = i - 1 such that
      // blockKeys[j] <= key < blockKeys[j + 1]. In particular, j = -1 if
      // key < blockKeys[0], meaning the file does not contain the given key.

      int i = -pos - 1;
      assert 0 <= i && i <= blockKeys.length;
      return i - 1;
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("size=").append(rootCount).append("\n");
      for (int i = 0; i < rootCount; i++) {
        sb.append("key=").append(blockKeys[i]).append("\n  offset=").append(blockOffsets[i])
          .append(", dataSize=").append(blockDataSizes[i]).append("\n");
      }
      return sb.toString();
    }
  }
}