001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with this 004 * work for additional information regarding copyright ownership. The ASF 005 * licenses this file to you under the Apache License, Version 2.0 (the 006 * "License"); you may not use this file except in compliance with the License. 007 * You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 013 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 014 * License for the specific language governing permissions and limitations 015 * under the License. 016 */ 017package org.apache.hadoop.hbase.regionserver; 018 019import java.io.ByteArrayInputStream; 020import java.io.ByteArrayOutputStream; 021import java.io.DataOutputStream; 022import java.io.IOException; 023import java.io.InputStream; 024import java.text.DecimalFormat; 025import java.util.ArrayList; 026import java.util.Iterator; 027import java.util.List; 028import java.util.Locale; 029 030import org.apache.hadoop.conf.Configuration; 031import org.apache.hadoop.fs.FileSystem; 032import org.apache.hadoop.fs.Path; 033import org.apache.hadoop.hbase.Cell; 034import org.apache.hadoop.hbase.HBaseConfiguration; 035import org.apache.hadoop.hbase.KeyValue; 036import org.apache.hadoop.hbase.KeyValueUtil; 037import org.apache.hadoop.hbase.io.compress.Compression; 038import org.apache.hadoop.hbase.io.compress.Compression.Algorithm; 039import org.apache.hadoop.hbase.io.encoding.DataBlockEncoder; 040import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding; 041import org.apache.hadoop.hbase.io.encoding.EncodedDataBlock; 042import org.apache.hadoop.hbase.io.hfile.CacheConfig; 043import org.apache.hadoop.hbase.io.hfile.HFileBlock; 044import org.apache.hadoop.hbase.io.hfile.HFileContext; 045import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder; 046import org.apache.hadoop.hbase.io.hfile.HFileReaderImpl; 047import org.apache.hadoop.hbase.util.Bytes; 048import org.apache.hadoop.io.WritableUtils; 049import org.apache.hadoop.io.compress.CompressionOutputStream; 050import org.apache.hadoop.io.compress.Compressor; 051import org.apache.hadoop.io.compress.Decompressor; 052import org.slf4j.Logger; 053import org.slf4j.LoggerFactory; 054import org.apache.hbase.thirdparty.org.apache.commons.cli.CommandLine; 055import org.apache.hbase.thirdparty.org.apache.commons.cli.CommandLineParser; 056import org.apache.hbase.thirdparty.org.apache.commons.cli.Option; 057import org.apache.hbase.thirdparty.org.apache.commons.cli.Options; 058import org.apache.hbase.thirdparty.org.apache.commons.cli.ParseException; 059import org.apache.hbase.thirdparty.org.apache.commons.cli.PosixParser; 060 061/** 062 * Tests various algorithms for key compression on an existing HFile. Useful 063 * for testing, debugging and benchmarking. 064 */ 065public class DataBlockEncodingTool { 066 private static final Logger LOG = LoggerFactory.getLogger( 067 DataBlockEncodingTool.class); 068 069 private static final boolean includesMemstoreTS = true; 070 071 /** 072 * How many times to run the benchmark. More times means better data in terms 073 * of statistics but slower execution. Has to be strictly larger than 074 * {@link #DEFAULT_BENCHMARK_N_OMIT}. 075 */ 076 private static final int DEFAULT_BENCHMARK_N_TIMES = 12; 077 078 /** 079 * How many first runs should not be included in the benchmark. Done in order 080 * to exclude setup cost. 081 */ 082 private static final int DEFAULT_BENCHMARK_N_OMIT = 2; 083 084 /** HFile name to be used in benchmark */ 085 private static final String OPT_HFILE_NAME = "f"; 086 087 /** Maximum number of key/value pairs to process in a single benchmark run */ 088 private static final String OPT_KV_LIMIT = "n"; 089 090 /** Whether to run a benchmark to measure read throughput */ 091 private static final String OPT_MEASURE_THROUGHPUT = "b"; 092 093 /** If this is specified, no correctness testing will be done */ 094 private static final String OPT_OMIT_CORRECTNESS_TEST = "c"; 095 096 /** What compression algorithm to test */ 097 private static final String OPT_COMPRESSION_ALGORITHM = "a"; 098 099 /** Number of times to run each benchmark */ 100 private static final String OPT_BENCHMARK_N_TIMES = "t"; 101 102 /** Number of first runs of every benchmark to omit from statistics */ 103 private static final String OPT_BENCHMARK_N_OMIT = "omit"; 104 105 /** Compression algorithm to use if not specified on the command line */ 106 private static final Algorithm DEFAULT_COMPRESSION = 107 Compression.Algorithm.GZ; 108 109 private static final DecimalFormat DELIMITED_DECIMAL_FORMAT = 110 new DecimalFormat(); 111 112 static { 113 DELIMITED_DECIMAL_FORMAT.setGroupingSize(3); 114 } 115 116 private static final String PCT_FORMAT = "%.2f %%"; 117 private static final String INT_FORMAT = "%d"; 118 119 private static int benchmarkNTimes = DEFAULT_BENCHMARK_N_TIMES; 120 private static int benchmarkNOmit = DEFAULT_BENCHMARK_N_OMIT; 121 122 private List<EncodedDataBlock> codecs = new ArrayList<>(); 123 private long totalPrefixLength = 0; 124 private long totalKeyLength = 0; 125 private long totalValueLength = 0; 126 private long totalKeyRedundancyLength = 0; 127 private long totalCFLength = 0; 128 129 private byte[] rawKVs; 130 private boolean useHBaseChecksum = false; 131 132 private final String compressionAlgorithmName; 133 private final Algorithm compressionAlgorithm; 134 private final Compressor compressor; 135 private final Decompressor decompressor; 136 137 // Check if HFile use Tag. 138 private static boolean USE_TAG = false; 139 140 private enum Manipulation { 141 ENCODING, 142 DECODING, 143 COMPRESSION, 144 DECOMPRESSION; 145 146 @Override 147 public String toString() { 148 String s = super.toString(); 149 StringBuilder sb = new StringBuilder(); 150 sb.append(s.charAt(0)); 151 sb.append(s.substring(1).toLowerCase(Locale.ROOT)); 152 return sb.toString(); 153 } 154 } 155 156 /** 157 * @param compressionAlgorithmName What kind of algorithm should be used 158 * as baseline for comparison (e.g. lzo, gz). 159 */ 160 public DataBlockEncodingTool(String compressionAlgorithmName) { 161 this.compressionAlgorithmName = compressionAlgorithmName; 162 this.compressionAlgorithm = Compression.getCompressionAlgorithmByName( 163 compressionAlgorithmName); 164 this.compressor = this.compressionAlgorithm.getCompressor(); 165 this.decompressor = this.compressionAlgorithm.getDecompressor(); 166 } 167 168 /** 169 * Check statistics for given HFile for different data block encoders. 170 * @param scanner Of file which will be compressed. 171 * @param kvLimit Maximal count of KeyValue which will be processed. 172 * @throws IOException thrown if scanner is invalid 173 */ 174 public void checkStatistics(final KeyValueScanner scanner, final int kvLimit) 175 throws IOException { 176 scanner.seek(KeyValue.LOWESTKEY); 177 178 KeyValue currentKV; 179 180 byte[] previousKey = null; 181 byte[] currentKey; 182 183 DataBlockEncoding[] encodings = DataBlockEncoding.values(); 184 185 ByteArrayOutputStream uncompressedOutputStream = 186 new ByteArrayOutputStream(); 187 188 int j = 0; 189 while ((currentKV = KeyValueUtil.ensureKeyValue(scanner.next())) != null && j < kvLimit) { 190 // Iterates through key/value pairs 191 j++; 192 currentKey = currentKV.getKey(); 193 if (previousKey != null) { 194 for (int i = 0; i < previousKey.length && i < currentKey.length && 195 previousKey[i] == currentKey[i]; ++i) { 196 totalKeyRedundancyLength++; 197 } 198 } 199 200 // Add tagsLen zero to cells don't include tags. Since the process of 201 // scanner converts byte array to KV would abandon tagsLen part if tagsLen 202 // is zero. But we still needs the tagsLen part to check if current cell 203 // include tags. If USE_TAG is true, HFile contains cells with tags, 204 // if the cell tagsLen equals 0, it means other cells may have tags. 205 if (USE_TAG && currentKV.getTagsLength() == 0) { 206 uncompressedOutputStream.write(currentKV.getBuffer(), 207 currentKV.getOffset(), currentKV.getLength()); 208 // write tagsLen = 0. 209 uncompressedOutputStream.write(Bytes.toBytes((short) 0)); 210 } else { 211 uncompressedOutputStream.write(currentKV.getBuffer(), 212 currentKV.getOffset(), currentKV.getLength()); 213 } 214 215 if(includesMemstoreTS) { 216 WritableUtils.writeVLong( 217 new DataOutputStream(uncompressedOutputStream), currentKV.getSequenceId()); 218 } 219 220 previousKey = currentKey; 221 222 int kLen = currentKV.getKeyLength(); 223 int vLen = currentKV.getValueLength(); 224 int cfLen = currentKV.getFamilyLength(currentKV.getFamilyOffset()); 225 int restLen = currentKV.getLength() - kLen - vLen; 226 227 totalKeyLength += kLen; 228 totalValueLength += vLen; 229 totalPrefixLength += restLen; 230 totalCFLength += cfLen; 231 } 232 233 rawKVs = uncompressedOutputStream.toByteArray(); 234 for (DataBlockEncoding encoding : encodings) { 235 if (encoding == DataBlockEncoding.NONE) { 236 continue; 237 } 238 DataBlockEncoder d = encoding.getEncoder(); 239 HFileContext meta = new HFileContextBuilder() 240 .withDataBlockEncoding(encoding) 241 .withCompression(Compression.Algorithm.NONE) 242 .withIncludesMvcc(includesMemstoreTS) 243 .withIncludesTags(USE_TAG).build(); 244 codecs.add(new EncodedDataBlock(d, encoding, rawKVs, meta )); 245 } 246 } 247 248 /** 249 * Verify if all data block encoders are working properly. 250 * 251 * @param scanner Of file which was compressed. 252 * @param kvLimit Maximal count of KeyValue which will be processed. 253 * @return true if all data block encoders compressed/decompressed correctly. 254 * @throws IOException thrown if scanner is invalid 255 */ 256 public boolean verifyCodecs(final KeyValueScanner scanner, final int kvLimit) 257 throws IOException { 258 KeyValue currentKv; 259 260 scanner.seek(KeyValue.LOWESTKEY); 261 List<Iterator<Cell>> codecIterators = new ArrayList<>(); 262 for(EncodedDataBlock codec : codecs) { 263 codecIterators.add(codec.getIterator(HFileBlock.headerSize(useHBaseChecksum))); 264 } 265 266 int j = 0; 267 while ((currentKv = KeyValueUtil.ensureKeyValue(scanner.next())) != null && j < kvLimit) { 268 // Iterates through key/value pairs 269 ++j; 270 for (Iterator<Cell> it : codecIterators) { 271 Cell c = it.next(); 272 KeyValue codecKv = KeyValueUtil.ensureKeyValue(c); 273 if (codecKv == null || 0 != Bytes.compareTo( 274 codecKv.getBuffer(), codecKv.getOffset(), codecKv.getLength(), 275 currentKv.getBuffer(), currentKv.getOffset(), 276 currentKv.getLength())) { 277 if (codecKv == null) { 278 LOG.error("There is a bug in codec " + it + 279 " it returned null KeyValue,"); 280 } else { 281 int prefix = 0; 282 int limitLength = 2 * Bytes.SIZEOF_INT + 283 Math.min(codecKv.getLength(), currentKv.getLength()); 284 while (prefix < limitLength && 285 codecKv.getBuffer()[prefix + codecKv.getOffset()] == 286 currentKv.getBuffer()[prefix + currentKv.getOffset()]) { 287 prefix++; 288 } 289 290 LOG.error("There is bug in codec " + it.toString() + 291 "\n on element " + j + 292 "\n codecKv.getKeyLength() " + codecKv.getKeyLength() + 293 "\n codecKv.getValueLength() " + codecKv.getValueLength() + 294 "\n codecKv.getLength() " + codecKv.getLength() + 295 "\n currentKv.getKeyLength() " + currentKv.getKeyLength() + 296 "\n currentKv.getValueLength() " + currentKv.getValueLength() + 297 "\n codecKv.getLength() " + currentKv.getLength() + 298 "\n currentKV rowLength " + currentKv.getRowLength() + 299 " familyName " + currentKv.getFamilyLength() + 300 " qualifier " + currentKv.getQualifierLength() + 301 "\n prefix " + prefix + 302 "\n codecKv '" + Bytes.toStringBinary(codecKv.getBuffer(), 303 codecKv.getOffset(), prefix) + "' diff '" + 304 Bytes.toStringBinary(codecKv.getBuffer(), 305 codecKv.getOffset() + prefix, codecKv.getLength() - 306 prefix) + "'" + 307 "\n currentKv '" + Bytes.toStringBinary( 308 currentKv.getBuffer(), 309 currentKv.getOffset(), prefix) + "' diff '" + 310 Bytes.toStringBinary(currentKv.getBuffer(), 311 currentKv.getOffset() + prefix, currentKv.getLength() - 312 prefix) + "'" 313 ); 314 } 315 return false; 316 } 317 } 318 } 319 320 LOG.info("Verification was successful!"); 321 322 return true; 323 } 324 325 /** 326 * Benchmark codec's speed. 327 */ 328 public void benchmarkCodecs() throws IOException { 329 LOG.info("Starting a throughput benchmark for data block encoding codecs"); 330 int prevTotalSize = -1; 331 for (EncodedDataBlock codec : codecs) { 332 prevTotalSize = benchmarkEncoder(prevTotalSize, codec); 333 } 334 335 benchmarkDefaultCompression(prevTotalSize, rawKVs); 336 } 337 338 /** 339 * Benchmark compression/decompression throughput. 340 * @param previousTotalSize Total size used for verification. Use -1 if 341 * unknown. 342 * @param codec Tested encoder. 343 * @return Size of uncompressed data. 344 */ 345 private int benchmarkEncoder(int previousTotalSize, EncodedDataBlock codec) { 346 int prevTotalSize = previousTotalSize; 347 int totalSize = 0; 348 349 // decompression time 350 List<Long> durations = new ArrayList<>(); 351 for (int itTime = 0; itTime < benchmarkNTimes; ++itTime) { 352 totalSize = 0; 353 354 Iterator<Cell> it; 355 356 it = codec.getIterator(HFileBlock.headerSize(useHBaseChecksum)); 357 358 // count only the algorithm time, without memory allocations 359 // (expect first time) 360 final long startTime = System.nanoTime(); 361 while (it.hasNext()) { 362 totalSize += KeyValueUtil.ensureKeyValue(it.next()).getLength(); 363 } 364 final long finishTime = System.nanoTime(); 365 if (itTime >= benchmarkNOmit) { 366 durations.add(finishTime - startTime); 367 } 368 369 if (prevTotalSize != -1 && prevTotalSize != totalSize) { 370 throw new IllegalStateException(String.format( 371 "Algorithm '%s' decoded data to different size", codec.toString())); 372 } 373 prevTotalSize = totalSize; 374 } 375 376 List<Long> encodingDurations = new ArrayList<>(); 377 for (int itTime = 0; itTime < benchmarkNTimes; ++itTime) { 378 final long startTime = System.nanoTime(); 379 codec.encodeData(); 380 final long finishTime = System.nanoTime(); 381 if (itTime >= benchmarkNOmit) { 382 encodingDurations.add(finishTime - startTime); 383 } 384 } 385 386 System.out.println(codec.toString() + ":"); 387 printBenchmarkResult(totalSize, encodingDurations, Manipulation.ENCODING); 388 printBenchmarkResult(totalSize, durations, Manipulation.DECODING); 389 System.out.println(); 390 391 return prevTotalSize; 392 } 393 394 private void benchmarkDefaultCompression(int totalSize, byte[] rawBuffer) 395 throws IOException { 396 benchmarkAlgorithm(compressionAlgorithm, 397 compressionAlgorithmName.toUpperCase(Locale.ROOT), rawBuffer, 0, totalSize); 398 } 399 400 /** 401 * Check decompress performance of a given algorithm and print it. 402 * @param algorithm Compression algorithm. 403 * @param name Name of algorithm. 404 * @param buffer Buffer to be compressed. 405 * @param offset Position of the beginning of the data. 406 * @param length Length of data in buffer. 407 * @throws IOException 408 */ 409 public void benchmarkAlgorithm(Compression.Algorithm algorithm, String name, 410 byte[] buffer, int offset, int length) throws IOException { 411 System.out.println(name + ":"); 412 413 // compress it 414 List<Long> compressDurations = new ArrayList<>(); 415 ByteArrayOutputStream compressedStream = new ByteArrayOutputStream(); 416 CompressionOutputStream compressingStream = 417 algorithm.createPlainCompressionStream(compressedStream, compressor); 418 try { 419 for (int itTime = 0; itTime < benchmarkNTimes; ++itTime) { 420 final long startTime = System.nanoTime(); 421 // The compressedStream should reset before compressingStream resetState since in GZ 422 // resetStatue will write header in the outputstream. 423 compressedStream.reset(); 424 compressingStream.resetState(); 425 compressingStream.write(buffer, offset, length); 426 compressingStream.flush(); 427 compressedStream.toByteArray(); 428 429 final long finishTime = System.nanoTime(); 430 431 // add time record 432 if (itTime >= benchmarkNOmit) { 433 compressDurations.add(finishTime - startTime); 434 } 435 } 436 } catch (IOException e) { 437 throw new RuntimeException(String.format( 438 "Benchmark, or encoding algorithm '%s' cause some stream problems", 439 name), e); 440 } 441 compressingStream.close(); 442 printBenchmarkResult(length, compressDurations, Manipulation.COMPRESSION); 443 444 byte[] compBuffer = compressedStream.toByteArray(); 445 446 // uncompress it several times and measure performance 447 List<Long> durations = new ArrayList<>(); 448 for (int itTime = 0; itTime < benchmarkNTimes; ++itTime) { 449 final long startTime = System.nanoTime(); 450 byte[] newBuf = new byte[length + 1]; 451 452 try { 453 ByteArrayInputStream downStream = new ByteArrayInputStream(compBuffer, 454 0, compBuffer.length); 455 InputStream decompressedStream = algorithm.createDecompressionStream( 456 downStream, decompressor, 0); 457 458 int destOffset = 0; 459 int nextChunk; 460 while ((nextChunk = decompressedStream.available()) > 0) { 461 destOffset += decompressedStream.read(newBuf, destOffset, nextChunk); 462 } 463 decompressedStream.close(); 464 465 } catch (IOException e) { 466 throw new RuntimeException(String.format( 467 "Decoding path in '%s' algorithm cause exception ", name), e); 468 } 469 470 final long finishTime = System.nanoTime(); 471 472 // check correctness 473 if (0 != Bytes.compareTo(buffer, 0, length, newBuf, 0, length)) { 474 int prefix = 0; 475 for(; prefix < buffer.length && prefix < newBuf.length; ++prefix) { 476 if (buffer[prefix] != newBuf[prefix]) { 477 break; 478 } 479 } 480 throw new RuntimeException(String.format( 481 "Algorithm '%s' is corrupting the data", name)); 482 } 483 484 // add time record 485 if (itTime >= benchmarkNOmit) { 486 durations.add(finishTime - startTime); 487 } 488 } 489 printBenchmarkResult(length, durations, Manipulation.DECOMPRESSION); 490 System.out.println(); 491 } 492 493 private static final double BYTES_IN_MB = 1024 * 1024.0; 494 private static final double NS_IN_SEC = 1000.0 * 1000.0 * 1000.0; 495 private static final double MB_SEC_COEF = NS_IN_SEC / BYTES_IN_MB; 496 497 private static void printBenchmarkResult(int totalSize, 498 List<Long> durationsInNanoSec, Manipulation manipulation) { 499 final int n = durationsInNanoSec.size(); 500 long meanTime = 0; 501 for (long time : durationsInNanoSec) { 502 meanTime += time; 503 } 504 meanTime /= n; 505 506 double meanMBPerSec = totalSize * MB_SEC_COEF / meanTime; 507 double mbPerSecSTD = 0; 508 if (n > 0) { 509 for (long time : durationsInNanoSec) { 510 double mbPerSec = totalSize * MB_SEC_COEF / time; 511 double dev = mbPerSec - meanMBPerSec; 512 mbPerSecSTD += dev * dev; 513 } 514 mbPerSecSTD = Math.sqrt(mbPerSecSTD / n); 515 } 516 517 outputTuple(manipulation + " performance", "%6.2f MB/s (+/- %.2f MB/s)", 518 meanMBPerSec, mbPerSecSTD); 519 } 520 521 private static void outputTuple(String caption, String format, 522 Object... values) { 523 if (format.startsWith(INT_FORMAT)) { 524 format = "%s" + format.substring(INT_FORMAT.length()); 525 values[0] = DELIMITED_DECIMAL_FORMAT.format(values[0]); 526 } 527 528 StringBuilder sb = new StringBuilder(); 529 sb.append(" "); 530 sb.append(caption); 531 sb.append(":"); 532 533 String v = String.format(format, values); 534 int padding = 60 - sb.length() - v.length(); 535 for (int i = 0; i < padding; ++i) { 536 sb.append(' '); 537 } 538 sb.append(v); 539 System.out.println(sb); 540 } 541 542 /** 543 * Display statistics of different compression algorithms. 544 * @throws IOException 545 */ 546 public void displayStatistics() throws IOException { 547 final String comprAlgo = compressionAlgorithmName.toUpperCase(Locale.ROOT); 548 long rawBytes = totalKeyLength + totalPrefixLength + totalValueLength; 549 550 System.out.println("Raw data size:"); 551 outputTuple("Raw bytes", INT_FORMAT, rawBytes); 552 outputTuplePct("Key bytes", totalKeyLength); 553 outputTuplePct("Value bytes", totalValueLength); 554 outputTuplePct("KV infrastructure", totalPrefixLength); 555 outputTuplePct("CF overhead", totalCFLength); 556 outputTuplePct("Total key redundancy", totalKeyRedundancyLength); 557 558 int compressedSize = EncodedDataBlock.getCompressedSize( 559 compressionAlgorithm, compressor, rawKVs, 0, rawKVs.length); 560 outputTuple(comprAlgo + " only size", INT_FORMAT, 561 compressedSize); 562 outputSavings(comprAlgo + " only", compressedSize, rawBytes); 563 System.out.println(); 564 565 for (EncodedDataBlock codec : codecs) { 566 System.out.println(codec.toString()); 567 long encodedBytes = codec.getSize(); 568 outputTuple("Encoded bytes", INT_FORMAT, encodedBytes); 569 outputSavings("Key encoding", encodedBytes - totalValueLength, 570 rawBytes - totalValueLength); 571 outputSavings("Total encoding", encodedBytes, rawBytes); 572 573 int encodedCompressedSize = codec.getEncodedCompressedSize( 574 compressionAlgorithm, compressor); 575 outputTuple("Encoding + " + comprAlgo + " size", INT_FORMAT, 576 encodedCompressedSize); 577 outputSavings("Encoding + " + comprAlgo, encodedCompressedSize, rawBytes); 578 outputSavings("Encoding with " + comprAlgo, encodedCompressedSize, 579 compressedSize); 580 581 System.out.println(); 582 } 583 } 584 585 private void outputTuplePct(String caption, long size) { 586 outputTuple(caption, INT_FORMAT + " (" + PCT_FORMAT + ")", 587 size, size * 100.0 / rawKVs.length); 588 } 589 590 private void outputSavings(String caption, long part, long whole) { 591 double pct = 100.0 * (1 - 1.0 * part / whole); 592 double times = whole * 1.0 / part; 593 outputTuple(caption + " savings", PCT_FORMAT + " (%.2f x)", 594 pct, times); 595 } 596 597 /** 598 * Test a data block encoder on the given HFile. Output results to console. 599 * @param kvLimit The limit of KeyValue which will be analyzed. 600 * @param hfilePath an HFile path on the file system. 601 * @param compressionName Compression algorithm used for comparison. 602 * @param doBenchmark Run performance benchmarks. 603 * @param doVerify Verify correctness. 604 * @throws IOException When pathName is incorrect. 605 */ 606 public static void testCodecs(Configuration conf, int kvLimit, 607 String hfilePath, String compressionName, boolean doBenchmark, 608 boolean doVerify) throws IOException { 609 // create environment 610 Path path = new Path(hfilePath); 611 CacheConfig cacheConf = new CacheConfig(conf); 612 FileSystem fs = FileSystem.get(conf); 613 HStoreFile hsf = new HStoreFile(fs, path, conf, cacheConf, BloomType.NONE, true); 614 hsf.initReader(); 615 StoreFileReader reader = hsf.getReader(); 616 reader.loadFileInfo(); 617 KeyValueScanner scanner = reader.getStoreFileScanner(true, true, 618 false, hsf.getMaxMemStoreTS(), 0, false); 619 USE_TAG = reader.getHFileReader().getFileContext().isIncludesTags(); 620 // run the utilities 621 DataBlockEncodingTool comp = new DataBlockEncodingTool(compressionName); 622 int majorVersion = reader.getHFileVersion(); 623 comp.useHBaseChecksum = majorVersion > 2 || 624 (majorVersion == 2 && 625 reader.getHFileMinorVersion() >= HFileReaderImpl.MINOR_VERSION_WITH_CHECKSUM); 626 comp.checkStatistics(scanner, kvLimit); 627 if (doVerify) { 628 comp.verifyCodecs(scanner, kvLimit); 629 } 630 if (doBenchmark) { 631 comp.benchmarkCodecs(); 632 } 633 comp.displayStatistics(); 634 635 // cleanup 636 scanner.close(); 637 reader.close(cacheConf.shouldEvictOnClose()); 638 } 639 640 private static void printUsage(Options options) { 641 System.err.println("Usage:"); 642 System.err.println(String.format("./hbase %s <options>", 643 DataBlockEncodingTool.class.getName())); 644 System.err.println("Options:"); 645 for (Object it : options.getOptions()) { 646 Option opt = (Option) it; 647 if (opt.hasArg()) { 648 System.err.println(String.format("-%s %s: %s", opt.getOpt(), 649 opt.getArgName(), opt.getDescription())); 650 } else { 651 System.err.println(String.format("-%s: %s", opt.getOpt(), 652 opt.getDescription())); 653 } 654 } 655 } 656 657 /** 658 * A command line interface to benchmarks. Parses command-line arguments and 659 * runs the appropriate benchmarks. 660 * @param args Should have length at least 1 and holds the file path to HFile. 661 * @throws IOException If you specified the wrong file. 662 */ 663 public static void main(final String[] args) throws IOException { 664 // set up user arguments 665 Options options = new Options(); 666 options.addOption(OPT_HFILE_NAME, true, "HFile to analyse (REQUIRED)"); 667 options.getOption(OPT_HFILE_NAME).setArgName("FILENAME"); 668 options.addOption(OPT_KV_LIMIT, true, 669 "Maximum number of KeyValues to process. A benchmark stops running " + 670 "after iterating over this many KV pairs."); 671 options.getOption(OPT_KV_LIMIT).setArgName("NUMBER"); 672 options.addOption(OPT_MEASURE_THROUGHPUT, false, 673 "Measure read throughput"); 674 options.addOption(OPT_OMIT_CORRECTNESS_TEST, false, 675 "Omit corectness tests."); 676 options.addOption(OPT_COMPRESSION_ALGORITHM, true, 677 "What kind of compression algorithm use for comparison."); 678 options.addOption(OPT_BENCHMARK_N_TIMES, 679 true, "Number of times to run each benchmark. Default value: " + 680 DEFAULT_BENCHMARK_N_TIMES); 681 options.addOption(OPT_BENCHMARK_N_OMIT, true, 682 "Number of first runs of every benchmark to exclude from " 683 + "statistics (" + DEFAULT_BENCHMARK_N_OMIT 684 + " by default, so that " + "only the last " 685 + (DEFAULT_BENCHMARK_N_TIMES - DEFAULT_BENCHMARK_N_OMIT) 686 + " times are included in statistics.)"); 687 688 // parse arguments 689 CommandLineParser parser = new PosixParser(); 690 CommandLine cmd = null; 691 try { 692 cmd = parser.parse(options, args); 693 } catch (ParseException e) { 694 System.err.println("Could not parse arguments!"); 695 System.exit(-1); 696 return; // avoid warning 697 } 698 699 int kvLimit = Integer.MAX_VALUE; 700 if (cmd.hasOption(OPT_KV_LIMIT)) { 701 kvLimit = Integer.parseInt(cmd.getOptionValue(OPT_KV_LIMIT)); 702 if (kvLimit <= 0) { 703 LOG.error("KV_LIMIT should not less than 1."); 704 } 705 } 706 707 // basic argument sanity checks 708 if (!cmd.hasOption(OPT_HFILE_NAME)) { 709 LOG.error("Please specify HFile name using the " + OPT_HFILE_NAME 710 + " option"); 711 printUsage(options); 712 System.exit(-1); 713 } 714 715 String pathName = cmd.getOptionValue(OPT_HFILE_NAME); 716 String compressionName = DEFAULT_COMPRESSION.getName(); 717 if (cmd.hasOption(OPT_COMPRESSION_ALGORITHM)) { 718 compressionName = 719 cmd.getOptionValue(OPT_COMPRESSION_ALGORITHM).toLowerCase(Locale.ROOT); 720 } 721 boolean doBenchmark = cmd.hasOption(OPT_MEASURE_THROUGHPUT); 722 boolean doVerify = !cmd.hasOption(OPT_OMIT_CORRECTNESS_TEST); 723 724 if (cmd.hasOption(OPT_BENCHMARK_N_TIMES)) { 725 benchmarkNTimes = Integer.valueOf(cmd.getOptionValue( 726 OPT_BENCHMARK_N_TIMES)); 727 } 728 if (cmd.hasOption(OPT_BENCHMARK_N_OMIT)) { 729 benchmarkNOmit = 730 Integer.valueOf(cmd.getOptionValue(OPT_BENCHMARK_N_OMIT)); 731 } 732 if (benchmarkNTimes < benchmarkNOmit) { 733 LOG.error("The number of times to run each benchmark (" 734 + benchmarkNTimes 735 + ") must be greater than the number of benchmark runs to exclude " 736 + "from statistics (" + benchmarkNOmit + ")"); 737 System.exit(1); 738 } 739 LOG.info("Running benchmark " + benchmarkNTimes + " times. " + 740 "Excluding the first " + benchmarkNOmit + " times from statistics."); 741 742 final Configuration conf = HBaseConfiguration.create(); 743 testCodecs(conf, kvLimit, pathName, compressionName, doBenchmark, doVerify); 744 } 745 746}