View Javadoc

1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  package org.apache.hadoop.hbase.util;
21  
22  import java.io.DataInput;
23  import java.io.DataOutput;
24  import java.io.IOException;
25  import java.nio.ByteBuffer;
26  import java.text.NumberFormat;
27  import java.util.Random;
28  
29  import org.apache.hadoop.hbase.classification.InterfaceAudience;
30  import org.apache.hadoop.hbase.KeyValue;
31  import org.apache.hadoop.hbase.KeyValue.KVComparator;
32  import org.apache.hadoop.io.Writable;
33  
34  /**
35   * Implements a <i>Bloom filter</i>, as defined by Bloom in 1970.
36   * <p>
37   * The Bloom filter is a data structure that was introduced in 1970 and that has
38   * been adopted by the networking research community in the past decade thanks
39   * to the bandwidth efficiencies that it offers for the transmission of set
40   * membership information between networked hosts. A sender encodes the
41   * information into a bit vector, the Bloom filter, that is more compact than a
42   * conventional representation. Computation and space costs for construction are
43   * linear in the number of elements. The receiver uses the filter to test
44   * whether various elements are members of the set. Though the filter will
45   * occasionally return a false positive, it will never return a false negative.
46   * When creating the filter, the sender can choose its desired point in a
47   * trade-off between the false positive rate and the size.
48   *
49   * <p>
50   * Originally inspired by <a href="http://www.one-lab.org">European Commission
51   * One-Lab Project 034819</a>.
52   *
53   * Bloom filters are very sensitive to the number of elements inserted into
54   * them. For HBase, the number of entries depends on the size of the data stored
55   * in the column. Currently the default region size is 256MB, so entry count ~=
56   * 256MB / (average value size for column). Despite this rule of thumb, there is
57   * no efficient way to calculate the entry count after compactions. Therefore,
58   * it is often easier to use a dynamic bloom filter that will add extra space
59   * instead of allowing the error rate to grow.
60   *
61   * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey
62   * .pdf )
63   *
64   * m denotes the number of bits in the Bloom filter (bitSize) n denotes the
65   * number of elements inserted into the Bloom filter (maxKeys) k represents the
66   * number of hash functions used (nbHash) e represents the desired false
67   * positive rate for the bloom (err)
68   *
69   * If we fix the error rate (e) and know the number of entries, then the optimal
70   * bloom size m = -(n * ln(err) / (ln(2)^2) ~= n * ln(err) / ln(0.6185)
71   *
72   * The probability of false positives is minimized when k = m/n ln(2).
73   *
74   * @see BloomFilter The general behavior of a filter
75   *
76   * @see <a
77   *      href="http://portal.acm.org/citation.cfm?id=362692&dl=ACM&coll=portal">
78   *      Space/Time Trade-Offs in Hash Coding with Allowable Errors</a>
79   */
80  @InterfaceAudience.Private
81  public class ByteBloomFilter implements BloomFilter, BloomFilterWriter {
82  
83    /** Current file format version */
84    public static final int VERSION = 1;
85  
86    /** Bytes (B) in the array. This actually has to fit into an int. */
87    protected long byteSize;
88    /** Number of hash functions */
89    protected int hashCount;
90    /** Hash type */
91    protected final int hashType;
92    /** Hash Function */
93    protected final Hash hash;
94    /** Keys currently in the bloom */
95    protected int keyCount;
96    /** Max Keys expected for the bloom */
97    protected int maxKeys;
98    /** Bloom bits */
99    protected ByteBuffer bloom;
100 
101   /** Record separator for the Bloom filter statistics human-readable string */
102   public static final String STATS_RECORD_SEP = "; ";
103 
104   /**
105    * Used in computing the optimal Bloom filter size. This approximately equals
106    * 0.480453.
107    */
108   public static final double LOG2_SQUARED = Math.log(2) * Math.log(2);
109 
110   /**
111    * A random number generator to use for "fake lookups" when testing to
112    * estimate the ideal false positive rate.
113    */
114   private static Random randomGeneratorForTest;
115 
116   /** Bit-value lookup array to prevent doing the same work over and over */
117   private static final byte [] bitvals = {
118     (byte) 0x01,
119     (byte) 0x02,
120     (byte) 0x04,
121     (byte) 0x08,
122     (byte) 0x10,
123     (byte) 0x20,
124     (byte) 0x40,
125     (byte) 0x80
126   };
127 
128   /**
129    * Loads bloom filter meta data from file input.
130    * @param meta stored bloom meta data
131    * @throws IllegalArgumentException meta data is invalid
132    */
133   public ByteBloomFilter(DataInput meta)
134       throws IOException, IllegalArgumentException {
135     this.byteSize = meta.readInt();
136     this.hashCount = meta.readInt();
137     this.hashType = meta.readInt();
138     this.keyCount = meta.readInt();
139     this.maxKeys = this.keyCount;
140 
141     this.hash = Hash.getInstance(this.hashType);
142     if (hash == null) {
143       throw new IllegalArgumentException("Invalid hash type: " + hashType);
144     }
145     sanityCheck();
146   }
147 
148   /**
149    * @param maxKeys
150    * @param errorRate
151    * @return the number of bits for a Bloom filter than can hold the given
152    *         number of keys and provide the given error rate, assuming that the
153    *         optimal number of hash functions is used and it does not have to
154    *         be an integer.
155    */
156   public static long computeBitSize(long maxKeys, double errorRate) {
157     return (long) Math.ceil(maxKeys * (-Math.log(errorRate) / LOG2_SQUARED));
158   }
159 
160   /**
161    * The maximum number of keys we can put into a Bloom filter of a certain
162    * size to maintain the given error rate, assuming the number of hash
163    * functions is chosen optimally and does not even have to be an integer
164    * (hence the "ideal" in the function name).
165    *
166    * @param bitSize
167    * @param errorRate
168    * @return maximum number of keys that can be inserted into the Bloom filter
169    * @see #computeMaxKeys(long, double, int) for a more precise estimate
170    */
171   public static long idealMaxKeys(long bitSize, double errorRate) {
172     // The reason we need to use floor here is that otherwise we might put
173     // more keys in a Bloom filter than is allowed by the target error rate.
174     return (long) (bitSize * (LOG2_SQUARED / -Math.log(errorRate)));
175   }
176 
177   /**
178    * The maximum number of keys we can put into a Bloom filter of a certain
179    * size to get the given error rate, with the given number of hash functions.
180    *
181    * @param bitSize
182    * @param errorRate
183    * @param hashCount
184    * @return the maximum number of keys that can be inserted in a Bloom filter
185    *         to maintain the target error rate, if the number of hash functions
186    *         is provided.
187    */
188   public static long computeMaxKeys(long bitSize, double errorRate,
189       int hashCount) {
190     return (long) (-bitSize * 1.0 / hashCount *
191         Math.log(1 - Math.exp(Math.log(errorRate) / hashCount)));
192   }
193 
194   /**
195    * Computes the error rate for this Bloom filter, taking into account the
196    * actual number of hash functions and keys inserted. The return value of
197    * this function changes as a Bloom filter is being populated. Used for
198    * reporting the actual error rate of compound Bloom filters when writing
199    * them out.
200    *
201    * @return error rate for this particular Bloom filter
202    */
203   public double actualErrorRate() {
204     return actualErrorRate(keyCount, byteSize * 8, hashCount);
205   }
206 
207   /**
208    * Computes the actual error rate for the given number of elements, number
209    * of bits, and number of hash functions. Taken directly from the
210    * <a href=
211    * "http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives"
212    * > Wikipedia Bloom filter article</a>.
213    *
214    * @param maxKeys
215    * @param bitSize
216    * @param functionCount
217    * @return the actual error rate
218    */
219   public static double actualErrorRate(long maxKeys, long bitSize,
220       int functionCount) {
221     return Math.exp(Math.log(1 - Math.exp(-functionCount * maxKeys * 1.0
222         / bitSize)) * functionCount);
223   }
224 
225   /**
226    * Increases the given byte size of a Bloom filter until it can be folded by
227    * the given factor.
228    *
229    * @param bitSize
230    * @param foldFactor
231    * @return Foldable byte size
232    */
233   public static int computeFoldableByteSize(long bitSize, int foldFactor) {
234     long byteSizeLong = (bitSize + 7) / 8;
235     int mask = (1 << foldFactor) - 1;
236     if ((mask & byteSizeLong) != 0) {
237       byteSizeLong >>= foldFactor;
238       ++byteSizeLong;
239       byteSizeLong <<= foldFactor;
240     }
241     if (byteSizeLong > Integer.MAX_VALUE) {
242       throw new IllegalArgumentException("byteSize=" + byteSizeLong + " too "
243           + "large for bitSize=" + bitSize + ", foldFactor=" + foldFactor);
244     }
245     return (int) byteSizeLong;
246   }
247 
248   private static int optimalFunctionCount(int maxKeys, long bitSize) {
249     long i = bitSize / maxKeys;
250     double result = Math.ceil(Math.log(2) * i);
251     if (result > Integer.MAX_VALUE){
252       throw new IllegalArgumentException("result too large for integer value.");
253     }
254     return (int)result;
255   }
256 
257   /** Private constructor used by other constructors. */
258   private ByteBloomFilter(int hashType) {
259     this.hashType = hashType;
260     this.hash = Hash.getInstance(hashType);
261   }
262 
263   /**
264    * Determines & initializes bloom filter meta data from user config. Call
265    * {@link #allocBloom()} to allocate bloom filter data.
266    *
267    * @param maxKeys Maximum expected number of keys that will be stored in this
268    *          bloom
269    * @param errorRate Desired false positive error rate. Lower rate = more
270    *          storage required
271    * @param hashType Type of hash function to use
272    * @param foldFactor When finished adding entries, you may be able to 'fold'
273    *          this bloom to save space. Tradeoff potentially excess bytes in
274    *          bloom for ability to fold if keyCount is exponentially greater
275    *          than maxKeys.
276    * @throws IllegalArgumentException
277    */
278   public ByteBloomFilter(int maxKeys, double errorRate, int hashType,
279       int foldFactor) throws IllegalArgumentException {
280     this(hashType);
281 
282     long bitSize = computeBitSize(maxKeys, errorRate);
283     hashCount = optimalFunctionCount(maxKeys, bitSize);
284     this.maxKeys = maxKeys;
285 
286     // increase byteSize so folding is possible
287     byteSize = computeFoldableByteSize(bitSize, foldFactor);
288 
289     sanityCheck();
290   }
291 
292   /**
293    * Creates a Bloom filter of the given size.
294    *
295    * @param byteSizeHint the desired number of bytes for the Bloom filter bit
296    *          array. Will be increased so that folding is possible.
297    * @param errorRate target false positive rate of the Bloom filter
298    * @param hashType Bloom filter hash function type
299    * @param foldFactor
300    * @return the new Bloom filter of the desired size
301    */
302   public static ByteBloomFilter createBySize(int byteSizeHint,
303       double errorRate, int hashType, int foldFactor) {
304     ByteBloomFilter bbf = new ByteBloomFilter(hashType);
305 
306     bbf.byteSize = computeFoldableByteSize(byteSizeHint * 8L, foldFactor);
307     long bitSize = bbf.byteSize * 8;
308     bbf.maxKeys = (int) idealMaxKeys(bitSize, errorRate);
309     bbf.hashCount = optimalFunctionCount(bbf.maxKeys, bitSize);
310 
311     // Adjust max keys to bring error rate closer to what was requested,
312     // because byteSize was adjusted to allow for folding, and hashCount was
313     // rounded.
314     bbf.maxKeys = (int) computeMaxKeys(bitSize, errorRate, bbf.hashCount);
315 
316     return bbf;
317   }
318 
319   /**
320    * Creates another similar Bloom filter. Does not copy the actual bits, and
321    * sets the new filter's key count to zero.
322    *
323    * @return a Bloom filter with the same configuration as this
324    */
325   public ByteBloomFilter createAnother() {
326     ByteBloomFilter bbf = new ByteBloomFilter(hashType);
327     bbf.byteSize = byteSize;
328     bbf.hashCount = hashCount;
329     bbf.maxKeys = maxKeys;
330     return bbf;
331   }
332 
333   @Override
334   public void allocBloom() {
335     if (this.bloom != null) {
336       throw new IllegalArgumentException("can only create bloom once.");
337     }
338     this.bloom = ByteBuffer.allocate((int)this.byteSize);
339     assert this.bloom.hasArray();
340   }
341 
342   void sanityCheck() throws IllegalArgumentException {
343     if(0 >= this.byteSize || this.byteSize > Integer.MAX_VALUE) {
344       throw new IllegalArgumentException("Invalid byteSize: " + this.byteSize);
345     }
346 
347     if(this.hashCount <= 0) {
348       throw new IllegalArgumentException("Hash function count must be > 0");
349     }
350 
351     if (this.hash == null) {
352       throw new IllegalArgumentException("hashType must be known");
353     }
354 
355     if (this.keyCount < 0) {
356       throw new IllegalArgumentException("must have positive keyCount");
357     }
358   }
359 
360   void bloomCheck(ByteBuffer bloom)  throws IllegalArgumentException {
361     if (this.byteSize != bloom.limit()) {
362       throw new IllegalArgumentException(
363           "Configured bloom length should match actual length");
364     }
365   }
366 
367   public void add(byte [] buf) {
368     add(buf, 0, buf.length);
369   }
370 
371   @Override
372   public void add(byte [] buf, int offset, int len) {
373     /*
374      * For faster hashing, use combinatorial generation
375      * http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf
376      */
377     int hash1 = this.hash.hash(buf, offset, len, 0);
378     int hash2 = this.hash.hash(buf, offset, len, hash1);
379 
380     for (int i = 0; i < this.hashCount; i++) {
381       long hashLoc = Math.abs((hash1 + i * hash2) % (this.byteSize * 8));
382       set(hashLoc);
383     }
384 
385     ++this.keyCount;
386   }
387 
388   /** Should only be used in tests */
389   boolean contains(byte [] buf) {
390     return contains(buf, 0, buf.length, this.bloom);
391   }
392 
393   /** Should only be used in tests */
394   boolean contains(byte [] buf, int offset, int length) {
395     return contains(buf, offset, length, bloom);
396   }
397 
398   /** Should only be used in tests */
399   boolean contains(byte[] buf, ByteBuffer bloom) {
400     return contains(buf, 0, buf.length, bloom);
401   }
402 
403   @Override
404   public boolean contains(byte[] buf, int offset, int length,
405       ByteBuffer theBloom) {
406     if (theBloom == null) {
407       // In a version 1 HFile Bloom filter data is stored in a separate meta
408       // block which is loaded on demand, but in version 2 it is pre-loaded.
409       // We want to use the same API in both cases.
410       theBloom = bloom;
411     }
412 
413     if (theBloom.limit() != byteSize) {
414       throw new IllegalArgumentException("Bloom does not match expected size:"
415           + " theBloom.limit()=" + theBloom.limit() + ", byteSize=" + byteSize);
416     }
417 
418     return contains(buf, offset, length, theBloom, 0, (int) byteSize, hash, hashCount);
419   }
420 
421   public static boolean contains(byte[] buf, int offset, int length,
422       ByteBuffer bloomBuf, int bloomOffset, int bloomSize, Hash hash,
423       int hashCount) {
424 
425     int hash1 = hash.hash(buf, offset, length, 0);
426     int hash2 = hash.hash(buf, offset, length, hash1);
427     int bloomBitSize = bloomSize << 3;
428     
429     if (randomGeneratorForTest == null) {
430       // Production mode.
431       int compositeHash = hash1;
432       for (int i = 0; i < hashCount; i++) {
433         int hashLoc = Math.abs(compositeHash % bloomBitSize);
434         compositeHash += hash2;
435         if (!get(hashLoc, bloomBuf, bloomOffset)) {
436           return false;
437         }
438       }
439     } else {
440       // Test mode with "fake lookups" to estimate "ideal false positive rate".
441       for (int i = 0; i < hashCount; i++) {
442         int hashLoc = randomGeneratorForTest.nextInt(bloomBitSize);
443         if (!get(hashLoc, bloomBuf, bloomOffset)){
444           return false;
445         }
446       }
447     }
448     return true;
449   }
450 
451   //---------------------------------------------------------------------------
452   /** Private helpers */
453 
454   /**
455    * Set the bit at the specified index to 1.
456    *
457    * @param pos index of bit
458    */
459   void set(long pos) {
460     int bytePos = (int)(pos / 8);
461     int bitPos = (int)(pos % 8);
462     byte curByte = bloom.get(bytePos);
463     curByte |= bitvals[bitPos];
464     bloom.put(bytePos, curByte);
465   }
466 
467   /**
468    * Check if bit at specified index is 1.
469    *
470    * @param pos index of bit
471    * @return true if bit at specified index is 1, false if 0.
472    */
473   static boolean get(int pos, ByteBuffer bloomBuf, int bloomOffset) {
474     int bytePos = pos >> 3; //pos / 8
475     int bitPos = pos & 0x7; //pos % 8
476     // TODO access this via Util API which can do Unsafe access if possible(?)
477     byte curByte = bloomBuf.get(bloomOffset + bytePos);
478     curByte &= bitvals[bitPos];
479     return (curByte != 0);
480   }
481 
482   @Override
483   public long getKeyCount() {
484     return keyCount;
485   }
486 
487   @Override
488   public long getMaxKeys() {
489     return maxKeys;
490   }
491 
492   @Override
493   public long getByteSize() {
494     return byteSize;
495   }
496 
497   public int getHashType() {
498     return hashType;
499   }
500 
501   @Override
502   public void compactBloom() {
503     // see if the actual size is exponentially smaller than expected.
504     if (this.keyCount > 0 && this.bloom.hasArray()) {
505       int pieces = 1;
506       int newByteSize = (int)this.byteSize;
507       int newMaxKeys = this.maxKeys;
508 
509       // while exponentially smaller & folding is lossless
510       while ( (newByteSize & 1) == 0 && newMaxKeys > (this.keyCount<<1) ) {
511         pieces <<= 1;
512         newByteSize >>= 1;
513         newMaxKeys >>= 1;
514       }
515 
516       // if we should fold these into pieces
517       if (pieces > 1) {
518         byte[] array = this.bloom.array();
519         int start = this.bloom.arrayOffset();
520         int end = start + newByteSize;
521         int off = end;
522         for(int p = 1; p < pieces; ++p) {
523           for(int pos = start; pos < end; ++pos) {
524             array[pos] |= array[off++];
525           }
526         }
527         // folding done, only use a subset of this array
528         this.bloom.rewind();
529         this.bloom.limit(newByteSize);
530         this.bloom = this.bloom.slice();
531         this.byteSize = newByteSize;
532         this.maxKeys = newMaxKeys;
533       }
534     }
535   }
536 
537 
538   //---------------------------------------------------------------------------
539 
540   /**
541    * Writes just the bloom filter to the output array
542    * @param out OutputStream to place bloom
543    * @throws IOException Error writing bloom array
544    */
545   public void writeBloom(final DataOutput out) throws IOException {
546     if (!this.bloom.hasArray()) {
547       throw new IOException("Only writes ByteBuffer with underlying array.");
548     }
549     out.write(bloom.array(), bloom.arrayOffset(), bloom.limit());
550   }
551 
552   @Override
553   public Writable getMetaWriter() {
554     return new MetaWriter();
555   }
556 
557   @Override
558   public Writable getDataWriter() {
559     return new DataWriter();
560   }
561 
562   private class MetaWriter implements Writable {
563     protected MetaWriter() {}
564     @Override
565     public void readFields(DataInput arg0) throws IOException {
566       throw new IOException("Cant read with this class.");
567     }
568 
569     @Override
570     public void write(DataOutput out) throws IOException {
571       out.writeInt(VERSION);
572       out.writeInt((int) byteSize);
573       out.writeInt(hashCount);
574       out.writeInt(hashType);
575       out.writeInt(keyCount);
576     }
577   }
578 
579   private class DataWriter implements Writable {
580     protected DataWriter() {}
581     @Override
582     public void readFields(DataInput arg0) throws IOException {
583       throw new IOException("Cant read with this class.");
584     }
585 
586     @Override
587     public void write(DataOutput out) throws IOException {
588       writeBloom(out);
589     }
590   }
591 
592   public int getHashCount() {
593     return hashCount;
594   }
595 
596   @Override
597   public boolean supportsAutoLoading() {
598     return bloom != null;
599   }
600 
601   public static void setFakeLookupMode(boolean enabled) {
602     if (enabled) {
603       randomGeneratorForTest = new Random(283742987L);
604     } else {
605       randomGeneratorForTest = null;
606     }
607   }
608 
609   /**
610    * {@inheritDoc}
611    * Just concatenate row and column by default. May return the original row
612    * buffer if the column qualifier is empty.
613    */
614   @Override
615   public byte[] createBloomKey(byte[] rowBuf, int rowOffset, int rowLen,
616       byte[] qualBuf, int qualOffset, int qualLen) {
617     // Optimize the frequent case when only the row is provided.
618     if (qualLen <= 0 && rowOffset == 0 && rowLen == rowBuf.length)
619       return rowBuf;
620 
621     byte [] result = new byte[rowLen + qualLen];
622     System.arraycopy(rowBuf, rowOffset, result, 0,  rowLen);
623     if (qualLen > 0)
624       System.arraycopy(qualBuf, qualOffset, result, rowLen, qualLen);
625     return result;
626   }
627 
628   @Override
629   public KVComparator getComparator() {
630 //    return Bytes.BYTES_RAWCOMPARATOR;
631     return KeyValue.RAW_COMPARATOR;
632   }
633 
634   /**
635    * A human-readable string with statistics for the given Bloom filter.
636    *
637    * @param bloomFilter the Bloom filter to output statistics for;
638    * @return a string consisting of "&lt;key&gt;: &lt;value&gt;" parts
639    *         separated by {@link #STATS_RECORD_SEP}.
640    */
641   public static String formatStats(BloomFilterBase bloomFilter) {
642     StringBuilder sb = new StringBuilder();
643     long k = bloomFilter.getKeyCount();
644     long m = bloomFilter.getMaxKeys();
645 
646     sb.append("BloomSize: " + bloomFilter.getByteSize() + STATS_RECORD_SEP);
647     sb.append("No of Keys in bloom: " + k + STATS_RECORD_SEP);
648     sb.append("Max Keys for bloom: " + m);
649     if (m > 0) {
650       sb.append(STATS_RECORD_SEP + "Percentage filled: "
651           + NumberFormat.getPercentInstance().format(k * 1.0 / m));
652     }
653     return sb.toString();
654   }
655 
656   @Override
657   public String toString() {
658     return formatStats(this) + STATS_RECORD_SEP + "Actual error rate: "
659         + String.format("%.8f", actualErrorRate());
660   }
661 
662 }