View Javadoc

1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  package org.apache.hadoop.hbase.util;
21  
22  import java.io.DataInput;
23  import java.io.DataOutput;
24  import java.io.IOException;
25  import java.nio.ByteBuffer;
26  import java.text.NumberFormat;
27  import java.util.Random;
28  
29  import com.google.common.annotations.VisibleForTesting;
30  import org.apache.hadoop.hbase.classification.InterfaceAudience;
31  import org.apache.hadoop.hbase.KeyValue;
32  import org.apache.hadoop.hbase.KeyValue.KVComparator;
33  import org.apache.hadoop.io.Writable;
34  
35  /**
36   * Implements a <i>Bloom filter</i>, as defined by Bloom in 1970.
37   * <p>
38   * The Bloom filter is a data structure that was introduced in 1970 and that has
39   * been adopted by the networking research community in the past decade thanks
40   * to the bandwidth efficiencies that it offers for the transmission of set
41   * membership information between networked hosts. A sender encodes the
42   * information into a bit vector, the Bloom filter, that is more compact than a
43   * conventional representation. Computation and space costs for construction are
44   * linear in the number of elements. The receiver uses the filter to test
45   * whether various elements are members of the set. Though the filter will
46   * occasionally return a false positive, it will never return a false negative.
47   * When creating the filter, the sender can choose its desired point in a
48   * trade-off between the false positive rate and the size.
49   *
50   * <p>
51   * Originally inspired by <a href="http://www.one-lab.org">European Commission
52   * One-Lab Project 034819</a>.
53   *
54   * Bloom filters are very sensitive to the number of elements inserted into
55   * them. For HBase, the number of entries depends on the size of the data stored
56   * in the column. Currently the default region size is 256MB, so entry count ~=
57   * 256MB / (average value size for column). Despite this rule of thumb, there is
58   * no efficient way to calculate the entry count after compactions. Therefore,
59   * it is often easier to use a dynamic bloom filter that will add extra space
60   * instead of allowing the error rate to grow.
61   *
62   * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey
63   * .pdf )
64   *
65   * m denotes the number of bits in the Bloom filter (bitSize) n denotes the
66   * number of elements inserted into the Bloom filter (maxKeys) k represents the
67   * number of hash functions used (nbHash) e represents the desired false
68   * positive rate for the bloom (err)
69   *
70   * If we fix the error rate (e) and know the number of entries, then the optimal
71   * bloom size m = -(n * ln(err) / (ln(2)^2) ~= n * ln(err) / ln(0.6185)
72   *
73   * The probability of false positives is minimized when k = m/n ln(2).
74   *
75   * @see BloomFilter The general behavior of a filter
76   *
77   * @see <a
78   *      href="http://portal.acm.org/citation.cfm?id=362692&dl=ACM&coll=portal">
79   *      Space/Time Trade-Offs in Hash Coding with Allowable Errors</a>
80   */
81  @InterfaceAudience.Private
82  public class ByteBloomFilter implements BloomFilter, BloomFilterWriter {
83  
84    /** Current file format version */
85    public static final int VERSION = 1;
86  
87    /** Bytes (B) in the array. This actually has to fit into an int. */
88    protected long byteSize;
89    /** Number of hash functions */
90    protected int hashCount;
91    /** Hash type */
92    protected final int hashType;
93    /** Hash Function */
94    protected final Hash hash;
95    /** Keys currently in the bloom */
96    protected int keyCount;
97    /** Max Keys expected for the bloom */
98    protected int maxKeys;
99    /** Bloom bits */
100   protected ByteBuffer bloom;
101 
102   /** Record separator for the Bloom filter statistics human-readable string */
103   public static final String STATS_RECORD_SEP = "; ";
104 
105   /**
106    * Used in computing the optimal Bloom filter size. This approximately equals
107    * 0.480453.
108    */
109   public static final double LOG2_SQUARED = Math.log(2) * Math.log(2);
110 
111   /**
112    * A random number generator to use for "fake lookups" when testing to
113    * estimate the ideal false positive rate.
114    */
115   private static Random randomGeneratorForTest;
116 
117   /** Bit-value lookup array to prevent doing the same work over and over */
118   private static final byte [] bitvals = {
119     (byte) 0x01,
120     (byte) 0x02,
121     (byte) 0x04,
122     (byte) 0x08,
123     (byte) 0x10,
124     (byte) 0x20,
125     (byte) 0x40,
126     (byte) 0x80
127   };
128 
129   /**
130    * Loads bloom filter meta data from file input.
131    * @param meta stored bloom meta data
132    * @throws IllegalArgumentException meta data is invalid
133    */
134   public ByteBloomFilter(DataInput meta)
135       throws IOException, IllegalArgumentException {
136     this.byteSize = meta.readInt();
137     this.hashCount = meta.readInt();
138     this.hashType = meta.readInt();
139     this.keyCount = meta.readInt();
140     this.maxKeys = this.keyCount;
141 
142     this.hash = Hash.getInstance(this.hashType);
143     if (hash == null) {
144       throw new IllegalArgumentException("Invalid hash type: " + hashType);
145     }
146     sanityCheck();
147   }
148 
149   /**
150    * @param maxKeys
151    * @param errorRate
152    * @return the number of bits for a Bloom filter than can hold the given
153    *         number of keys and provide the given error rate, assuming that the
154    *         optimal number of hash functions is used and it does not have to
155    *         be an integer.
156    */
157   public static long computeBitSize(long maxKeys, double errorRate) {
158     return (long) Math.ceil(maxKeys * (-Math.log(errorRate) / LOG2_SQUARED));
159   }
160 
161   /**
162    * The maximum number of keys we can put into a Bloom filter of a certain
163    * size to maintain the given error rate, assuming the number of hash
164    * functions is chosen optimally and does not even have to be an integer
165    * (hence the "ideal" in the function name).
166    *
167    * @param bitSize
168    * @param errorRate
169    * @return maximum number of keys that can be inserted into the Bloom filter
170    * @see #computeMaxKeys(long, double, int) for a more precise estimate
171    */
172   public static long idealMaxKeys(long bitSize, double errorRate) {
173     // The reason we need to use floor here is that otherwise we might put
174     // more keys in a Bloom filter than is allowed by the target error rate.
175     return (long) (bitSize * (LOG2_SQUARED / -Math.log(errorRate)));
176   }
177 
178   /**
179    * The maximum number of keys we can put into a Bloom filter of a certain
180    * size to get the given error rate, with the given number of hash functions.
181    *
182    * @param bitSize
183    * @param errorRate
184    * @param hashCount
185    * @return the maximum number of keys that can be inserted in a Bloom filter
186    *         to maintain the target error rate, if the number of hash functions
187    *         is provided.
188    */
189   public static long computeMaxKeys(long bitSize, double errorRate,
190       int hashCount) {
191     return (long) (-bitSize * 1.0 / hashCount *
192         Math.log(1 - Math.exp(Math.log(errorRate) / hashCount)));
193   }
194 
195   /**
196    * Computes the error rate for this Bloom filter, taking into account the
197    * actual number of hash functions and keys inserted. The return value of
198    * this function changes as a Bloom filter is being populated. Used for
199    * reporting the actual error rate of compound Bloom filters when writing
200    * them out.
201    *
202    * @return error rate for this particular Bloom filter
203    */
204   public double actualErrorRate() {
205     return actualErrorRate(keyCount, byteSize * 8, hashCount);
206   }
207 
208   /**
209    * Computes the actual error rate for the given number of elements, number
210    * of bits, and number of hash functions. Taken directly from the
211    * <a href=
212    * "http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives"
213    * > Wikipedia Bloom filter article</a>.
214    *
215    * @param maxKeys
216    * @param bitSize
217    * @param functionCount
218    * @return the actual error rate
219    */
220   public static double actualErrorRate(long maxKeys, long bitSize,
221       int functionCount) {
222     return Math.exp(Math.log(1 - Math.exp(-functionCount * maxKeys * 1.0
223         / bitSize)) * functionCount);
224   }
225 
226   /**
227    * Increases the given byte size of a Bloom filter until it can be folded by
228    * the given factor.
229    *
230    * @param bitSize
231    * @param foldFactor
232    * @return Foldable byte size
233    */
234   public static int computeFoldableByteSize(long bitSize, int foldFactor) {
235     long byteSizeLong = (bitSize + 7) / 8;
236     int mask = (1 << foldFactor) - 1;
237     if ((mask & byteSizeLong) != 0) {
238       byteSizeLong >>= foldFactor;
239       ++byteSizeLong;
240       byteSizeLong <<= foldFactor;
241     }
242     if (byteSizeLong > Integer.MAX_VALUE) {
243       throw new IllegalArgumentException("byteSize=" + byteSizeLong + " too "
244           + "large for bitSize=" + bitSize + ", foldFactor=" + foldFactor);
245     }
246     return (int) byteSizeLong;
247   }
248 
249   private static int optimalFunctionCount(int maxKeys, long bitSize) {
250     long i = bitSize / maxKeys;
251     double result = Math.ceil(Math.log(2) * i);
252     if (result > Integer.MAX_VALUE){
253       throw new IllegalArgumentException("result too large for integer value.");
254     }
255     return (int)result;
256   }
257 
258   /** Private constructor used by other constructors. */
259   private ByteBloomFilter(int hashType) {
260     this.hashType = hashType;
261     this.hash = Hash.getInstance(hashType);
262   }
263 
264   /**
265    * Determines &amp; initializes bloom filter meta data from user config. Call
266    * {@link #allocBloom()} to allocate bloom filter data.
267    *
268    * @param maxKeys Maximum expected number of keys that will be stored in this
269    *          bloom
270    * @param errorRate Desired false positive error rate. Lower rate = more
271    *          storage required
272    * @param hashType Type of hash function to use
273    * @param foldFactor When finished adding entries, you may be able to 'fold'
274    *          this bloom to save space. Tradeoff potentially excess bytes in
275    *          bloom for ability to fold if keyCount is exponentially greater
276    *          than maxKeys.
277    * @throws IllegalArgumentException
278    */
279   public ByteBloomFilter(int maxKeys, double errorRate, int hashType,
280       int foldFactor) throws IllegalArgumentException {
281     this(hashType);
282 
283     long bitSize = computeBitSize(maxKeys, errorRate);
284     hashCount = optimalFunctionCount(maxKeys, bitSize);
285     this.maxKeys = maxKeys;
286 
287     // increase byteSize so folding is possible
288     byteSize = computeFoldableByteSize(bitSize, foldFactor);
289 
290     sanityCheck();
291   }
292 
293   /**
294    * Creates a Bloom filter of the given size.
295    *
296    * @param byteSizeHint the desired number of bytes for the Bloom filter bit
297    *          array. Will be increased so that folding is possible.
298    * @param errorRate target false positive rate of the Bloom filter
299    * @param hashType Bloom filter hash function type
300    * @param foldFactor
301    * @return the new Bloom filter of the desired size
302    */
303   public static ByteBloomFilter createBySize(int byteSizeHint,
304       double errorRate, int hashType, int foldFactor) {
305     ByteBloomFilter bbf = new ByteBloomFilter(hashType);
306 
307     bbf.byteSize = computeFoldableByteSize(byteSizeHint * 8L, foldFactor);
308     long bitSize = bbf.byteSize * 8;
309     bbf.maxKeys = (int) idealMaxKeys(bitSize, errorRate);
310     bbf.hashCount = optimalFunctionCount(bbf.maxKeys, bitSize);
311 
312     // Adjust max keys to bring error rate closer to what was requested,
313     // because byteSize was adjusted to allow for folding, and hashCount was
314     // rounded.
315     bbf.maxKeys = (int) computeMaxKeys(bitSize, errorRate, bbf.hashCount);
316 
317     return bbf;
318   }
319 
320   /**
321    * Creates another similar Bloom filter. Does not copy the actual bits, and
322    * sets the new filter's key count to zero.
323    *
324    * @return a Bloom filter with the same configuration as this
325    */
326   public ByteBloomFilter createAnother() {
327     ByteBloomFilter bbf = new ByteBloomFilter(hashType);
328     bbf.byteSize = byteSize;
329     bbf.hashCount = hashCount;
330     bbf.maxKeys = maxKeys;
331     return bbf;
332   }
333 
334   @Override
335   public void allocBloom() {
336     if (this.bloom != null) {
337       throw new IllegalArgumentException("can only create bloom once.");
338     }
339     this.bloom = ByteBuffer.allocate((int)this.byteSize);
340     assert this.bloom.hasArray();
341   }
342 
343   void sanityCheck() throws IllegalArgumentException {
344     if(0 >= this.byteSize || this.byteSize > Integer.MAX_VALUE) {
345       throw new IllegalArgumentException("Invalid byteSize: " + this.byteSize);
346     }
347 
348     if(this.hashCount <= 0) {
349       throw new IllegalArgumentException("Hash function count must be > 0");
350     }
351 
352     if (this.hash == null) {
353       throw new IllegalArgumentException("hashType must be known");
354     }
355 
356     if (this.keyCount < 0) {
357       throw new IllegalArgumentException("must have positive keyCount");
358     }
359   }
360 
361   void bloomCheck(ByteBuffer bloom)  throws IllegalArgumentException {
362     if (this.byteSize != bloom.limit()) {
363       throw new IllegalArgumentException(
364           "Configured bloom length should match actual length");
365     }
366   }
367 
368   public void add(byte [] buf) {
369     add(buf, 0, buf.length);
370   }
371 
372   @Override
373   public void add(byte [] buf, int offset, int len) {
374     /*
375      * For faster hashing, use combinatorial generation
376      * http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf
377      */
378     int hash1 = this.hash.hash(buf, offset, len, 0);
379     int hash2 = this.hash.hash(buf, offset, len, hash1);
380 
381     for (int i = 0; i < this.hashCount; i++) {
382       long hashLoc = Math.abs((hash1 + i * hash2) % (this.byteSize * 8));
383       set(hashLoc);
384     }
385 
386     ++this.keyCount;
387   }
388 
389   /** Should only be used in tests */
390   boolean contains(byte [] buf) {
391     return contains(buf, 0, buf.length, this.bloom);
392   }
393 
394   /** Should only be used in tests */
395   boolean contains(byte [] buf, int offset, int length) {
396     return contains(buf, offset, length, bloom);
397   }
398 
399   /** Should only be used in tests */
400   boolean contains(byte[] buf, ByteBuffer bloom) {
401     return contains(buf, 0, buf.length, bloom);
402   }
403 
404   @Override
405   public boolean contains(byte[] buf, int offset, int length,
406       ByteBuffer theBloom) {
407     if (theBloom == null) {
408       // In a version 1 HFile Bloom filter data is stored in a separate meta
409       // block which is loaded on demand, but in version 2 it is pre-loaded.
410       // We want to use the same API in both cases.
411       theBloom = bloom;
412     }
413 
414     if (theBloom.limit() != byteSize) {
415       throw new IllegalArgumentException("Bloom does not match expected size:"
416           + " theBloom.limit()=" + theBloom.limit() + ", byteSize=" + byteSize);
417     }
418 
419     return contains(buf, offset, length, theBloom, 0, (int) byteSize, hash, hashCount);
420   }
421 
422   public static boolean contains(byte[] buf, int offset, int length,
423       ByteBuffer bloomBuf, int bloomOffset, int bloomSize, Hash hash,
424       int hashCount) {
425 
426     int hash1 = hash.hash(buf, offset, length, 0);
427     int bloomBitSize = bloomSize << 3;
428 
429     int hash2 = 0;
430     int compositeHash = 0;
431 
432     if (randomGeneratorForTest == null) {
433       // Production mode
434       compositeHash = hash1;
435       hash2 = hash.hash(buf, offset, length, hash1);
436     }
437 
438     for (int i = 0; i < hashCount; i++) {
439       int hashLoc = (randomGeneratorForTest == null
440           // Production mode
441           ? Math.abs(compositeHash % bloomBitSize)
442           // Test mode with "fake look-ups" to estimate "ideal false positive rate"
443           : randomGeneratorForTest.nextInt(bloomBitSize));
444       compositeHash += hash2;
445       if (!get(hashLoc, bloomBuf, bloomOffset)) {
446         return false;
447       }
448     }
449     return true;
450   }
451 
452   //---------------------------------------------------------------------------
453   /** Private helpers */
454 
455   /**
456    * Set the bit at the specified index to 1.
457    *
458    * @param pos index of bit
459    */
460   void set(long pos) {
461     int bytePos = (int)(pos / 8);
462     int bitPos = (int)(pos % 8);
463     byte curByte = bloom.get(bytePos);
464     curByte |= bitvals[bitPos];
465     bloom.put(bytePos, curByte);
466   }
467 
468   /**
469    * Check if bit at specified index is 1.
470    *
471    * @param pos index of bit
472    * @return true if bit at specified index is 1, false if 0.
473    */
474   static boolean get(int pos, ByteBuffer bloomBuf, int bloomOffset) {
475     int bytePos = pos >> 3; //pos / 8
476     int bitPos = pos & 0x7; //pos % 8
477     // TODO access this via Util API which can do Unsafe access if possible(?)
478     byte curByte = bloomBuf.get(bloomOffset + bytePos);
479     curByte &= bitvals[bitPos];
480     return (curByte != 0);
481   }
482 
483   @Override
484   public long getKeyCount() {
485     return keyCount;
486   }
487 
488   @Override
489   public long getMaxKeys() {
490     return maxKeys;
491   }
492 
493   @Override
494   public long getByteSize() {
495     return byteSize;
496   }
497 
498   public int getHashType() {
499     return hashType;
500   }
501 
502   @Override
503   public void compactBloom() {
504     // see if the actual size is exponentially smaller than expected.
505     if (this.keyCount > 0 && this.bloom.hasArray()) {
506       int pieces = 1;
507       int newByteSize = (int)this.byteSize;
508       int newMaxKeys = this.maxKeys;
509 
510       // while exponentially smaller & folding is lossless
511       while ( (newByteSize & 1) == 0 && newMaxKeys > (this.keyCount<<1) ) {
512         pieces <<= 1;
513         newByteSize >>= 1;
514         newMaxKeys >>= 1;
515       }
516 
517       // if we should fold these into pieces
518       if (pieces > 1) {
519         byte[] array = this.bloom.array();
520         int start = this.bloom.arrayOffset();
521         int end = start + newByteSize;
522         int off = end;
523         for(int p = 1; p < pieces; ++p) {
524           for(int pos = start; pos < end; ++pos) {
525             array[pos] |= array[off++];
526           }
527         }
528         // folding done, only use a subset of this array
529         this.bloom.rewind();
530         this.bloom.limit(newByteSize);
531         this.bloom = this.bloom.slice();
532         this.byteSize = newByteSize;
533         this.maxKeys = newMaxKeys;
534       }
535     }
536   }
537 
538 
539   //---------------------------------------------------------------------------
540 
541   /**
542    * Writes just the bloom filter to the output array
543    * @param out OutputStream to place bloom
544    * @throws IOException Error writing bloom array
545    */
546   public void writeBloom(final DataOutput out) throws IOException {
547     if (!this.bloom.hasArray()) {
548       throw new IOException("Only writes ByteBuffer with underlying array.");
549     }
550     out.write(bloom.array(), bloom.arrayOffset(), bloom.limit());
551   }
552 
553   @Override
554   public Writable getMetaWriter() {
555     return new MetaWriter();
556   }
557 
558   @Override
559   public Writable getDataWriter() {
560     return new DataWriter();
561   }
562 
563   private class MetaWriter implements Writable {
564     protected MetaWriter() {}
565     @Override
566     public void readFields(DataInput arg0) throws IOException {
567       throw new IOException("Cant read with this class.");
568     }
569 
570     @Override
571     public void write(DataOutput out) throws IOException {
572       out.writeInt(VERSION);
573       out.writeInt((int) byteSize);
574       out.writeInt(hashCount);
575       out.writeInt(hashType);
576       out.writeInt(keyCount);
577     }
578   }
579 
580   private class DataWriter implements Writable {
581     protected DataWriter() {}
582     @Override
583     public void readFields(DataInput arg0) throws IOException {
584       throw new IOException("Cant read with this class.");
585     }
586 
587     @Override
588     public void write(DataOutput out) throws IOException {
589       writeBloom(out);
590     }
591   }
592 
593   public int getHashCount() {
594     return hashCount;
595   }
596 
597   @Override
598   public boolean supportsAutoLoading() {
599     return bloom != null;
600   }
601 
602   /**
603    * Sets a random generator to be used for look-ups instead of computing hashes. Can be used to
604    * simulate uniformity of accesses better in a test environment. Should not be set in a real
605    * environment where correctness matters!
606    * <p>
607    * This gets used in {@link #contains(byte[], int, int, ByteBuffer, int, int, Hash, int)}
608    * @param random The random number source to use, or null to compute actual hashes
609    */
610   @VisibleForTesting
611   public static void setRandomGeneratorForTest(Random random) {
612     randomGeneratorForTest = random;
613   }
614 
615   /**
616    * {@inheritDoc}
617    * Just concatenate row and column by default. May return the original row
618    * buffer if the column qualifier is empty.
619    */
620   @Override
621   public byte[] createBloomKey(byte[] rowBuf, int rowOffset, int rowLen,
622       byte[] qualBuf, int qualOffset, int qualLen) {
623     // Optimize the frequent case when only the row is provided.
624     if (qualLen <= 0 && rowOffset == 0 && rowLen == rowBuf.length)
625       return rowBuf;
626 
627     byte [] result = new byte[rowLen + qualLen];
628     System.arraycopy(rowBuf, rowOffset, result, 0,  rowLen);
629     if (qualLen > 0)
630       System.arraycopy(qualBuf, qualOffset, result, rowLen, qualLen);
631     return result;
632   }
633 
634   @Override
635   public KVComparator getComparator() {
636 //    return Bytes.BYTES_RAWCOMPARATOR;
637     return KeyValue.RAW_COMPARATOR;
638   }
639 
640   /**
641    * A human-readable string with statistics for the given Bloom filter.
642    *
643    * @param bloomFilter the Bloom filter to output statistics for;
644    * @return a string consisting of "&lt;key&gt;: &lt;value&gt;" parts
645    *         separated by {@link #STATS_RECORD_SEP}.
646    */
647   public static String formatStats(BloomFilterBase bloomFilter) {
648     StringBuilder sb = new StringBuilder();
649     long k = bloomFilter.getKeyCount();
650     long m = bloomFilter.getMaxKeys();
651 
652     sb.append("BloomSize: " + bloomFilter.getByteSize() + STATS_RECORD_SEP);
653     sb.append("No of Keys in bloom: " + k + STATS_RECORD_SEP);
654     sb.append("Max Keys for bloom: " + m);
655     if (m > 0) {
656       sb.append(STATS_RECORD_SEP + "Percentage filled: "
657           + NumberFormat.getPercentInstance().format(k * 1.0 / m));
658     }
659     return sb.toString();
660   }
661 
662   @Override
663   public String toString() {
664     return formatStats(this) + STATS_RECORD_SEP + "Actual error rate: "
665         + String.format("%.8f", actualErrorRate());
666   }
667 
668 }