View Javadoc

1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.util;
20  
21  import java.nio.ByteBuffer;
22  
23  import org.apache.hadoop.hbase.Cell;
24  import org.apache.hadoop.hbase.KeyValue;
25  import org.apache.hadoop.hbase.classification.InterfaceAudience;
26  
27  /**
28   *
29   * Implements a <i>Bloom filter</i>, as defined by Bloom in 1970.
30   * <p>
31   * The Bloom filter is a data structure that was introduced in 1970 and that has
32   * been adopted by the networking research community in the past decade thanks
33   * to the bandwidth efficiencies that it offers for the transmission of set
34   * membership information between networked hosts. A sender encodes the
35   * information into a bit vector, the Bloom filter, that is more compact than a
36   * conventional representation. Computation and space costs for construction are
37   * linear in the number of elements. The receiver uses the filter to test
38   * whether various elements are members of the set. Though the filter will
39   * occasionally return a false positive, it will never return a false negative.
40   * When creating the filter, the sender can choose its desired point in a
41   * trade-off between the false positive rate and the size.
42   *
43   * <p>
44   * Originally inspired by <a href="http://www.one-lab.org">European Commission
45   * One-Lab Project 034819</a>.
46   *
47   * Bloom filters are very sensitive to the number of elements inserted into
48   * them. For HBase, the number of entries depends on the size of the data stored
49   * in the column. Currently the default region size is 256MB, so entry count ~=
50   * 256MB / (average value size for column). Despite this rule of thumb, there is
51   * no efficient way to calculate the entry count after compactions. Therefore,
52   * it is often easier to use a dynamic bloom filter that will add extra space
53   * instead of allowing the error rate to grow.
54   *
55   * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey.pdf
56   * )
57   *
58   * m denotes the number of bits in the Bloom filter (bitSize); n denotes the
59   * number of elements inserted into the Bloom filter (maxKeys); k represents the
60   * number of hash functions used (nbHash); e represents the desired false
61   * positive rate for the bloom (err).
62   *
63   * If we fix the error rate (e) and know the number of entries, then the optimal
64   * bloom size m = -(n * ln(err)) / (ln(2)^2) ~= n * ln(err) / ln(0.6185)
65   *
66   * The probability of false positives is minimized when k = (m/n) * ln(2).
67   *
68   * @see BloomFilter The general behavior of a filter
69   *
70   * @see <a
71   *      href="http://portal.acm.org/citation.cfm?id=362692&dl=ACM&coll=portal">
72   *      Space/Time Trade-Offs in Hash Coding with Allowable Errors</a>
73   *
74   * @see BloomFilterWriter for the ability to add elements to a Bloom filter
75   */
76  @InterfaceAudience.Private
77  public interface BloomFilter extends BloomFilterBase {
78  
79    /**
80     * Checks whether the specified key is contained in the bloom filter.
81     * Used in ROW_COL blooms, where the blooms are serialized as KeyValues.
82     * @param keyCell the key whose presence is to be checked
83     * @param bloom bloom filter data to search. This can be null if auto-loading
84     *        is supported (see {@link #supportsAutoLoading()}).
85     * @return true if matched by bloom (may be a false positive), false if not
86     */
87    boolean contains(Cell keyCell, ByteBuffer bloom);
88  
89    /**
90     * Checks whether the specified key is contained in the bloom filter.
91     * Used in ROW blooms, where the blooms are just plain byte[].
92     * @param buf array holding the data to check for existence of
93     * @param offset offset into the data
94     * @param length length of the data
95     * @param bloom bloom filter data to search. This can be null if auto-loading
96     *        is supported (see {@link #supportsAutoLoading()}).
97     * @return true if matched by bloom (may be a false positive), false if not
98     */
99    boolean contains(byte[] buf, int offset, int length, ByteBuffer bloom);
100 
101   /**
102    * Indicates whether a null byte buffer may be passed to contains().
103    * @return true if this Bloom filter can automatically load its data
104    */
105   boolean supportsAutoLoading();
106 }
106 }