001/*
002 *
003 * Licensed to the Apache Software Foundation (ASF) under one
004 * or more contributor license agreements.  See the NOTICE file
005 * distributed with this work for additional information
006 * regarding copyright ownership.  The ASF licenses this file
007 * to you under the Apache License, Version 2.0 (the
008 * "License"); you may not use this file except in compliance
009 * with the License.  You may obtain a copy of the License at
010 *
011 *     http://www.apache.org/licenses/LICENSE-2.0
012 *
013 * Unless required by applicable law or agreed to in writing, software
014 * distributed under the License is distributed on an "AS IS" BASIS,
015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016 * See the License for the specific language governing permissions and
017 * limitations under the License.
018 */
019package org.apache.hadoop.hbase.util;
020
021
022import org.apache.hadoop.hbase.Cell;
023import org.apache.yetus.audience.InterfaceAudience;
024import org.apache.hadoop.hbase.nio.ByteBuff;
025import org.apache.hadoop.hbase.regionserver.BloomType;
026
027/**
028 *
029 * Implements a <i>Bloom filter</i>, as defined by Bloom in 1970.
030 * <p>
031 * The Bloom filter is a data structure that was introduced in 1970 and that has
032 * been adopted by the networking research community in the past decade thanks
033 * to the bandwidth efficiencies that it offers for the transmission of set
034 * membership information between networked hosts. A sender encodes the
035 * information into a bit vector, the Bloom filter, that is more compact than a
036 * conventional representation. Computation and space costs for construction are
037 * linear in the number of elements. The receiver uses the filter to test
038 * whether various elements are members of the set. Though the filter will
039 * occasionally return a false positive, it will never return a false negative.
040 * When creating the filter, the sender can choose its desired point in a
041 * trade-off between the false positive rate and the size.
042 *
043 * <p>
044 * Originally inspired by <a href="http://www.one-lab.org/">European Commission
045 * One-Lab Project 034819</a>.
046 *
047 * Bloom filters are very sensitive to the number of elements inserted into
048 * them. For HBase, the number of entries depends on the size of the data stored
049 * in the column. Currently the default region size is 256MB, so entry count ~=
050 * 256MB / (average value size for column). Despite this rule of thumb, there is
051 * no efficient way to calculate the entry count after compactions. Therefore,
052 * it is often easier to use a dynamic bloom filter that will add extra space
053 * instead of allowing the error rate to grow.
054 *
055 * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey
056 * .pdf )
057 *
058 * m denotes the number of bits in the Bloom filter (bitSize); n denotes the
059 * number of elements inserted into the Bloom filter (maxKeys); k represents the
060 * number of hash functions used (nbHash); e represents the desired false
061 * positive rate for the bloom (err).
062 *
063 * If we fix the error rate (e) and know the number of entries, then the optimal
064 * bloom size m = -(n * ln(err)) / (ln(2)^2) ~= n * ln(err) / ln(0.6185)
065 *
066 * The probability of false positives is minimized when k = (m/n) * ln(2).
067 *
068 * @see BloomFilter The general behavior of a filter
069 *
070 * @see <a
071 *      href="http://portal.acm.org/citation.cfm?id=362692&amp;dl=ACM&amp;coll=portal">
072 *      Space/Time Trade-Offs in Hash Coding with Allowable Errors</a>
073 *
074 * @see BloomFilterWriter for the ability to add elements to a Bloom filter
075 */
076@InterfaceAudience.Private
077public interface BloomFilter extends BloomFilterBase {
078
079  /**
080   * Checks whether the key of the given cell is matched by the Bloom filter.
081   * @param keyCell the cell whose key is checked for existence
082   * @param bloom Bloom filter data to search; may be null if auto-loading
083   *        is supported (see {@link #supportsAutoLoading()})
084   * @param type the type of Bloom to apply (ROW / ROW_COL)
085   * @return true if matched by the Bloom filter, false if not
086   */
087  boolean contains(Cell keyCell, ByteBuff bloom, BloomType type);
088
089  /**
090   * Checks whether the given range of a byte array is matched by the Bloom filter.
091   * @param buf array holding the data to check for existence
092   * @param offset offset into {@code buf}
093   * @param length length of the data
094   * @param bloom Bloom filter data to search; may be null if auto-loading
095   *        is supported (see {@link #supportsAutoLoading()})
096   * @return true if matched by the Bloom filter, false if not
097   */
098  boolean contains(byte[] buf, int offset, int length, ByteBuff bloom);
099
100  /**
101   * Reports whether this Bloom filter can automatically load its own data.
102   * @return true if auto-loading is supported, so a null byte buffer may be passed to contains()
103   */
104  boolean supportsAutoLoading();
105}