001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.util;
019
020import java.io.DataInput;
021import java.io.DataOutput;
022import java.io.IOException;
023import java.nio.ByteBuffer;
024import org.apache.hadoop.hbase.Cell;
025import org.apache.hadoop.hbase.regionserver.BloomType;
026import org.apache.yetus.audience.InterfaceAudience;
027
028/**
029 * The basic building block for the {@link org.apache.hadoop.hbase.io.hfile.CompoundBloomFilter}
030 */
031@InterfaceAudience.Private
032public class BloomFilterChunk implements BloomFilterBase {
033
034  /** Bytes (B) in the array. This actually has to fit into an int. */
035  protected long byteSize;
036  /** Number of hash functions */
037  protected int hashCount;
038  /** Hash type */
039  protected final int hashType;
040  /** Hash Function */
041  protected final Hash hash;
042  /** Keys currently in the bloom */
043  protected int keyCount;
044  /** Max Keys expected for the bloom */
045  protected int maxKeys;
046  /** Bloom bits */
047  protected ByteBuffer bloom;
048  /** The type of bloom */
049  protected BloomType bloomType;
050
051  /**
052   * Loads bloom filter meta data from file input.
053   * @param meta stored bloom meta data
054   * @throws IllegalArgumentException meta data is invalid
055   */
056  public BloomFilterChunk(DataInput meta) throws IOException, IllegalArgumentException {
057    this.byteSize = meta.readInt();
058    this.hashCount = meta.readInt();
059    this.hashType = meta.readInt();
060    this.keyCount = meta.readInt();
061    this.maxKeys = this.keyCount;
062
063    this.hash = Hash.getInstance(this.hashType);
064    if (hash == null) {
065      throw new IllegalArgumentException("Invalid hash type: " + hashType);
066    }
067    sanityCheck();
068  }
069
070  /**
071   * Computes the error rate for this Bloom filter, taking into account the actual number of hash
072   * functions and keys inserted. The return value of this function changes as a Bloom filter is
073   * being populated. Used for reporting the actual error rate of compound Bloom filters when
074   * writing them out.
075   * @return error rate for this particular Bloom filter
076   */
077  public double actualErrorRate() {
078    return BloomFilterUtil.actualErrorRate(keyCount, byteSize * 8, hashCount);
079  }
080
081  public BloomFilterChunk(int hashType, BloomType bloomType) {
082    this.hashType = hashType;
083    this.hash = Hash.getInstance(hashType);
084    this.bloomType = bloomType;
085  }
086
087  /**
088   * Determines & initializes bloom filter meta data from user config. Call
089   * {@link #allocBloom()} to allocate bloom filter data.
090   * @param maxKeys    Maximum expected number of keys that will be stored in this bloom
091   * @param errorRate  Desired false positive error rate. Lower rate = more storage required
092   * @param hashType   Type of hash function to use
093   * @param foldFactor When finished adding entries, you may be able to 'fold' this bloom to save
094   *                   space. Tradeoff potentially excess bytes in bloom for ability to fold if
095   *                   keyCount is exponentially greater than maxKeys.
096   */
097  // Used only in testcases
098  public BloomFilterChunk(int maxKeys, double errorRate, int hashType, int foldFactor)
099    throws IllegalArgumentException {
100    this(hashType, BloomType.ROW);
101
102    long bitSize = BloomFilterUtil.computeBitSize(maxKeys, errorRate);
103    hashCount = BloomFilterUtil.optimalFunctionCount(maxKeys, bitSize);
104    this.maxKeys = maxKeys;
105
106    // increase byteSize so folding is possible
107    byteSize = BloomFilterUtil.computeFoldableByteSize(bitSize, foldFactor);
108
109    sanityCheck();
110  }
111
112  /**
113   * Creates another similar Bloom filter. Does not copy the actual bits, and sets the new filter's
114   * key count to zero.
115   * @return a Bloom filter with the same configuration as this
116   */
117  public BloomFilterChunk createAnother() {
118    BloomFilterChunk bbf = new BloomFilterChunk(hashType, this.bloomType);
119    bbf.byteSize = byteSize;
120    bbf.hashCount = hashCount;
121    bbf.maxKeys = maxKeys;
122    return bbf;
123  }
124
125  public void allocBloom() {
126    if (this.bloom != null) {
127      throw new IllegalArgumentException("can only create bloom once.");
128    }
129    this.bloom = ByteBuffer.allocate((int) this.byteSize);
130    assert this.bloom.hasArray();
131  }
132
133  void sanityCheck() throws IllegalArgumentException {
134    if (0 >= this.byteSize || this.byteSize > Integer.MAX_VALUE) {
135      throw new IllegalArgumentException("Invalid byteSize: " + this.byteSize);
136    }
137
138    if (this.hashCount <= 0) {
139      throw new IllegalArgumentException("Hash function count must be > 0");
140    }
141
142    if (this.hash == null) {
143      throw new IllegalArgumentException("hashType must be known");
144    }
145
146    if (this.keyCount < 0) {
147      throw new IllegalArgumentException("must have positive keyCount");
148    }
149  }
150
151  void bloomCheck(ByteBuffer bloom) throws IllegalArgumentException {
152    if (this.byteSize != bloom.limit()) {
153      throw new IllegalArgumentException("Configured bloom length should match actual length");
154    }
155  }
156
157  // Used only by tests
158  void add(byte[] buf, int offset, int len) {
159    /*
160     * For faster hashing, use combinatorial generation
161     * http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf
162     */
163    HashKey<byte[]> hashKey = new ByteArrayHashKey(buf, offset, len);
164    int hash1 = this.hash.hash(hashKey, 0);
165    int hash2 = this.hash.hash(hashKey, hash1);
166    setHashLoc(hash1, hash2);
167  }
168
169  public void add(Cell cell) {
170    /*
171     * For faster hashing, use combinatorial generation
172     * http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf
173     */
174    int hash1;
175    int hash2;
176    HashKey<Cell> hashKey;
177    if (this.bloomType == BloomType.ROWCOL) {
178      hashKey = new RowColBloomHashKey(cell);
179      hash1 = this.hash.hash(hashKey, 0);
180      hash2 = this.hash.hash(hashKey, hash1);
181    } else {
182      hashKey = new RowBloomHashKey(cell);
183      hash1 = this.hash.hash(hashKey, 0);
184      hash2 = this.hash.hash(hashKey, hash1);
185    }
186    setHashLoc(hash1, hash2);
187  }
188
189  private void setHashLoc(int hash1, int hash2) {
190    for (int i = 0; i < this.hashCount; i++) {
191      long hashLoc = Math.abs((hash1 + i * hash2) % (this.byteSize * 8));
192      set(hashLoc);
193    }
194
195    ++this.keyCount;
196  }
197
198  // ---------------------------------------------------------------------------
199  /** Private helpers */
200
201  /**
202   * Set the bit at the specified index to 1.
203   * @param pos index of bit
204   */
205  void set(long pos) {
206    int bytePos = (int) (pos / 8);
207    int bitPos = (int) (pos % 8);
208    byte curByte = bloom.get(bytePos);
209    curByte |= BloomFilterUtil.bitvals[bitPos];
210    bloom.put(bytePos, curByte);
211  }
212
213  /**
214   * Check if bit at specified index is 1.
215   * @param pos index of bit
216   * @return true if bit at specified index is 1, false if 0.
217   */
218  static boolean get(int pos, ByteBuffer bloomBuf, int bloomOffset) {
219    int bytePos = pos >> 3; // pos / 8
220    int bitPos = pos & 0x7; // pos % 8
221    // TODO access this via Util API which can do Unsafe access if possible(?)
222    byte curByte = bloomBuf.get(bloomOffset + bytePos);
223    curByte &= BloomFilterUtil.bitvals[bitPos];
224    return (curByte != 0);
225  }
226
227  @Override
228  public long getKeyCount() {
229    return keyCount;
230  }
231
232  @Override
233  public long getMaxKeys() {
234    return maxKeys;
235  }
236
237  @Override
238  public long getByteSize() {
239    return byteSize;
240  }
241
242  public int getHashType() {
243    return hashType;
244  }
245
246  public void compactBloom() {
247    // see if the actual size is exponentially smaller than expected.
248    if (this.keyCount > 0 && this.bloom.hasArray()) {
249      int pieces = 1;
250      int newByteSize = (int) this.byteSize;
251      int newMaxKeys = this.maxKeys;
252
253      // while exponentially smaller & folding is lossless
254      while ((newByteSize & 1) == 0 && newMaxKeys > (this.keyCount << 1)) {
255        pieces <<= 1;
256        newByteSize >>= 1;
257        newMaxKeys >>= 1;
258      }
259
260      // if we should fold these into pieces
261      if (pieces > 1) {
262        byte[] array = this.bloom.array();
263        int start = this.bloom.arrayOffset();
264        int end = start + newByteSize;
265        int off = end;
266        for (int p = 1; p < pieces; ++p) {
267          for (int pos = start; pos < end; ++pos) {
268            array[pos] |= array[off++];
269          }
270        }
271        // folding done, only use a subset of this array
272        this.bloom.rewind();
273        this.bloom.limit(newByteSize);
274        this.bloom = this.bloom.slice();
275        this.byteSize = newByteSize;
276        this.maxKeys = newMaxKeys;
277      }
278    }
279  }
280
281  /**
282   * Writes just the bloom filter to the output array
283   * @param out OutputStream to place bloom
284   * @throws IOException Error writing bloom array
285   */
286  public void writeBloom(final DataOutput out) throws IOException {
287    if (!this.bloom.hasArray()) {
288      throw new IOException("Only writes ByteBuffer with underlying array.");
289    }
290    out.write(this.bloom.array(), this.bloom.arrayOffset(), this.bloom.limit());
291  }
292
293  public int getHashCount() {
294    return hashCount;
295  }
296
297  @Override
298  public String toString() {
299    return BloomFilterUtil.toString(this);
300  }
301
302}