001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.util; 019 020import java.io.DataInput; 021import java.io.DataOutput; 022import java.io.IOException; 023import java.nio.ByteBuffer; 024import org.apache.hadoop.hbase.Cell; 025import org.apache.hadoop.hbase.regionserver.BloomType; 026import org.apache.yetus.audience.InterfaceAudience; 027 028/** 029 * The basic building block for the {@link org.apache.hadoop.hbase.io.hfile.CompoundBloomFilter} 030 */ 031@InterfaceAudience.Private 032public class BloomFilterChunk implements BloomFilterBase { 033 034 /** Bytes (B) in the array. This actually has to fit into an int. */ 035 protected long byteSize; 036 /** Number of hash functions */ 037 protected int hashCount; 038 /** Hash type */ 039 protected final int hashType; 040 /** Hash Function */ 041 protected final Hash hash; 042 /** Keys currently in the bloom */ 043 protected int keyCount; 044 /** Max Keys expected for the bloom */ 045 protected int maxKeys; 046 /** Bloom bits */ 047 protected ByteBuffer bloom; 048 /** The type of bloom */ 049 protected BloomType bloomType; 050 051 /** 052 * Loads bloom filter meta data from file input. 053 * @param meta stored bloom meta data 054 * @throws IllegalArgumentException meta data is invalid 055 */ 056 public BloomFilterChunk(DataInput meta) throws IOException, IllegalArgumentException { 057 this.byteSize = meta.readInt(); 058 this.hashCount = meta.readInt(); 059 this.hashType = meta.readInt(); 060 this.keyCount = meta.readInt(); 061 this.maxKeys = this.keyCount; 062 063 this.hash = Hash.getInstance(this.hashType); 064 if (hash == null) { 065 throw new IllegalArgumentException("Invalid hash type: " + hashType); 066 } 067 sanityCheck(); 068 } 069 070 /** 071 * Computes the error rate for this Bloom filter, taking into account the actual number of hash 072 * functions and keys inserted. The return value of this function changes as a Bloom filter is 073 * being populated. Used for reporting the actual error rate of compound Bloom filters when 074 * writing them out. 075 * @return error rate for this particular Bloom filter 076 */ 077 public double actualErrorRate() { 078 return BloomFilterUtil.actualErrorRate(keyCount, byteSize * 8, hashCount); 079 } 080 081 public BloomFilterChunk(int hashType, BloomType bloomType) { 082 this.hashType = hashType; 083 this.hash = Hash.getInstance(hashType); 084 this.bloomType = bloomType; 085 } 086 087 /** 088 * Determines & initializes bloom filter meta data from user config. Call 089 * {@link #allocBloom()} to allocate bloom filter data. 090 * @param maxKeys Maximum expected number of keys that will be stored in this bloom 091 * @param errorRate Desired false positive error rate. Lower rate = more storage required 092 * @param hashType Type of hash function to use 093 * @param foldFactor When finished adding entries, you may be able to 'fold' this bloom to save 094 * space. Tradeoff potentially excess bytes in bloom for ability to fold if 095 * keyCount is exponentially greater than maxKeys. 096 */ 097 // Used only in testcases 098 public BloomFilterChunk(int maxKeys, double errorRate, int hashType, int foldFactor) 099 throws IllegalArgumentException { 100 this(hashType, BloomType.ROW); 101 102 long bitSize = BloomFilterUtil.computeBitSize(maxKeys, errorRate); 103 hashCount = BloomFilterUtil.optimalFunctionCount(maxKeys, bitSize); 104 this.maxKeys = maxKeys; 105 106 // increase byteSize so folding is possible 107 byteSize = BloomFilterUtil.computeFoldableByteSize(bitSize, foldFactor); 108 109 sanityCheck(); 110 } 111 112 /** 113 * Creates another similar Bloom filter. Does not copy the actual bits, and sets the new filter's 114 * key count to zero. 115 * @return a Bloom filter with the same configuration as this 116 */ 117 public BloomFilterChunk createAnother() { 118 BloomFilterChunk bbf = new BloomFilterChunk(hashType, this.bloomType); 119 bbf.byteSize = byteSize; 120 bbf.hashCount = hashCount; 121 bbf.maxKeys = maxKeys; 122 return bbf; 123 } 124 125 public void allocBloom() { 126 if (this.bloom != null) { 127 throw new IllegalArgumentException("can only create bloom once."); 128 } 129 this.bloom = ByteBuffer.allocate((int) this.byteSize); 130 assert this.bloom.hasArray(); 131 } 132 133 void sanityCheck() throws IllegalArgumentException { 134 if (0 >= this.byteSize || this.byteSize > Integer.MAX_VALUE) { 135 throw new IllegalArgumentException("Invalid byteSize: " + this.byteSize); 136 } 137 138 if (this.hashCount <= 0) { 139 throw new IllegalArgumentException("Hash function count must be > 0"); 140 } 141 142 if (this.hash == null) { 143 throw new IllegalArgumentException("hashType must be known"); 144 } 145 146 if (this.keyCount < 0) { 147 throw new IllegalArgumentException("must have positive keyCount"); 148 } 149 } 150 151 void bloomCheck(ByteBuffer bloom) throws IllegalArgumentException { 152 if (this.byteSize != bloom.limit()) { 153 throw new IllegalArgumentException("Configured bloom length should match actual length"); 154 } 155 } 156 157 // Used only by tests 158 void add(byte[] buf, int offset, int len) { 159 /* 160 * For faster hashing, use combinatorial generation 161 * http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf 162 */ 163 HashKey<byte[]> hashKey = new ByteArrayHashKey(buf, offset, len); 164 int hash1 = this.hash.hash(hashKey, 0); 165 int hash2 = this.hash.hash(hashKey, hash1); 166 setHashLoc(hash1, hash2); 167 } 168 169 public void add(Cell cell) { 170 /* 171 * For faster hashing, use combinatorial generation 172 * http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf 173 */ 174 int hash1; 175 int hash2; 176 HashKey<Cell> hashKey; 177 if (this.bloomType == BloomType.ROWCOL) { 178 hashKey = new RowColBloomHashKey(cell); 179 hash1 = this.hash.hash(hashKey, 0); 180 hash2 = this.hash.hash(hashKey, hash1); 181 } else { 182 hashKey = new RowBloomHashKey(cell); 183 hash1 = this.hash.hash(hashKey, 0); 184 hash2 = this.hash.hash(hashKey, hash1); 185 } 186 setHashLoc(hash1, hash2); 187 } 188 189 private void setHashLoc(int hash1, int hash2) { 190 for (int i = 0; i < this.hashCount; i++) { 191 long hashLoc = Math.abs((hash1 + i * hash2) % (this.byteSize * 8)); 192 set(hashLoc); 193 } 194 195 ++this.keyCount; 196 } 197 198 // --------------------------------------------------------------------------- 199 /** Private helpers */ 200 201 /** 202 * Set the bit at the specified index to 1. 203 * @param pos index of bit 204 */ 205 void set(long pos) { 206 int bytePos = (int) (pos / 8); 207 int bitPos = (int) (pos % 8); 208 byte curByte = bloom.get(bytePos); 209 curByte |= BloomFilterUtil.bitvals[bitPos]; 210 bloom.put(bytePos, curByte); 211 } 212 213 /** 214 * Check if bit at specified index is 1. 215 * @param pos index of bit 216 * @return true if bit at specified index is 1, false if 0. 217 */ 218 static boolean get(int pos, ByteBuffer bloomBuf, int bloomOffset) { 219 int bytePos = pos >> 3; // pos / 8 220 int bitPos = pos & 0x7; // pos % 8 221 // TODO access this via Util API which can do Unsafe access if possible(?) 222 byte curByte = bloomBuf.get(bloomOffset + bytePos); 223 curByte &= BloomFilterUtil.bitvals[bitPos]; 224 return (curByte != 0); 225 } 226 227 @Override 228 public long getKeyCount() { 229 return keyCount; 230 } 231 232 @Override 233 public long getMaxKeys() { 234 return maxKeys; 235 } 236 237 @Override 238 public long getByteSize() { 239 return byteSize; 240 } 241 242 public int getHashType() { 243 return hashType; 244 } 245 246 public void compactBloom() { 247 // see if the actual size is exponentially smaller than expected. 248 if (this.keyCount > 0 && this.bloom.hasArray()) { 249 int pieces = 1; 250 int newByteSize = (int) this.byteSize; 251 int newMaxKeys = this.maxKeys; 252 253 // while exponentially smaller & folding is lossless 254 while ((newByteSize & 1) == 0 && newMaxKeys > (this.keyCount << 1)) { 255 pieces <<= 1; 256 newByteSize >>= 1; 257 newMaxKeys >>= 1; 258 } 259 260 // if we should fold these into pieces 261 if (pieces > 1) { 262 byte[] array = this.bloom.array(); 263 int start = this.bloom.arrayOffset(); 264 int end = start + newByteSize; 265 int off = end; 266 for (int p = 1; p < pieces; ++p) { 267 for (int pos = start; pos < end; ++pos) { 268 array[pos] |= array[off++]; 269 } 270 } 271 // folding done, only use a subset of this array 272 this.bloom.rewind(); 273 this.bloom.limit(newByteSize); 274 this.bloom = this.bloom.slice(); 275 this.byteSize = newByteSize; 276 this.maxKeys = newMaxKeys; 277 } 278 } 279 } 280 281 /** 282 * Writes just the bloom filter to the output array 283 * @param out OutputStream to place bloom 284 * @throws IOException Error writing bloom array 285 */ 286 public void writeBloom(final DataOutput out) throws IOException { 287 if (!this.bloom.hasArray()) { 288 throw new IOException("Only writes ByteBuffer with underlying array."); 289 } 290 out.write(this.bloom.array(), this.bloom.arrayOffset(), this.bloom.limit()); 291 } 292 293 public int getHashCount() { 294 return hashCount; 295 } 296 297 @Override 298 public String toString() { 299 return BloomFilterUtil.toString(this); 300 } 301 302}