001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.util;
019
020import static org.junit.Assert.assertEquals;
021import static org.junit.Assert.assertFalse;
022import static org.junit.Assert.assertTrue;
023
024import java.io.ByteArrayOutputStream;
025import java.io.DataOutputStream;
026import java.nio.ByteBuffer;
027import org.apache.hadoop.hbase.HBaseClassTestRule;
028import org.apache.hadoop.hbase.nio.MultiByteBuff;
029import org.apache.hadoop.hbase.testclassification.MiscTests;
030import org.apache.hadoop.hbase.testclassification.SmallTests;
031import org.junit.ClassRule;
032import org.junit.Test;
033import org.junit.experimental.categories.Category;
034
035@Category({ MiscTests.class, SmallTests.class })
036public class TestBloomFilterChunk {
037
038  @ClassRule
039  public static final HBaseClassTestRule CLASS_RULE =
040    HBaseClassTestRule.forClass(TestBloomFilterChunk.class);
041
042  @Test
043  public void testBasicBloom() throws Exception {
044    BloomFilterChunk bf1 = new BloomFilterChunk(1000, (float) 0.01, Hash.MURMUR_HASH, 0);
045    BloomFilterChunk bf2 = new BloomFilterChunk(1000, (float) 0.01, Hash.MURMUR_HASH, 0);
046    bf1.allocBloom();
047    bf2.allocBloom();
048
049    // test 1: verify no fundamental false negatives or positives
050    byte[] key1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9 };
051    byte[] key2 = { 1, 2, 3, 4, 5, 6, 7, 8, 7 };
052
053    bf1.add(key1, 0, key1.length);
054    bf2.add(key2, 0, key2.length);
055
056    assertTrue(BloomFilterUtil.contains(key1, 0, key1.length, new MultiByteBuff(bf1.bloom), 0,
057      (int) bf1.byteSize, bf1.hash, bf1.hashCount));
058    assertFalse(BloomFilterUtil.contains(key2, 0, key2.length, new MultiByteBuff(bf1.bloom), 0,
059      (int) bf1.byteSize, bf1.hash, bf1.hashCount));
060    assertFalse(BloomFilterUtil.contains(key1, 0, key1.length, new MultiByteBuff(bf2.bloom), 0,
061      (int) bf2.byteSize, bf2.hash, bf2.hashCount));
062    assertTrue(BloomFilterUtil.contains(key2, 0, key2.length, new MultiByteBuff(bf2.bloom), 0,
063      (int) bf2.byteSize, bf2.hash, bf2.hashCount));
064
065    byte[] bkey = { 1, 2, 3, 4 };
066    byte[] bval = Bytes.toBytes("this is a much larger byte array");
067
068    bf1.add(bkey, 0, bkey.length);
069    bf1.add(bval, 1, bval.length - 1);
070
071    assertTrue(BloomFilterUtil.contains(bkey, 0, bkey.length, new MultiByteBuff(bf1.bloom), 0,
072      (int) bf1.byteSize, bf1.hash, bf1.hashCount));
073    assertTrue(BloomFilterUtil.contains(bval, 1, bval.length - 1, new MultiByteBuff(bf1.bloom), 0,
074      (int) bf1.byteSize, bf1.hash, bf1.hashCount));
075    assertFalse(BloomFilterUtil.contains(bval, 0, bval.length, new MultiByteBuff(bf1.bloom), 0,
076      (int) bf1.byteSize, bf1.hash, bf1.hashCount));
077
078    // test 2: serialization & deserialization.
079    // (convert bloom to byte array & read byte array back in as input)
080    ByteArrayOutputStream bOut = new ByteArrayOutputStream();
081    bf1.writeBloom(new DataOutputStream(bOut));
082    ByteBuffer bb = ByteBuffer.wrap(bOut.toByteArray());
083    BloomFilterChunk newBf1 = new BloomFilterChunk(1000, (float) 0.01, Hash.MURMUR_HASH, 0);
084    assertTrue(BloomFilterUtil.contains(key1, 0, key1.length, new MultiByteBuff(bb), 0,
085      (int) newBf1.byteSize, newBf1.hash, newBf1.hashCount));
086    assertFalse(BloomFilterUtil.contains(key2, 0, key2.length, new MultiByteBuff(bb), 0,
087      (int) newBf1.byteSize, newBf1.hash, newBf1.hashCount));
088    assertTrue(BloomFilterUtil.contains(bkey, 0, bkey.length, new MultiByteBuff(bb), 0,
089      (int) newBf1.byteSize, newBf1.hash, newBf1.hashCount));
090    assertTrue(BloomFilterUtil.contains(bval, 1, bval.length - 1, new MultiByteBuff(bb), 0,
091      (int) newBf1.byteSize, newBf1.hash, newBf1.hashCount));
092    assertFalse(BloomFilterUtil.contains(bval, 0, bval.length, new MultiByteBuff(bb), 0,
093      (int) newBf1.byteSize, newBf1.hash, newBf1.hashCount));
094    assertFalse(BloomFilterUtil.contains(bval, 0, bval.length, new MultiByteBuff(bb), 0,
095      (int) newBf1.byteSize, newBf1.hash, newBf1.hashCount));
096
097    System.out.println("Serialized as " + bOut.size() + " bytes");
098    assertTrue(bOut.size() - bf1.byteSize < 10); // ... allow small padding
099  }
100
101  @Test
102  public void testBloomFold() throws Exception {
103    // test: foldFactor < log(max/actual)
104    BloomFilterChunk b = new BloomFilterChunk(1003, (float) 0.01, Hash.MURMUR_HASH, 2);
105    b.allocBloom();
106    long origSize = b.getByteSize();
107    assertEquals(1204, origSize);
108    for (int i = 0; i < 12; ++i) {
109      byte[] ib = Bytes.toBytes(i);
110      b.add(ib, 0, ib.length);
111    }
112    b.compactBloom();
113    assertEquals(origSize >> 2, b.getByteSize());
114    int falsePositives = 0;
115    for (int i = 0; i < 25; ++i) {
116      byte[] bytes = Bytes.toBytes(i);
117      if (
118        BloomFilterUtil.contains(bytes, 0, bytes.length, new MultiByteBuff(b.bloom), 0,
119          (int) b.byteSize, b.hash, b.hashCount)
120      ) {
121        if (i >= 12) falsePositives++;
122      } else {
123        assertFalse(i < 12);
124      }
125    }
126    assertTrue(falsePositives <= 1);
127
128    // test: foldFactor > log(max/actual)
129  }
130
131  @Test
132  public void testBloomPerf() throws Exception {
133    // add
134    float err = (float) 0.01;
135    BloomFilterChunk b = new BloomFilterChunk(10 * 1000 * 1000, (float) err, Hash.MURMUR_HASH, 3);
136    b.allocBloom();
137    long startTime = EnvironmentEdgeManager.currentTime();
138    long origSize = b.getByteSize();
139    for (int i = 0; i < 1 * 1000 * 1000; ++i) {
140      byte[] ib = Bytes.toBytes(i);
141      b.add(ib, 0, ib.length);
142    }
143    long endTime = EnvironmentEdgeManager.currentTime();
144    System.out.println("Total Add time = " + (endTime - startTime) + "ms");
145
146    // fold
147    startTime = EnvironmentEdgeManager.currentTime();
148    b.compactBloom();
149    endTime = EnvironmentEdgeManager.currentTime();
150    System.out.println("Total Fold time = " + (endTime - startTime) + "ms");
151    assertTrue(origSize >= b.getByteSize() << 3);
152
153    // test
154    startTime = EnvironmentEdgeManager.currentTime();
155    int falsePositives = 0;
156    for (int i = 0; i < 2 * 1000 * 1000; ++i) {
157
158      byte[] bytes = Bytes.toBytes(i);
159      if (
160        BloomFilterUtil.contains(bytes, 0, bytes.length, new MultiByteBuff(b.bloom), 0,
161          (int) b.byteSize, b.hash, b.hashCount)
162      ) {
163        if (i >= 1 * 1000 * 1000) falsePositives++;
164      } else {
165        assertFalse(i < 1 * 1000 * 1000);
166      }
167    }
168    endTime = EnvironmentEdgeManager.currentTime();
169    System.out.println("Total Contains time = " + (endTime - startTime) + "ms");
170    System.out.println("False Positive = " + falsePositives);
171    assertTrue(falsePositives <= (1 * 1000 * 1000) * err);
172
173    // test: foldFactor > log(max/actual)
174  }
175
176  @Test
177  public void testSizing() {
178    int bitSize = 8 * 128 * 1024; // 128 KB
179    double errorRate = 0.025; // target false positive rate
180
181    // How many keys can we store in a Bloom filter of this size maintaining
182    // the given false positive rate, not taking into account that the n
183    long maxKeys = BloomFilterUtil.idealMaxKeys(bitSize, errorRate);
184    assertEquals(136570, maxKeys);
185
186    // A reverse operation: how many bits would we need to store this many keys
187    // and keep the same low false positive rate?
188    long bitSize2 = BloomFilterUtil.computeBitSize(maxKeys, errorRate);
189
190    // The bit size comes out a little different due to rounding.
191    assertTrue(Math.abs(bitSize2 - bitSize) * 1.0 / bitSize < 1e-5);
192  }
193
194  @Test
195  public void testFoldableByteSize() {
196    assertEquals(128, BloomFilterUtil.computeFoldableByteSize(1000, 5));
197    assertEquals(640, BloomFilterUtil.computeFoldableByteSize(5001, 4));
198  }
199}