001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.util; 019 020import static org.junit.Assert.assertEquals; 021import static org.junit.Assert.assertFalse; 022import static org.junit.Assert.assertTrue; 023 024import java.io.ByteArrayOutputStream; 025import java.io.DataOutputStream; 026import java.nio.ByteBuffer; 027import org.apache.hadoop.hbase.HBaseClassTestRule; 028import org.apache.hadoop.hbase.nio.MultiByteBuff; 029import org.apache.hadoop.hbase.testclassification.MiscTests; 030import org.apache.hadoop.hbase.testclassification.SmallTests; 031import org.junit.ClassRule; 032import org.junit.Test; 033import org.junit.experimental.categories.Category; 034 035@Category({ MiscTests.class, SmallTests.class }) 036public class TestBloomFilterChunk { 037 038 @ClassRule 039 public static final HBaseClassTestRule CLASS_RULE = 040 HBaseClassTestRule.forClass(TestBloomFilterChunk.class); 041 042 @Test 043 public void testBasicBloom() throws Exception { 044 BloomFilterChunk bf1 = new BloomFilterChunk(1000, (float) 0.01, Hash.MURMUR_HASH, 0); 045 BloomFilterChunk bf2 = new BloomFilterChunk(1000, (float) 0.01, Hash.MURMUR_HASH, 0); 046 bf1.allocBloom(); 047 bf2.allocBloom(); 048 049 // test 1: verify no fundamental false negatives or positives 050 byte[] key1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9 }; 051 byte[] key2 = { 1, 2, 3, 4, 5, 6, 7, 8, 7 }; 052 053 bf1.add(key1, 0, key1.length); 054 bf2.add(key2, 0, key2.length); 055 056 assertTrue(BloomFilterUtil.contains(key1, 0, key1.length, new MultiByteBuff(bf1.bloom), 0, 057 (int) bf1.byteSize, bf1.hash, bf1.hashCount)); 058 assertFalse(BloomFilterUtil.contains(key2, 0, key2.length, new MultiByteBuff(bf1.bloom), 0, 059 (int) bf1.byteSize, bf1.hash, bf1.hashCount)); 060 assertFalse(BloomFilterUtil.contains(key1, 0, key1.length, new MultiByteBuff(bf2.bloom), 0, 061 (int) bf2.byteSize, bf2.hash, bf2.hashCount)); 062 assertTrue(BloomFilterUtil.contains(key2, 0, key2.length, new MultiByteBuff(bf2.bloom), 0, 063 (int) bf2.byteSize, bf2.hash, bf2.hashCount)); 064 065 byte[] bkey = { 1, 2, 3, 4 }; 066 byte[] bval = Bytes.toBytes("this is a much larger byte array"); 067 068 bf1.add(bkey, 0, bkey.length); 069 bf1.add(bval, 1, bval.length - 1); 070 071 assertTrue(BloomFilterUtil.contains(bkey, 0, bkey.length, new MultiByteBuff(bf1.bloom), 0, 072 (int) bf1.byteSize, bf1.hash, bf1.hashCount)); 073 assertTrue(BloomFilterUtil.contains(bval, 1, bval.length - 1, new MultiByteBuff(bf1.bloom), 0, 074 (int) bf1.byteSize, bf1.hash, bf1.hashCount)); 075 assertFalse(BloomFilterUtil.contains(bval, 0, bval.length, new MultiByteBuff(bf1.bloom), 0, 076 (int) bf1.byteSize, bf1.hash, bf1.hashCount)); 077 078 // test 2: serialization & deserialization. 079 // (convert bloom to byte array & read byte array back in as input) 080 ByteArrayOutputStream bOut = new ByteArrayOutputStream(); 081 bf1.writeBloom(new DataOutputStream(bOut)); 082 ByteBuffer bb = ByteBuffer.wrap(bOut.toByteArray()); 083 BloomFilterChunk newBf1 = new BloomFilterChunk(1000, (float) 0.01, Hash.MURMUR_HASH, 0); 084 assertTrue(BloomFilterUtil.contains(key1, 0, key1.length, new MultiByteBuff(bb), 0, 085 (int) newBf1.byteSize, newBf1.hash, newBf1.hashCount)); 086 assertFalse(BloomFilterUtil.contains(key2, 0, key2.length, new MultiByteBuff(bb), 0, 087 (int) newBf1.byteSize, newBf1.hash, newBf1.hashCount)); 088 assertTrue(BloomFilterUtil.contains(bkey, 0, bkey.length, new MultiByteBuff(bb), 0, 089 (int) newBf1.byteSize, newBf1.hash, newBf1.hashCount)); 090 assertTrue(BloomFilterUtil.contains(bval, 1, bval.length - 1, new MultiByteBuff(bb), 0, 091 (int) newBf1.byteSize, newBf1.hash, newBf1.hashCount)); 092 assertFalse(BloomFilterUtil.contains(bval, 0, bval.length, new MultiByteBuff(bb), 0, 093 (int) newBf1.byteSize, newBf1.hash, newBf1.hashCount)); 094 assertFalse(BloomFilterUtil.contains(bval, 0, bval.length, new MultiByteBuff(bb), 0, 095 (int) newBf1.byteSize, newBf1.hash, newBf1.hashCount)); 096 097 System.out.println("Serialized as " + bOut.size() + " bytes"); 098 assertTrue(bOut.size() - bf1.byteSize < 10); // ... allow small padding 099 } 100 101 @Test 102 public void testBloomFold() throws Exception { 103 // test: foldFactor < log(max/actual) 104 BloomFilterChunk b = new BloomFilterChunk(1003, (float) 0.01, Hash.MURMUR_HASH, 2); 105 b.allocBloom(); 106 long origSize = b.getByteSize(); 107 assertEquals(1204, origSize); 108 for (int i = 0; i < 12; ++i) { 109 byte[] ib = Bytes.toBytes(i); 110 b.add(ib, 0, ib.length); 111 } 112 b.compactBloom(); 113 assertEquals(origSize >> 2, b.getByteSize()); 114 int falsePositives = 0; 115 for (int i = 0; i < 25; ++i) { 116 byte[] bytes = Bytes.toBytes(i); 117 if ( 118 BloomFilterUtil.contains(bytes, 0, bytes.length, new MultiByteBuff(b.bloom), 0, 119 (int) b.byteSize, b.hash, b.hashCount) 120 ) { 121 if (i >= 12) falsePositives++; 122 } else { 123 assertFalse(i < 12); 124 } 125 } 126 assertTrue(falsePositives <= 1); 127 128 // test: foldFactor > log(max/actual) 129 } 130 131 @Test 132 public void testBloomPerf() throws Exception { 133 // add 134 float err = (float) 0.01; 135 BloomFilterChunk b = new BloomFilterChunk(10 * 1000 * 1000, (float) err, Hash.MURMUR_HASH, 3); 136 b.allocBloom(); 137 long startTime = EnvironmentEdgeManager.currentTime(); 138 long origSize = b.getByteSize(); 139 for (int i = 0; i < 1 * 1000 * 1000; ++i) { 140 byte[] ib = Bytes.toBytes(i); 141 b.add(ib, 0, ib.length); 142 } 143 long endTime = EnvironmentEdgeManager.currentTime(); 144 System.out.println("Total Add time = " + (endTime - startTime) + "ms"); 145 146 // fold 147 startTime = EnvironmentEdgeManager.currentTime(); 148 b.compactBloom(); 149 endTime = EnvironmentEdgeManager.currentTime(); 150 System.out.println("Total Fold time = " + (endTime - startTime) + "ms"); 151 assertTrue(origSize >= b.getByteSize() << 3); 152 153 // test 154 startTime = EnvironmentEdgeManager.currentTime(); 155 int falsePositives = 0; 156 for (int i = 0; i < 2 * 1000 * 1000; ++i) { 157 158 byte[] bytes = Bytes.toBytes(i); 159 if ( 160 BloomFilterUtil.contains(bytes, 0, bytes.length, new MultiByteBuff(b.bloom), 0, 161 (int) b.byteSize, b.hash, b.hashCount) 162 ) { 163 if (i >= 1 * 1000 * 1000) falsePositives++; 164 } else { 165 assertFalse(i < 1 * 1000 * 1000); 166 } 167 } 168 endTime = EnvironmentEdgeManager.currentTime(); 169 System.out.println("Total Contains time = " + (endTime - startTime) + "ms"); 170 System.out.println("False Positive = " + falsePositives); 171 assertTrue(falsePositives <= (1 * 1000 * 1000) * err); 172 173 // test: foldFactor > log(max/actual) 174 } 175 176 @Test 177 public void testSizing() { 178 int bitSize = 8 * 128 * 1024; // 128 KB 179 double errorRate = 0.025; // target false positive rate 180 181 // How many keys can we store in a Bloom filter of this size maintaining 182 // the given false positive rate, not taking into account that the n 183 long maxKeys = BloomFilterUtil.idealMaxKeys(bitSize, errorRate); 184 assertEquals(136570, maxKeys); 185 186 // A reverse operation: how many bits would we need to store this many keys 187 // and keep the same low false positive rate? 188 long bitSize2 = BloomFilterUtil.computeBitSize(maxKeys, errorRate); 189 190 // The bit size comes out a little different due to rounding. 191 assertTrue(Math.abs(bitSize2 - bitSize) * 1.0 / bitSize < 1e-5); 192 } 193 194 @Test 195 public void testFoldableByteSize() { 196 assertEquals(128, BloomFilterUtil.computeFoldableByteSize(1000, 5)); 197 assertEquals(640, BloomFilterUtil.computeFoldableByteSize(5001, 4)); 198 } 199}