001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.util; 019 020import java.io.ByteArrayOutputStream; 021import java.io.DataOutputStream; 022import java.nio.ByteBuffer; 023import junit.framework.TestCase; 024import org.apache.hadoop.hbase.HBaseClassTestRule; 025import org.apache.hadoop.hbase.nio.MultiByteBuff; 026import org.apache.hadoop.hbase.testclassification.MiscTests; 027import org.apache.hadoop.hbase.testclassification.SmallTests; 028import org.junit.ClassRule; 029import org.junit.experimental.categories.Category; 030 031@Category({MiscTests.class, SmallTests.class}) 032public class TestBloomFilterChunk extends TestCase { 033 034 @ClassRule 035 public static final HBaseClassTestRule CLASS_RULE = 036 HBaseClassTestRule.forClass(TestBloomFilterChunk.class); 037 038 public void testBasicBloom() throws Exception { 039 BloomFilterChunk bf1 = new BloomFilterChunk(1000, (float)0.01, Hash.MURMUR_HASH, 0); 040 BloomFilterChunk bf2 = new BloomFilterChunk(1000, (float)0.01, Hash.MURMUR_HASH, 0); 041 bf1.allocBloom(); 042 bf2.allocBloom(); 043 044 // test 1: verify no fundamental false negatives or positives 045 byte[] key1 = {1,2,3,4,5,6,7,8,9}; 046 byte[] key2 = {1,2,3,4,5,6,7,8,7}; 047 048 bf1.add(key1, 0, key1.length); 049 bf2.add(key2, 0, key2.length); 050 051 assertTrue(BloomFilterUtil.contains(key1, 0, key1.length, new MultiByteBuff(bf1.bloom), 0, 052 (int) bf1.byteSize, bf1.hash, bf1.hashCount)); 053 assertFalse(BloomFilterUtil.contains(key2, 0, key2.length, new MultiByteBuff(bf1.bloom), 0, 054 (int) bf1.byteSize, bf1.hash, bf1.hashCount)); 055 assertFalse(BloomFilterUtil.contains(key1, 0, key1.length, new MultiByteBuff(bf2.bloom), 0, 056 (int) bf2.byteSize, bf2.hash, bf2.hashCount)); 057 assertTrue(BloomFilterUtil.contains(key2, 0, key2.length, new MultiByteBuff(bf2.bloom), 0, 058 (int) bf2.byteSize, bf2.hash, bf2.hashCount)); 059 060 byte [] bkey = {1,2,3,4}; 061 byte [] bval = "this is a much larger byte array".getBytes(); 062 063 bf1.add(bkey, 0, bkey.length); 064 bf1.add(bval, 1, bval.length-1); 065 066 assertTrue(BloomFilterUtil.contains(bkey, 0, bkey.length, new MultiByteBuff(bf1.bloom), 0, 067 (int) bf1.byteSize, bf1.hash, bf1.hashCount)); 068 assertTrue(BloomFilterUtil.contains(bval, 1, bval.length - 1, new MultiByteBuff(bf1.bloom), 069 0, (int) bf1.byteSize, bf1.hash, bf1.hashCount)); 070 assertFalse(BloomFilterUtil.contains(bval, 0, bval.length, new MultiByteBuff(bf1.bloom), 0, 071 (int) bf1.byteSize, bf1.hash, bf1.hashCount)); 072 073 // test 2: serialization & deserialization. 074 // (convert bloom to byte array & read byte array back in as input) 075 ByteArrayOutputStream bOut = new ByteArrayOutputStream(); 076 bf1.writeBloom(new DataOutputStream(bOut)); 077 ByteBuffer bb = ByteBuffer.wrap(bOut.toByteArray()); 078 BloomFilterChunk newBf1 = new BloomFilterChunk(1000, (float)0.01, 079 Hash.MURMUR_HASH, 0); 080 assertTrue(BloomFilterUtil.contains(key1, 0, key1.length, new MultiByteBuff(bb), 0, 081 (int) newBf1.byteSize, newBf1.hash, newBf1.hashCount)); 082 assertFalse(BloomFilterUtil.contains(key2, 0, key2.length, new MultiByteBuff(bb), 0, 083 (int) newBf1.byteSize, newBf1.hash, newBf1.hashCount)); 084 assertTrue(BloomFilterUtil.contains(bkey, 0, bkey.length, new MultiByteBuff(bb), 0, 085 (int) newBf1.byteSize, newBf1.hash, newBf1.hashCount)); 086 assertTrue(BloomFilterUtil.contains(bval, 1, bval.length - 1, new MultiByteBuff(bb), 0, 087 (int) newBf1.byteSize, newBf1.hash, newBf1.hashCount)); 088 assertFalse(BloomFilterUtil.contains(bval, 0, bval.length, new MultiByteBuff(bb), 0, 089 (int) newBf1.byteSize, newBf1.hash, newBf1.hashCount)); 090 assertFalse(BloomFilterUtil.contains(bval, 0, bval.length, new MultiByteBuff(bb), 0, 091 (int) newBf1.byteSize, newBf1.hash, newBf1.hashCount)); 092 093 System.out.println("Serialized as " + bOut.size() + " bytes"); 094 assertTrue(bOut.size() - bf1.byteSize < 10); //... allow small padding 095 } 096 097 public void testBloomFold() throws Exception { 098 // test: foldFactor < log(max/actual) 099 BloomFilterChunk b = new BloomFilterChunk(1003, (float) 0.01, 100 Hash.MURMUR_HASH, 2); 101 b.allocBloom(); 102 long origSize = b.getByteSize(); 103 assertEquals(1204, origSize); 104 for (int i = 0; i < 12; ++i) { 105 byte[] ib = Bytes.toBytes(i); 106 b.add(ib, 0, ib.length); 107 } 108 b.compactBloom(); 109 assertEquals(origSize>>2, b.getByteSize()); 110 int falsePositives = 0; 111 for (int i = 0; i < 25; ++i) { 112 byte[] bytes = Bytes.toBytes(i); 113 if (BloomFilterUtil.contains(bytes, 0, bytes.length, new MultiByteBuff(b.bloom), 0, 114 (int) b.byteSize, b.hash, b.hashCount)) { 115 if (i >= 12) 116 falsePositives++; 117 } else { 118 assertFalse(i < 12); 119 } 120 } 121 assertTrue(falsePositives <= 1); 122 123 // test: foldFactor > log(max/actual) 124 } 125 126 public void testBloomPerf() throws Exception { 127 // add 128 float err = (float)0.01; 129 BloomFilterChunk b = new BloomFilterChunk(10*1000*1000, (float)err, Hash.MURMUR_HASH, 3); 130 b.allocBloom(); 131 long startTime = System.currentTimeMillis(); 132 long origSize = b.getByteSize(); 133 for (int i = 0; i < 1*1000*1000; ++i) { 134 byte[] ib = Bytes.toBytes(i); 135 b.add(ib, 0, ib.length); 136 } 137 long endTime = System.currentTimeMillis(); 138 System.out.println("Total Add time = " + (endTime - startTime) + "ms"); 139 140 // fold 141 startTime = System.currentTimeMillis(); 142 b.compactBloom(); 143 endTime = System.currentTimeMillis(); 144 System.out.println("Total Fold time = " + (endTime - startTime) + "ms"); 145 assertTrue(origSize >= b.getByteSize()<<3); 146 147 // test 148 startTime = System.currentTimeMillis(); 149 int falsePositives = 0; 150 for (int i = 0; i < 2*1000*1000; ++i) { 151 152 byte[] bytes = Bytes.toBytes(i); 153 if (BloomFilterUtil.contains(bytes, 0, bytes.length, new MultiByteBuff(b.bloom), 0, 154 (int) b.byteSize, b.hash, b.hashCount)) { 155 if (i >= 1 * 1000 * 1000) 156 falsePositives++; 157 } else { 158 assertFalse(i < 1*1000*1000); 159 } 160 } 161 endTime = System.currentTimeMillis(); 162 System.out.println("Total Contains time = " + (endTime - startTime) + "ms"); 163 System.out.println("False Positive = " + falsePositives); 164 assertTrue(falsePositives <= (1*1000*1000)*err); 165 166 // test: foldFactor > log(max/actual) 167 } 168 169 public void testSizing() { 170 int bitSize = 8 * 128 * 1024; // 128 KB 171 double errorRate = 0.025; // target false positive rate 172 173 // How many keys can we store in a Bloom filter of this size maintaining 174 // the given false positive rate, not taking into account that the n 175 long maxKeys = BloomFilterUtil.idealMaxKeys(bitSize, errorRate); 176 assertEquals(136570, maxKeys); 177 178 // A reverse operation: how many bits would we need to store this many keys 179 // and keep the same low false positive rate? 180 long bitSize2 = BloomFilterUtil.computeBitSize(maxKeys, errorRate); 181 182 // The bit size comes out a little different due to rounding. 183 assertTrue(Math.abs(bitSize2 - bitSize) * 1.0 / bitSize < 1e-5); 184 } 185 186 public void testFoldableByteSize() { 187 assertEquals(128, BloomFilterUtil.computeFoldableByteSize(1000, 5)); 188 assertEquals(640, BloomFilterUtil.computeFoldableByteSize(5001, 4)); 189 } 190 191 192} 193