/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.filter;

import static org.junit.Assert.*;

import java.util.regex.Pattern;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.filter.RegexStringComparator.EngineType;
import org.apache.hadoop.hbase.testclassification.FilterTests;
import org.apache.hadoop.hbase.testclassification.SmallTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;

/**
 * Tests for {@link RegexStringComparator}: a serialization round trip for each engine type, and
 * a shared table of regex/haystack pairs that is run against both the JAVA and the JONI engine.
 */
@Category({FilterTests.class, SmallTests.class})
public class TestRegexComparator {

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
      HBaseClassTestRule.forClass(TestRegexComparator.class);

  /**
   * Serializes a comparator with {@code toByteArray()}, parses it back with {@code parseFrom},
   * and checks that the serialized fields and the engine implementation survive the round trip.
   */
  @Test
  public void testSerialization() throws Exception {
    // Default engine is the Java engine
    RegexStringComparator a = new RegexStringComparator("a|b");
    RegexStringComparator b = RegexStringComparator.parseFrom(a.toByteArray());
    assertTrue(a.areSerializedFieldsEqual(b));
    assertTrue(b.getEngine() instanceof RegexStringComparator.JavaRegexEngine);

    // joni engine
    a = new RegexStringComparator("a|b", EngineType.JONI);
    b = RegexStringComparator.parseFrom(a.toByteArray());
    assertTrue(a.areSerializedFieldsEqual(b));
    assertTrue(b.getEngine() instanceof RegexStringComparator.JoniRegexEngine);
  }

  /** Runs every entry of {@link #TEST_CASES} against the JAVA engine. */
  @Test
  public void testJavaEngine() throws Exception {
    assertAllTestCases(EngineType.JAVA);
  }

  /** Runs every entry of {@link #TEST_CASES} against the JONI engine. */
  @Test
  public void testJoniEngine() throws Exception {
    assertAllTestCases(EngineType.JONI);
  }

  /**
   * Builds a comparator for each test case with the given engine and checks the outcome.
   * A {@code compareTo} result of {@code 0} is treated as "matched".
   *
   * @param engineType the regex engine the comparators are created with
   */
  private static void assertAllTestCases(EngineType engineType) {
    for (TestCase t : TEST_CASES) {
      boolean matched = new RegexStringComparator(t.regex, t.flags, engineType)
          .compareTo(Bytes.toBytes(t.haystack)) == 0;
      // JUnit's argument order is (message, expected, actual); keeping it right makes the
      // failure output report the table value as "expected" and the engine result as "actual".
      assertEquals("Regex '" + t.regex + "' failed test '" + t.haystack + "'", t.expected,
        matched);
    }
  }

  /** One row of the test table: a pattern, its flags, the input, and the expected outcome. */
  private static final class TestCase {
    final String regex;
    final String haystack;
    final int flags;
    final boolean expected;

    /**
     * Creates a case that uses {@link Pattern#DOTALL} as its flags.
     *
     * @param regex the pattern under test
     * @param haystack the input the pattern is applied to
     * @param expected whether the comparator is expected to report a match
     */
    public TestCase(String regex, String haystack, boolean expected) {
      this(regex, Pattern.DOTALL, haystack, expected);
    }

    /**
     * Creates a case with explicit flags.
     *
     * @param regex the pattern under test
     * @param flags flags handed to the {@link RegexStringComparator} constructor
     * @param haystack the input the pattern is applied to
     * @param expected whether the comparator is expected to report a match
     */
    public TestCase(String regex, int flags, String haystack, boolean expected) {
      this.regex = regex;
      this.flags = flags;
      this.haystack = haystack;
      this.expected = expected;
    }
  }

  // These are a subset of the regex tests from OpenJDK 7
  private static final TestCase[] TEST_CASES = {
    new TestCase("a|b", "a", true),
    new TestCase("a|b", "b", true),
    new TestCase("a|b", Pattern.CASE_INSENSITIVE, "A", true),
    new TestCase("a|b", Pattern.CASE_INSENSITIVE, "B", true),
    new TestCase("a|b", "z", false),
    new TestCase("a|b|cd", "cd", true),
    new TestCase("z(a|ac)b", "zacb", true),
    new TestCase("[abc]+", "ababab", true),
    new TestCase("[abc]+", "defg", false),
    new TestCase("[abc]+[def]+[ghi]+", "zzzaaddggzzz", true),
    new TestCase("[a-\\u4444]+", "za-9z", true),
    new TestCase("[^abc]+", "ababab", false),
    new TestCase("[^abc]+", "aaabbbcccdefg", true),
    new TestCase("[abc^b]", "b", true),
    new TestCase("[abc[def]]", "b", true),
    new TestCase("[abc[def]]", "e", true),
    new TestCase("[a-c[d-f[g-i]]]", "h", true),
    new TestCase("[a-c[d-f[g-i]]m]", "m", true),
    new TestCase("[a-c&&[d-f]]", "a", false),
    new TestCase("[a-c&&[d-f]]", "z", false),
    new TestCase("[a-m&&m-z&&a-c]", "m", false),
    new TestCase("[a-m&&m-z&&a-z]", "m", true),
    new TestCase("[[a-m]&&[^a-c]]", "a", false),
    new TestCase("[[a-m]&&[^a-c]]", "d", true),
    new TestCase("[[a-c][d-f]&&abc[def]]", "e", true),
    new TestCase("[[a-c]&&[b-d]&&[c-e]]", "c", true),
    new TestCase("[[a-c]&&[b-d][c-e]&&[u-z]]", "c", false),
    new TestCase("[[a]&&[b][c][a]&&[^d]]", "a", true),
    new TestCase("[[a]&&[b][c][a]&&[^d]]", "d", false),
    new TestCase("[[[a-d]&&[c-f]]&&[c]&&c&&[cde]]", "c", true),
    new TestCase("[x[[wz]abc&&bcd[z]]&&[u-z]]", "z", true),
    new TestCase("a.c.+", "a#c%&", true),
    new TestCase("ab.", "ab\n", true),
    new TestCase("(?s)ab.", "ab\n", true),
    new TestCase("ab\\wc", "abcc", true),
    new TestCase("\\W\\w\\W", "#r#", true),
    new TestCase("\\W\\w\\W", "rrrr#ggg", false),
    new TestCase("abc[\\sdef]*", "abc def", true),
    new TestCase("abc[\\sy-z]*", "abc y z", true),
    new TestCase("abc[a-d\\sm-p]*", "abcaa mn p", true),
    new TestCase("\\s\\s\\s", "blah err", false),
    new TestCase("\\S\\S\\s", "blah err", true),
    new TestCase("ab\\dc", "ab9c", true),
    new TestCase("\\d\\d\\d", "blah45", false),
    new TestCase("^abc", "abcdef", true),
    new TestCase("^abc", "bcdabc", false),
    new TestCase("^(a)?a", "a", true),
    new TestCase("^(aa(bb)?)+$", "aabbaa", true),
    new TestCase("((a|b)?b)+", "b", true),
    new TestCase("^(a(b)?)+$", "aba", true),
    new TestCase("^(a(b(c)?)?)?abc", "abc", true),
    new TestCase("^(a(b(c))).*", "abc", true),
    new TestCase("a?b", "aaaab", true),
    new TestCase("a?b", "aaacc", false),
    new TestCase("a??b", "aaaab", true),
    new TestCase("a??b", "aaacc", false),
    new TestCase("a?+b", "aaaab", true),
    new TestCase("a?+b", "aaacc", false),
    new TestCase("a+b", "aaaab", true),
    new TestCase("a+b", "aaacc", false),
    new TestCase("a+?b", "aaaab", true),
    new TestCase("a+?b", "aaacc", false),
    new TestCase("a++b", "aaaab", true),
    new TestCase("a++b", "aaacc", false),
    new TestCase("a{2,3}", "a", false),
    new TestCase("a{2,3}", "aa", true),
    new TestCase("a{2,3}", "aaa", true),
    new TestCase("a{3,}", "zzzaaaazzz", true),
    new TestCase("a{3,}", "zzzaazzz", false),
    new TestCase("abc(?=d)", "zzzabcd", true),
    new TestCase("abc(?=d)", "zzzabced", false),
    new TestCase("abc(?!d)", "zzabcd", false),
    new TestCase("abc(?!d)", "zzabced", true),
    new TestCase("\\w(?<=a)", "###abc###", true),
    new TestCase("\\w(?<=a)", "###ert###", false),
    new TestCase("(?<!a)c", "bc", true),
    new TestCase("(?<!a)c", "ac", false),
    new TestCase("(a+b)+", "ababab", true),
    new TestCase("(a+b)+", "accccd", false),
    new TestCase("(ab)+", "ababab", true),
    new TestCase("(ab)+", "accccd", false),
    new TestCase("(ab)(cd*)", "zzzabczzz", true),
    new TestCase("abc(d)*abc", "abcdddddabc", true),
    new TestCase("a*b", "aaaab", true),
    new TestCase("a*b", "b", true),
    new TestCase("a*b", "aaaac", false),
    new TestCase(".*?b", "aaaab", true),
    new TestCase("a*+b", "aaaab", true),
    new TestCase("a*+b", "b", true),
    new TestCase("a*+b", "aaaac", false),
    new TestCase("(?i)foobar", "fOobAr", true),
    new TestCase("f(?i)oobar", "fOobAr", true),
    new TestCase("f(?i)oobar", "FOobAr", false),
    new TestCase("foo(?i)bar", "fOobAr", false),
    new TestCase("(?i)foo[bar]+", "foObAr", true),
    new TestCase("(?i)foo[a-r]+", "foObAr", true),
    new TestCase("abc(?x)blah", "abcblah", true),
    new TestCase("abc(?x) blah", "abcblah", true),
    new TestCase("abc(?x) blah blech", "abcblahblech", true),
    new TestCase("[\\n-#]", "!", true),
    new TestCase("[\\n-#]", "-", false),
    new TestCase("[\\043]+", "blahblah#blech", true),
    new TestCase("[\\042-\\044]+", "blahblah#blech", true),
    new TestCase("[\\u1234-\\u1236]", "blahblah\u1235blech", true),
    new TestCase("[^\043]*", "blahblah#blech", true),
    new TestCase("(|f)?+", "foo", true),
  };
}