001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.filter;
019
020import static org.junit.Assert.*;
021
022import java.util.regex.Pattern;
023import org.apache.hadoop.hbase.HBaseClassTestRule;
024import org.apache.hadoop.hbase.filter.RegexStringComparator.EngineType;
025import org.apache.hadoop.hbase.testclassification.FilterTests;
026import org.apache.hadoop.hbase.testclassification.SmallTests;
027import org.apache.hadoop.hbase.util.Bytes;
028import org.junit.ClassRule;
029import org.junit.Test;
030import org.junit.experimental.categories.Category;
031
032@Category({FilterTests.class, SmallTests.class})
033public class TestRegexComparator {
034
035  @ClassRule
036  public static final HBaseClassTestRule CLASS_RULE =
037      HBaseClassTestRule.forClass(TestRegexComparator.class);
038
039  @Test
040  public void testSerialization() throws Exception {
041    // Default engine is the Java engine
042    RegexStringComparator a = new RegexStringComparator("a|b");
043    RegexStringComparator b = RegexStringComparator.parseFrom(a.toByteArray());
044    assertTrue(a.areSerializedFieldsEqual(b));
045    assertTrue(b.getEngine() instanceof RegexStringComparator.JavaRegexEngine);
046
047    // joni engine
048    a = new RegexStringComparator("a|b", EngineType.JONI);
049    b = RegexStringComparator.parseFrom(a.toByteArray());
050    assertTrue(a.areSerializedFieldsEqual(b));
051    assertTrue(b.getEngine() instanceof RegexStringComparator.JoniRegexEngine);
052  }
053
054  @Test
055  public void testJavaEngine() throws Exception {
056    for (TestCase t: TEST_CASES) {
057      boolean result = new RegexStringComparator(t.regex, t.flags, EngineType.JAVA)
058        .compareTo(Bytes.toBytes(t.haystack)) == 0;
059      assertEquals("Regex '" + t.regex + "' failed test '" + t.haystack + "'", result,
060        t.expected);
061    }
062  }
063
064  @Test
065  public void testJoniEngine() throws Exception {
066    for (TestCase t: TEST_CASES) {
067      boolean result = new RegexStringComparator(t.regex, t.flags, EngineType.JONI)
068        .compareTo(Bytes.toBytes(t.haystack)) == 0;
069      assertEquals("Regex '" + t.regex + "' failed test '" + t.haystack + "'", result,
070        t.expected);
071    }
072  }
073
074  private static class TestCase {
075    String regex;
076    String haystack;
077    int flags;
078    boolean expected;
079
080    public TestCase(String regex, String haystack, boolean expected) {
081      this(regex, Pattern.DOTALL, haystack, expected);
082    }
083
084    public TestCase(String regex, int flags, String haystack, boolean expected) {
085      this.regex = regex;
086      this.flags = flags;
087      this.haystack = haystack;
088      this.expected = expected;
089    }
090  }
091
092  // These are a subset of the regex tests from OpenJDK 7
093  private static TestCase TEST_CASES[] = {
094    new TestCase("a|b", "a", true),
095    new TestCase("a|b", "b", true),
096    new TestCase("a|b", Pattern.CASE_INSENSITIVE, "A", true),
097    new TestCase("a|b", Pattern.CASE_INSENSITIVE, "B", true),
098    new TestCase("a|b", "z", false),
099    new TestCase("a|b|cd", "cd", true),
100    new TestCase("z(a|ac)b", "zacb", true),
101    new TestCase("[abc]+", "ababab", true),
102    new TestCase("[abc]+", "defg", false),
103    new TestCase("[abc]+[def]+[ghi]+", "zzzaaddggzzz", true),
104    new TestCase("[a-\\u4444]+", "za-9z", true),
105    new TestCase("[^abc]+", "ababab", false),
106    new TestCase("[^abc]+", "aaabbbcccdefg", true),
107    new TestCase("[abc^b]", "b", true),
108    new TestCase("[abc[def]]", "b", true),
109    new TestCase("[abc[def]]", "e", true),
110    new TestCase("[a-c[d-f[g-i]]]", "h", true),
111    new TestCase("[a-c[d-f[g-i]]m]", "m", true),
112    new TestCase("[a-c&&[d-f]]", "a", false),
113    new TestCase("[a-c&&[d-f]]", "z", false),
114    new TestCase("[a-m&&m-z&&a-c]", "m", false),
115    new TestCase("[a-m&&m-z&&a-z]", "m", true),
116    new TestCase("[[a-m]&&[^a-c]]", "a", false),
117    new TestCase("[[a-m]&&[^a-c]]", "d", true),
118    new TestCase("[[a-c][d-f]&&abc[def]]", "e", true),
119    new TestCase("[[a-c]&&[b-d]&&[c-e]]", "c", true),
120    new TestCase("[[a-c]&&[b-d][c-e]&&[u-z]]", "c", false),
121    new TestCase("[[a]&&[b][c][a]&&[^d]]", "a", true),
122    new TestCase("[[a]&&[b][c][a]&&[^d]]", "d", false),
123    new TestCase("[[[a-d]&&[c-f]]&&[c]&&c&&[cde]]", "c", true),
124    new TestCase("[x[[wz]abc&&bcd[z]]&&[u-z]]", "z", true),
125    new TestCase("a.c.+", "a#c%&", true),
126    new TestCase("ab.", "ab\n", true),
127    new TestCase("(?s)ab.", "ab\n", true),
128    new TestCase("ab\\wc", "abcc", true),
129    new TestCase("\\W\\w\\W", "#r#", true),
130    new TestCase("\\W\\w\\W", "rrrr#ggg", false),
131    new TestCase("abc[\\sdef]*", "abc  def", true),
132    new TestCase("abc[\\sy-z]*", "abc y z", true),
133    new TestCase("abc[a-d\\sm-p]*", "abcaa mn  p", true),
134    new TestCase("\\s\\s\\s", "blah  err", false),
135    new TestCase("\\S\\S\\s", "blah  err", true),
136    new TestCase("ab\\dc", "ab9c", true),
137    new TestCase("\\d\\d\\d", "blah45", false),
138    new TestCase("^abc", "abcdef", true),
139    new TestCase("^abc", "bcdabc", false),
140    new TestCase("^(a)?a", "a", true),
141    new TestCase("^(aa(bb)?)+$", "aabbaa", true),
142    new TestCase("((a|b)?b)+", "b", true),
143    new TestCase("^(a(b)?)+$", "aba", true),
144    new TestCase("^(a(b(c)?)?)?abc", "abc", true),
145    new TestCase("^(a(b(c))).*", "abc", true),
146    new TestCase("a?b", "aaaab", true),
147    new TestCase("a?b", "aaacc", false),
148    new TestCase("a??b", "aaaab", true),
149    new TestCase("a??b", "aaacc", false),
150    new TestCase("a?+b", "aaaab", true),
151    new TestCase("a?+b", "aaacc", false),
152    new TestCase("a+b", "aaaab", true),
153    new TestCase("a+b", "aaacc", false),
154    new TestCase("a+?b", "aaaab", true),
155    new TestCase("a+?b", "aaacc", false),
156    new TestCase("a++b", "aaaab", true),
157    new TestCase("a++b", "aaacc", false),
158    new TestCase("a{2,3}", "a", false),
159    new TestCase("a{2,3}", "aa", true),
160    new TestCase("a{2,3}", "aaa", true),
161    new TestCase("a{3,}", "zzzaaaazzz", true),
162    new TestCase("a{3,}", "zzzaazzz", false),
163    new TestCase("abc(?=d)", "zzzabcd", true),
164    new TestCase("abc(?=d)", "zzzabced", false),
165    new TestCase("abc(?!d)", "zzabcd", false),
166    new TestCase("abc(?!d)", "zzabced", true),
167    new TestCase("\\w(?<=a)", "###abc###", true),
168    new TestCase("\\w(?<=a)", "###ert###", false),
169    new TestCase("(?<!a)c", "bc", true),
170    new TestCase("(?<!a)c", "ac", false),
171    new TestCase("(a+b)+", "ababab", true),
172    new TestCase("(a+b)+", "accccd", false),
173    new TestCase("(ab)+", "ababab", true),
174    new TestCase("(ab)+", "accccd", false),
175    new TestCase("(ab)(cd*)", "zzzabczzz", true),
176    new TestCase("abc(d)*abc", "abcdddddabc", true),
177    new TestCase("a*b", "aaaab", true),
178    new TestCase("a*b", "b", true),
179    new TestCase("a*b", "aaaac", false),
180    new TestCase(".*?b", "aaaab", true),
181    new TestCase("a*+b", "aaaab", true),
182    new TestCase("a*+b", "b", true),
183    new TestCase("a*+b", "aaaac", false),
184    new TestCase("(?i)foobar", "fOobAr", true),
185    new TestCase("f(?i)oobar", "fOobAr", true),
186    new TestCase("f(?i)oobar", "FOobAr", false),
187    new TestCase("foo(?i)bar", "fOobAr", false),
188    new TestCase("(?i)foo[bar]+", "foObAr", true),
189    new TestCase("(?i)foo[a-r]+", "foObAr", true),
190    new TestCase("abc(?x)blah", "abcblah", true),
191    new TestCase("abc(?x)  blah", "abcblah", true),
192    new TestCase("abc(?x)  blah  blech", "abcblahblech", true),
193    new TestCase("[\\n-#]", "!", true),
194    new TestCase("[\\n-#]", "-", false),
195    new TestCase("[\\043]+", "blahblah#blech", true),
196    new TestCase("[\\042-\\044]+", "blahblah#blech", true),
197    new TestCase("[\\u1234-\\u1236]", "blahblah\u1235blech", true),
198    new TestCase("[^\043]*", "blahblah#blech", true),
199    new TestCase("(|f)?+", "foo", true),
200  };
201}