001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.filter;
019
020import static org.junit.Assert.*;
021
022import java.util.regex.Pattern;
023import org.apache.hadoop.hbase.HBaseClassTestRule;
024import org.apache.hadoop.hbase.filter.RegexStringComparator.EngineType;
025import org.apache.hadoop.hbase.testclassification.FilterTests;
026import org.apache.hadoop.hbase.testclassification.SmallTests;
027import org.apache.hadoop.hbase.util.Bytes;
028import org.junit.ClassRule;
029import org.junit.Test;
030import org.junit.experimental.categories.Category;
031
032@Category({ FilterTests.class, SmallTests.class })
033public class TestRegexComparator {
034
035  @ClassRule
036  public static final HBaseClassTestRule CLASS_RULE =
037    HBaseClassTestRule.forClass(TestRegexComparator.class);
038
039  @Test
040  public void testSerialization() throws Exception {
041    // Default engine is the Java engine
042    RegexStringComparator a = new RegexStringComparator("a|b");
043    RegexStringComparator b = RegexStringComparator.parseFrom(a.toByteArray());
044    assertTrue(a.areSerializedFieldsEqual(b));
045    assertTrue(b.getEngine() instanceof RegexStringComparator.JavaRegexEngine);
046
047    // joni engine
048    a = new RegexStringComparator("a|b", EngineType.JONI);
049    b = RegexStringComparator.parseFrom(a.toByteArray());
050    assertTrue(a.areSerializedFieldsEqual(b));
051    assertTrue(b.getEngine() instanceof RegexStringComparator.JoniRegexEngine);
052  }
053
054  @Test
055  public void testJavaEngine() throws Exception {
056    for (TestCase t : TEST_CASES) {
057      boolean result = new RegexStringComparator(t.regex, t.flags, EngineType.JAVA)
058        .compareTo(Bytes.toBytes(t.haystack)) == 0;
059      assertEquals("Regex '" + t.regex + "' failed test '" + t.haystack + "'", result, t.expected);
060    }
061  }
062
063  @Test
064  public void testJoniEngine() throws Exception {
065    for (TestCase t : TEST_CASES) {
066      boolean result = new RegexStringComparator(t.regex, t.flags, EngineType.JONI)
067        .compareTo(Bytes.toBytes(t.haystack)) == 0;
068      assertEquals("Regex '" + t.regex + "' failed test '" + t.haystack + "'", result, t.expected);
069    }
070  }
071
072  private static class TestCase {
073    String regex;
074    String haystack;
075    int flags;
076    boolean expected;
077
078    public TestCase(String regex, String haystack, boolean expected) {
079      this(regex, Pattern.DOTALL, haystack, expected);
080    }
081
082    public TestCase(String regex, int flags, String haystack, boolean expected) {
083      this.regex = regex;
084      this.flags = flags;
085      this.haystack = haystack;
086      this.expected = expected;
087    }
088  }
089
090  // These are a subset of the regex tests from OpenJDK 7
091  private static TestCase TEST_CASES[] = { new TestCase("a|b", "a", true),
092    new TestCase("a|b", "b", true), new TestCase("a|b", Pattern.CASE_INSENSITIVE, "A", true),
093    new TestCase("a|b", Pattern.CASE_INSENSITIVE, "B", true), new TestCase("a|b", "z", false),
094    new TestCase("a|b|cd", "cd", true), new TestCase("z(a|ac)b", "zacb", true),
095    new TestCase("[abc]+", "ababab", true), new TestCase("[abc]+", "defg", false),
096    new TestCase("[abc]+[def]+[ghi]+", "zzzaaddggzzz", true),
097    new TestCase("[a-\\u4444]+", "za-9z", true), new TestCase("[^abc]+", "ababab", false),
098    new TestCase("[^abc]+", "aaabbbcccdefg", true), new TestCase("[abc^b]", "b", true),
099    new TestCase("[abc[def]]", "b", true), new TestCase("[abc[def]]", "e", true),
100    new TestCase("[a-c[d-f[g-i]]]", "h", true), new TestCase("[a-c[d-f[g-i]]m]", "m", true),
101    new TestCase("[a-c&&[d-f]]", "a", false), new TestCase("[a-c&&[d-f]]", "z", false),
102    new TestCase("[a-m&&m-z&&a-c]", "m", false), new TestCase("[a-m&&m-z&&a-z]", "m", true),
103    new TestCase("[[a-m]&&[^a-c]]", "a", false), new TestCase("[[a-m]&&[^a-c]]", "d", true),
104    new TestCase("[[a-c][d-f]&&abc[def]]", "e", true),
105    new TestCase("[[a-c]&&[b-d]&&[c-e]]", "c", true),
106    new TestCase("[[a-c]&&[b-d][c-e]&&[u-z]]", "c", false),
107    new TestCase("[[a]&&[b][c][a]&&[^d]]", "a", true),
108    new TestCase("[[a]&&[b][c][a]&&[^d]]", "d", false),
109    new TestCase("[[[a-d]&&[c-f]]&&[c]&&c&&[cde]]", "c", true),
110    new TestCase("[x[[wz]abc&&bcd[z]]&&[u-z]]", "z", true), new TestCase("a.c.+", "a#c%&", true),
111    new TestCase("ab.", "ab\n", true), new TestCase("(?s)ab.", "ab\n", true),
112    new TestCase("ab\\wc", "abcc", true), new TestCase("\\W\\w\\W", "#r#", true),
113    new TestCase("\\W\\w\\W", "rrrr#ggg", false), new TestCase("abc[\\sdef]*", "abc  def", true),
114    new TestCase("abc[\\sy-z]*", "abc y z", true),
115    new TestCase("abc[a-d\\sm-p]*", "abcaa mn  p", true),
116    new TestCase("\\s\\s\\s", "blah  err", false), new TestCase("\\S\\S\\s", "blah  err", true),
117    new TestCase("ab\\dc", "ab9c", true), new TestCase("\\d\\d\\d", "blah45", false),
118    new TestCase("^abc", "abcdef", true), new TestCase("^abc", "bcdabc", false),
119    new TestCase("^(a)?a", "a", true), new TestCase("^(aa(bb)?)+$", "aabbaa", true),
120    new TestCase("((a|b)?b)+", "b", true), new TestCase("^(a(b)?)+$", "aba", true),
121    new TestCase("^(a(b(c)?)?)?abc", "abc", true), new TestCase("^(a(b(c))).*", "abc", true),
122    new TestCase("a?b", "aaaab", true), new TestCase("a?b", "aaacc", false),
123    new TestCase("a??b", "aaaab", true), new TestCase("a??b", "aaacc", false),
124    new TestCase("a?+b", "aaaab", true), new TestCase("a?+b", "aaacc", false),
125    new TestCase("a+b", "aaaab", true), new TestCase("a+b", "aaacc", false),
126    new TestCase("a+?b", "aaaab", true), new TestCase("a+?b", "aaacc", false),
127    new TestCase("a++b", "aaaab", true), new TestCase("a++b", "aaacc", false),
128    new TestCase("a{2,3}", "a", false), new TestCase("a{2,3}", "aa", true),
129    new TestCase("a{2,3}", "aaa", true), new TestCase("a{3,}", "zzzaaaazzz", true),
130    new TestCase("a{3,}", "zzzaazzz", false), new TestCase("abc(?=d)", "zzzabcd", true),
131    new TestCase("abc(?=d)", "zzzabced", false), new TestCase("abc(?!d)", "zzabcd", false),
132    new TestCase("abc(?!d)", "zzabced", true), new TestCase("\\w(?<=a)", "###abc###", true),
133    new TestCase("\\w(?<=a)", "###ert###", false), new TestCase("(?<!a)c", "bc", true),
134    new TestCase("(?<!a)c", "ac", false), new TestCase("(a+b)+", "ababab", true),
135    new TestCase("(a+b)+", "accccd", false), new TestCase("(ab)+", "ababab", true),
136    new TestCase("(ab)+", "accccd", false), new TestCase("(ab)(cd*)", "zzzabczzz", true),
137    new TestCase("abc(d)*abc", "abcdddddabc", true), new TestCase("a*b", "aaaab", true),
138    new TestCase("a*b", "b", true), new TestCase("a*b", "aaaac", false),
139    new TestCase(".*?b", "aaaab", true), new TestCase("a*+b", "aaaab", true),
140    new TestCase("a*+b", "b", true), new TestCase("a*+b", "aaaac", false),
141    new TestCase("(?i)foobar", "fOobAr", true), new TestCase("f(?i)oobar", "fOobAr", true),
142    new TestCase("f(?i)oobar", "FOobAr", false), new TestCase("foo(?i)bar", "fOobAr", false),
143    new TestCase("(?i)foo[bar]+", "foObAr", true), new TestCase("(?i)foo[a-r]+", "foObAr", true),
144    new TestCase("abc(?x)blah", "abcblah", true), new TestCase("abc(?x)  blah", "abcblah", true),
145    new TestCase("abc(?x)  blah  blech", "abcblahblech", true), new TestCase("[\\n-#]", "!", true),
146    new TestCase("[\\n-#]", "-", false), new TestCase("[\\043]+", "blahblah#blech", true),
147    new TestCase("[\\042-\\044]+", "blahblah#blech", true),
148    new TestCase("[\\u1234-\\u1236]", "blahblah\u1235blech", true),
149    new TestCase("[^\043]*", "blahblah#blech", true), new TestCase("(|f)?+", "foo", true), };
150}