001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.filter;
019
020import java.nio.charset.Charset;
021import java.nio.charset.IllegalCharsetNameException;
022import java.util.Arrays;
023import java.util.regex.Pattern;
024import org.apache.hadoop.hbase.exceptions.DeserializationException;
025import org.apache.hadoop.hbase.util.Bytes;
026import org.apache.yetus.audience.InterfaceAudience;
027import org.jcodings.Encoding;
028import org.jcodings.EncodingDB;
029import org.jcodings.specific.NonStrictUTF8Encoding;
030import org.joni.Matcher;
031import org.joni.Option;
032import org.joni.Regex;
033import org.joni.Syntax;
034import org.slf4j.Logger;
035import org.slf4j.LoggerFactory;
036
037import org.apache.hbase.thirdparty.com.google.protobuf.InvalidProtocolBufferException;
038
039import org.apache.hadoop.hbase.shaded.protobuf.generated.ComparatorProtos;
040
041/**
042 * This comparator is for use with {@link CompareFilter} implementations, such as {@link RowFilter},
043 * {@link QualifierFilter}, and {@link ValueFilter}, for filtering based on the value of a given
044 * column. Use it to test if a given regular expression matches a cell value in the column.
045 * <p>
046 * Only EQUAL or NOT_EQUAL comparisons are valid with this comparator.
047 * <p>
048 * For example:
049 * <p>
050 *
051 * <pre>
052 * ValueFilter vf = new ValueFilter(CompareOp.EQUAL, new RegexStringComparator(
053 *   // v4 IP address
054 *   "(((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3,3}"
055 *     + "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))(\\/[0-9]+)?" + "|" +
056 *     // v6 IP address
057 *     "((([\\dA-Fa-f]{1,4}:){7}[\\dA-Fa-f]{1,4})(:([\\d]{1,3}.)"
058 *     + "{3}[\\d]{1,3})?)(\\/[0-9]+)?"));
059 * </pre>
060 * <p>
061 * Supports {@link java.util.regex.Pattern} flags as well:
062 * <p>
063 *
064 * <pre>
065 * ValueFilter vf = new ValueFilter(CompareOp.EQUAL,
066 *   new RegexStringComparator("regex", Pattern.CASE_INSENSITIVE | Pattern.DOTALL));
067 * </pre>
068 *
069 * @see java.util.regex.Pattern
070 */
071@InterfaceAudience.Public
072@SuppressWarnings("ComparableType") // Should this move to Comparator usage?
073public class RegexStringComparator extends ByteArrayComparable {
074
075  private static final Logger LOG = LoggerFactory.getLogger(RegexStringComparator.class);
076
077  private Engine engine;
078
079  /** Engine implementation type (default=JAVA) */
080  @InterfaceAudience.Public
081  public enum EngineType {
082    JAVA,
083    JONI
084  }
085
086  /**
087   * Constructor Adds Pattern.DOTALL to the underlying Pattern
088   * @param expr a valid regular expression
089   */
090  public RegexStringComparator(String expr) {
091    this(expr, Pattern.DOTALL);
092  }
093
094  /**
095   * Constructor Adds Pattern.DOTALL to the underlying Pattern
096   * @param expr   a valid regular expression
097   * @param engine engine implementation type
098   */
099  public RegexStringComparator(String expr, EngineType engine) {
100    this(expr, Pattern.DOTALL, engine);
101  }
102
103  /**
104   * Constructor
105   * @param expr  a valid regular expression
106   * @param flags java.util.regex.Pattern flags
107   */
108  public RegexStringComparator(String expr, int flags) {
109    this(expr, flags, EngineType.JAVA);
110  }
111
112  /**
113   * Constructor
114   * @param expr   a valid regular expression
115   * @param flags  java.util.regex.Pattern flags
116   * @param engine engine implementation type
117   */
118  public RegexStringComparator(String expr, int flags, EngineType engine) {
119    super(Bytes.toBytes(expr));
120    switch (engine) {
121      case JAVA:
122        this.engine = new JavaRegexEngine(expr, flags);
123        break;
124      case JONI:
125        this.engine = new JoniRegexEngine(expr, flags);
126        break;
127    }
128  }
129
130  /**
131   * Specifies the {@link Charset} to use to convert the row key to a String.
132   * <p>
133   * The row key needs to be converted to a String in order to be matched against the regular
134   * expression. This method controls which charset is used to do this conversion.
135   * <p>
136   * If the row key is made of arbitrary bytes, the charset {@code ISO-8859-1} is recommended.
137   * @param charset The charset to use.
138   */
139  public void setCharset(final Charset charset) {
140    engine.setCharset(charset.name());
141  }
142
143  @Override
144  public int compareTo(byte[] value, int offset, int length) {
145    return engine.compareTo(value, offset, length);
146  }
147
148  /** Returns The comparator serialized using pb */
149  @Override
150  public byte[] toByteArray() {
151    return engine.toByteArray();
152  }
153
154  /**
155   * Parse a serialized representation of {@link RegexStringComparator}
156   * @param pbBytes A pb serialized {@link RegexStringComparator} instance
157   * @return An instance of {@link RegexStringComparator} made from <code>bytes</code>
158   * @throws DeserializationException if an error occurred
159   * @see #toByteArray
160   */
161  public static RegexStringComparator parseFrom(final byte[] pbBytes)
162    throws DeserializationException {
163    ComparatorProtos.RegexStringComparator proto;
164    try {
165      proto = ComparatorProtos.RegexStringComparator.parseFrom(pbBytes);
166    } catch (InvalidProtocolBufferException e) {
167      throw new DeserializationException(e);
168    }
169    RegexStringComparator comparator;
170    if (proto.hasEngine()) {
171      EngineType engine = EngineType.valueOf(proto.getEngine());
172      comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags(), engine);
173    } else {
174      comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags());
175    }
176    String charset = proto.getCharset();
177    if (charset.length() > 0) {
178      try {
179        comparator.getEngine().setCharset(charset);
180      } catch (IllegalCharsetNameException e) {
181        LOG.error("invalid charset", e);
182      }
183    }
184    return comparator;
185  }
186
187  /**
188   * Returns true if and only if the fields of the comparator that are serialized are equal to the
189   * corresponding fields in other. Used for testing.
190   */
191  @Override
192  boolean areSerializedFieldsEqual(ByteArrayComparable other) {
193    if (other == this) {
194      return true;
195    }
196    if (!(other instanceof RegexStringComparator)) {
197      return false;
198    }
199    RegexStringComparator comparator = (RegexStringComparator) other;
200    return super.areSerializedFieldsEqual(comparator)
201      && engine.getClass().isInstance(comparator.getEngine())
202      && engine.getPattern().equals(comparator.getEngine().getPattern())
203      && engine.getFlags() == comparator.getEngine().getFlags()
204      && engine.getCharset().equals(comparator.getEngine().getCharset());
205  }
206
207  Engine getEngine() {
208    return engine;
209  }
210
211  /**
212   * This is an internal interface for abstracting access to different regular expression matching
213   * engines.
214   */
215  static interface Engine {
216    /**
217     * Returns the string representation of the configured regular expression for matching
218     */
219    String getPattern();
220
221    /**
222     * Returns the set of configured match flags, a bit mask that may include {@link Pattern} flags
223     */
224    int getFlags();
225
226    /**
227     * Returns the name of the configured charset
228     */
229    String getCharset();
230
231    /**
232     * Set the charset used when matching
233     * @param charset the name of the desired charset for matching
234     */
235    void setCharset(final String charset);
236
237    /**
238     * Return the serialized form of the configured matcher
239     */
240    byte[] toByteArray();
241
242    /**
243     * Match the given input against the configured pattern
244     * @param value  the data to be matched
245     * @param offset offset of the data to be matched
246     * @param length length of the data to be matched
247     * @return 0 if a match was made, 1 otherwise
248     */
249    int compareTo(byte[] value, int offset, int length);
250  }
251
252  /**
253   * Implementation of the Engine interface using Java's Pattern.
254   * <p>
255   * This is the default engine.
256   */
257  static class JavaRegexEngine implements Engine {
258    private Charset charset = Charset.forName("UTF-8");
259    private Pattern pattern;
260
261    public JavaRegexEngine(String regex, int flags) {
262      this.pattern = Pattern.compile(regex, flags);
263    }
264
265    @Override
266    public String getPattern() {
267      return pattern.toString();
268    }
269
270    @Override
271    public int getFlags() {
272      return pattern.flags();
273    }
274
275    @Override
276    public String getCharset() {
277      return charset.name();
278    }
279
280    @Override
281    public void setCharset(String charset) {
282      this.charset = Charset.forName(charset);
283    }
284
285    @Override
286    public int compareTo(byte[] value, int offset, int length) {
287      // Use find() for subsequence match instead of matches() (full sequence
288      // match) to adhere to the principle of least surprise.
289      String tmp;
290      if (length < value.length / 2) {
291        // See HBASE-9428. Make a copy of the relevant part of the byte[],
292        // or the JDK will copy the entire byte[] during String decode
293        tmp = new String(Arrays.copyOfRange(value, offset, offset + length), charset);
294      } else {
295        tmp = new String(value, offset, length, charset);
296      }
297      return pattern.matcher(tmp).find() ? 0 : 1;
298    }
299
300    @Override
301    public byte[] toByteArray() {
302      ComparatorProtos.RegexStringComparator.Builder builder =
303        ComparatorProtos.RegexStringComparator.newBuilder();
304      builder.setPattern(pattern.pattern());
305      builder.setPatternFlags(pattern.flags());
306      builder.setCharset(charset.name());
307      builder.setEngine(EngineType.JAVA.name());
308      return builder.build().toByteArray();
309    }
310  }
311
312  /**
313   * Implementation of the Engine interface using Jruby's joni regex engine.
314   * <p>
315   * This engine operates on byte arrays directly so is expected to be more GC friendly, and
316   * reportedly is twice as fast as Java's Pattern engine.
317   * <p>
318   * NOTE: Only the {@link Pattern} flags CASE_INSENSITIVE, DOTALL, and MULTILINE are supported.
319   */
320  static class JoniRegexEngine implements Engine {
321    // When using UTF8Encoding, an infinite loop can occur if an invalid UTF8 is encountered.
322    // Use NonStrictUTF8Encoding instead of UTF8Encoding to avoid the issue.
323    private Encoding encoding = NonStrictUTF8Encoding.INSTANCE;
324    private String regex;
325    private Regex pattern;
326
327    public JoniRegexEngine(String regex, int flags) {
328      this.regex = regex;
329      byte[] b = Bytes.toBytes(regex);
330      this.pattern = new Regex(b, 0, b.length, patternToJoniFlags(flags), encoding, Syntax.Java);
331    }
332
333    @Override
334    public String getPattern() {
335      return regex;
336    }
337
338    @Override
339    public int getFlags() {
340      return pattern.getOptions();
341    }
342
343    @Override
344    public String getCharset() {
345      return encoding.getCharsetName();
346    }
347
348    @Override
349    public void setCharset(String name) {
350      setEncoding(name);
351    }
352
353    @Override
354    public int compareTo(byte[] value, int offset, int length) {
355      // Use subsequence match instead of full sequence match to adhere to the
356      // principle of least surprise.
357      Matcher m = pattern.matcher(value);
358      return m.search(offset, length, pattern.getOptions()) < 0 ? 1 : 0;
359    }
360
361    @Override
362    public byte[] toByteArray() {
363      ComparatorProtos.RegexStringComparator.Builder builder =
364        ComparatorProtos.RegexStringComparator.newBuilder();
365      builder.setPattern(regex);
366      builder.setPatternFlags(joniToPatternFlags(pattern.getOptions()));
367      builder.setCharset(encoding.getCharsetName());
368      builder.setEngine(EngineType.JONI.name());
369      return builder.build().toByteArray();
370    }
371
372    private int patternToJoniFlags(int flags) {
373      int newFlags = 0;
374      if ((flags & Pattern.CASE_INSENSITIVE) != 0) {
375        newFlags |= Option.IGNORECASE;
376      }
377      if ((flags & Pattern.DOTALL) != 0) {
378        // This does NOT mean Pattern.MULTILINE
379        newFlags |= Option.MULTILINE;
380      }
381      if ((flags & Pattern.MULTILINE) != 0) {
382        // This is what Java 8's Nashorn engine does when using joni and
383        // translating Pattern's MULTILINE flag
384        newFlags &= ~Option.SINGLELINE;
385        newFlags |= Option.NEGATE_SINGLELINE;
386      }
387      return newFlags;
388    }
389
390    private int joniToPatternFlags(int flags) {
391      int newFlags = 0;
392      if ((flags & Option.IGNORECASE) != 0) {
393        newFlags |= Pattern.CASE_INSENSITIVE;
394      }
395      // This does NOT mean Pattern.MULTILINE, this is equivalent to Pattern.DOTALL
396      if ((flags & Option.MULTILINE) != 0) {
397        newFlags |= Pattern.DOTALL;
398      }
399      // This means Pattern.MULTILINE. Nice
400      if ((flags & Option.NEGATE_SINGLELINE) != 0) {
401        newFlags |= Pattern.MULTILINE;
402      }
403      return newFlags;
404    }
405
406    private void setEncoding(String name) {
407      EncodingDB.Entry e = EncodingDB.getEncodings().get(Bytes.toBytes(name));
408      if (e != null) {
409        encoding = e.getEncoding();
410      } else {
411        throw new IllegalCharsetNameException(name);
412      }
413    }
414  }
415}