001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.filter;
019
020import java.nio.charset.Charset;
021import java.nio.charset.IllegalCharsetNameException;
022import java.util.Arrays;
023import java.util.regex.Pattern;
024import org.apache.hadoop.hbase.exceptions.DeserializationException;
025import org.apache.hadoop.hbase.util.Bytes;
026import org.apache.yetus.audience.InterfaceAudience;
027import org.jcodings.Encoding;
028import org.jcodings.EncodingDB;
029import org.jcodings.specific.UTF8Encoding;
030import org.joni.Matcher;
031import org.joni.Option;
032import org.joni.Regex;
033import org.joni.Syntax;
034import org.slf4j.Logger;
035import org.slf4j.LoggerFactory;
036
037import org.apache.hbase.thirdparty.com.google.protobuf.InvalidProtocolBufferException;
038
039import org.apache.hadoop.hbase.shaded.protobuf.generated.ComparatorProtos;
040
041/**
042 * This comparator is for use with {@link CompareFilter} implementations, such as {@link RowFilter},
043 * {@link QualifierFilter}, and {@link ValueFilter}, for filtering based on the value of a given
044 * column. Use it to test if a given regular expression matches a cell value in the column.
045 * <p>
046 * Only EQUAL or NOT_EQUAL comparisons are valid with this comparator.
047 * <p>
048 * For example:
049 * <p>
050 *
051 * <pre>
052 * ValueFilter vf = new ValueFilter(CompareOp.EQUAL, new RegexStringComparator(
053 *   // v4 IP address
054 *   "(((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3,3}"
055 *     + "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))(\\/[0-9]+)?" + "|" +
056 *     // v6 IP address
057 *     "((([\\dA-Fa-f]{1,4}:){7}[\\dA-Fa-f]{1,4})(:([\\d]{1,3}.)"
058 *     + "{3}[\\d]{1,3})?)(\\/[0-9]+)?"));
059 * </pre>
060 * <p>
061 * Supports {@link java.util.regex.Pattern} flags as well:
062 * <p>
063 *
064 * <pre>
065 * ValueFilter vf = new ValueFilter(CompareOp.EQUAL,
066 *   new RegexStringComparator("regex", Pattern.CASE_INSENSITIVE | Pattern.DOTALL));
067 * </pre>
068 *
069 * @see java.util.regex.Pattern
070 */
071@InterfaceAudience.Public
072@SuppressWarnings("ComparableType") // Should this move to Comparator usage?
073public class RegexStringComparator extends ByteArrayComparable {
074
075  private static final Logger LOG = LoggerFactory.getLogger(RegexStringComparator.class);
076
077  private Engine engine;
078
079  /** Engine implementation type (default=JAVA) */
080  @InterfaceAudience.Public
081  public enum EngineType {
082    JAVA,
083    JONI
084  }
085
086  /**
087   * Constructor Adds Pattern.DOTALL to the underlying Pattern
088   * @param expr a valid regular expression
089   */
090  public RegexStringComparator(String expr) {
091    this(expr, Pattern.DOTALL);
092  }
093
094  /**
095   * Constructor Adds Pattern.DOTALL to the underlying Pattern
096   * @param expr   a valid regular expression
097   * @param engine engine implementation type
098   */
099  public RegexStringComparator(String expr, EngineType engine) {
100    this(expr, Pattern.DOTALL, engine);
101  }
102
103  /**
104   * Constructor
105   * @param expr  a valid regular expression
106   * @param flags java.util.regex.Pattern flags
107   */
108  public RegexStringComparator(String expr, int flags) {
109    this(expr, flags, EngineType.JAVA);
110  }
111
112  /**
113   * Constructor
114   * @param expr   a valid regular expression
115   * @param flags  java.util.regex.Pattern flags
116   * @param engine engine implementation type
117   */
118  public RegexStringComparator(String expr, int flags, EngineType engine) {
119    super(Bytes.toBytes(expr));
120    switch (engine) {
121      case JAVA:
122        this.engine = new JavaRegexEngine(expr, flags);
123        break;
124      case JONI:
125        this.engine = new JoniRegexEngine(expr, flags);
126        break;
127    }
128  }
129
130  /**
131   * Specifies the {@link Charset} to use to convert the row key to a String.
132   * <p>
133   * The row key needs to be converted to a String in order to be matched against the regular
134   * expression. This method controls which charset is used to do this conversion.
135   * <p>
136   * If the row key is made of arbitrary bytes, the charset {@code ISO-8859-1} is recommended.
137   * @param charset The charset to use.
138   */
139  public void setCharset(final Charset charset) {
140    engine.setCharset(charset.name());
141  }
142
143  @Override
144  public int compareTo(byte[] value, int offset, int length) {
145    return engine.compareTo(value, offset, length);
146  }
147
148  /**
149   * @return The comparator serialized using pb
150   */
151  @Override
152  public byte[] toByteArray() {
153    return engine.toByteArray();
154  }
155
156  /**
157   * @param pbBytes A pb serialized {@link RegexStringComparator} instance
158   * @return An instance of {@link RegexStringComparator} made from <code>bytes</code> n * @see
159   *         #toByteArray
160   */
161  public static RegexStringComparator parseFrom(final byte[] pbBytes)
162    throws DeserializationException {
163    ComparatorProtos.RegexStringComparator proto;
164    try {
165      proto = ComparatorProtos.RegexStringComparator.parseFrom(pbBytes);
166    } catch (InvalidProtocolBufferException e) {
167      throw new DeserializationException(e);
168    }
169    RegexStringComparator comparator;
170    if (proto.hasEngine()) {
171      EngineType engine = EngineType.valueOf(proto.getEngine());
172      comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags(), engine);
173    } else {
174      comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags());
175    }
176    String charset = proto.getCharset();
177    if (charset.length() > 0) {
178      try {
179        comparator.getEngine().setCharset(charset);
180      } catch (IllegalCharsetNameException e) {
181        LOG.error("invalid charset", e);
182      }
183    }
184    return comparator;
185  }
186
187  /**
188   * n * @return true if and only if the fields of the comparator that are serialized are equal to
189   * the corresponding fields in other. Used for testing.
190   */
191  @Override
192  boolean areSerializedFieldsEqual(ByteArrayComparable other) {
193    if (other == this) return true;
194    if (!(other instanceof RegexStringComparator)) return false;
195    RegexStringComparator comparator = (RegexStringComparator) other;
196    return super.areSerializedFieldsEqual(comparator)
197      && engine.getClass().isInstance(comparator.getEngine())
198      && engine.getPattern().equals(comparator.getEngine().getPattern())
199      && engine.getFlags() == comparator.getEngine().getFlags()
200      && engine.getCharset().equals(comparator.getEngine().getCharset());
201  }
202
203  Engine getEngine() {
204    return engine;
205  }
206
207  /**
208   * This is an internal interface for abstracting access to different regular expression matching
209   * engines.
210   */
211  static interface Engine {
212    /**
213     * Returns the string representation of the configured regular expression for matching
214     */
215    String getPattern();
216
217    /**
218     * Returns the set of configured match flags, a bit mask that may include {@link Pattern} flags
219     */
220    int getFlags();
221
222    /**
223     * Returns the name of the configured charset
224     */
225    String getCharset();
226
227    /**
228     * Set the charset used when matching
229     * @param charset the name of the desired charset for matching
230     */
231    void setCharset(final String charset);
232
233    /**
234     * Return the serialized form of the configured matcher
235     */
236    byte[] toByteArray();
237
238    /**
239     * Match the given input against the configured pattern
240     * @param value  the data to be matched
241     * @param offset offset of the data to be matched
242     * @param length length of the data to be matched
243     * @return 0 if a match was made, 1 otherwise
244     */
245    int compareTo(byte[] value, int offset, int length);
246  }
247
248  /**
249   * Implementation of the Engine interface using Java's Pattern.
250   * <p>
251   * This is the default engine.
252   */
253  static class JavaRegexEngine implements Engine {
254    private Charset charset = Charset.forName("UTF-8");
255    private Pattern pattern;
256
257    public JavaRegexEngine(String regex, int flags) {
258      this.pattern = Pattern.compile(regex, flags);
259    }
260
261    @Override
262    public String getPattern() {
263      return pattern.toString();
264    }
265
266    @Override
267    public int getFlags() {
268      return pattern.flags();
269    }
270
271    @Override
272    public String getCharset() {
273      return charset.name();
274    }
275
276    @Override
277    public void setCharset(String charset) {
278      this.charset = Charset.forName(charset);
279    }
280
281    @Override
282    public int compareTo(byte[] value, int offset, int length) {
283      // Use find() for subsequence match instead of matches() (full sequence
284      // match) to adhere to the principle of least surprise.
285      String tmp;
286      if (length < value.length / 2) {
287        // See HBASE-9428. Make a copy of the relevant part of the byte[],
288        // or the JDK will copy the entire byte[] during String decode
289        tmp = new String(Arrays.copyOfRange(value, offset, offset + length), charset);
290      } else {
291        tmp = new String(value, offset, length, charset);
292      }
293      return pattern.matcher(tmp).find() ? 0 : 1;
294    }
295
296    @Override
297    public byte[] toByteArray() {
298      ComparatorProtos.RegexStringComparator.Builder builder =
299        ComparatorProtos.RegexStringComparator.newBuilder();
300      builder.setPattern(pattern.pattern());
301      builder.setPatternFlags(pattern.flags());
302      builder.setCharset(charset.name());
303      builder.setEngine(EngineType.JAVA.name());
304      return builder.build().toByteArray();
305    }
306  }
307
308  /**
309   * Implementation of the Engine interface using Jruby's joni regex engine.
310   * <p>
311   * This engine operates on byte arrays directly so is expected to be more GC friendly, and
312   * reportedly is twice as fast as Java's Pattern engine.
313   * <p>
314   * NOTE: Only the {@link Pattern} flags CASE_INSENSITIVE, DOTALL, and MULTILINE are supported.
315   */
316  static class JoniRegexEngine implements Engine {
317    private Encoding encoding = UTF8Encoding.INSTANCE;
318    private String regex;
319    private Regex pattern;
320
321    public JoniRegexEngine(String regex, int flags) {
322      this.regex = regex;
323      byte[] b = Bytes.toBytes(regex);
324      this.pattern = new Regex(b, 0, b.length, patternToJoniFlags(flags), encoding, Syntax.Java);
325    }
326
327    @Override
328    public String getPattern() {
329      return regex;
330    }
331
332    @Override
333    public int getFlags() {
334      return pattern.getOptions();
335    }
336
337    @Override
338    public String getCharset() {
339      return encoding.getCharsetName();
340    }
341
342    @Override
343    public void setCharset(String name) {
344      setEncoding(name);
345    }
346
347    @Override
348    public int compareTo(byte[] value, int offset, int length) {
349      // Use subsequence match instead of full sequence match to adhere to the
350      // principle of least surprise.
351      Matcher m = pattern.matcher(value);
352      return m.search(offset, length, pattern.getOptions()) < 0 ? 1 : 0;
353    }
354
355    @Override
356    public byte[] toByteArray() {
357      ComparatorProtos.RegexStringComparator.Builder builder =
358        ComparatorProtos.RegexStringComparator.newBuilder();
359      builder.setPattern(regex);
360      builder.setPatternFlags(joniToPatternFlags(pattern.getOptions()));
361      builder.setCharset(encoding.getCharsetName());
362      builder.setEngine(EngineType.JONI.name());
363      return builder.build().toByteArray();
364    }
365
366    private int patternToJoniFlags(int flags) {
367      int newFlags = 0;
368      if ((flags & Pattern.CASE_INSENSITIVE) != 0) {
369        newFlags |= Option.IGNORECASE;
370      }
371      if ((flags & Pattern.DOTALL) != 0) {
372        // This does NOT mean Pattern.MULTILINE
373        newFlags |= Option.MULTILINE;
374      }
375      if ((flags & Pattern.MULTILINE) != 0) {
376        // This is what Java 8's Nashorn engine does when using joni and
377        // translating Pattern's MULTILINE flag
378        newFlags &= ~Option.SINGLELINE;
379        newFlags |= Option.NEGATE_SINGLELINE;
380      }
381      return newFlags;
382    }
383
384    private int joniToPatternFlags(int flags) {
385      int newFlags = 0;
386      if ((flags & Option.IGNORECASE) != 0) {
387        newFlags |= Pattern.CASE_INSENSITIVE;
388      }
389      // This does NOT mean Pattern.MULTILINE, this is equivalent to Pattern.DOTALL
390      if ((flags & Option.MULTILINE) != 0) {
391        newFlags |= Pattern.DOTALL;
392      }
393      // This means Pattern.MULTILINE. Nice
394      if ((flags & Option.NEGATE_SINGLELINE) != 0) {
395        newFlags |= Pattern.MULTILINE;
396      }
397      return newFlags;
398    }
399
400    private void setEncoding(String name) {
401      EncodingDB.Entry e = EncodingDB.getEncodings().get(Bytes.toBytes(name));
402      if (e != null) {
403        encoding = e.getEncoding();
404      } else {
405        throw new IllegalCharsetNameException(name);
406      }
407    }
408  }
409}