001/**
002 *
003 * Licensed to the Apache Software Foundation (ASF) under one
004 * or more contributor license agreements.  See the NOTICE file
005 * distributed with this work for additional information
006 * regarding copyright ownership.  The ASF licenses this file
007 * to you under the Apache License, Version 2.0 (the
008 * "License"); you may not use this file except in compliance
009 * with the License.  You may obtain a copy of the License at
010 *
011 *     http://www.apache.org/licenses/LICENSE-2.0
012 *
013 * Unless required by applicable law or agreed to in writing, software
014 * distributed under the License is distributed on an "AS IS" BASIS,
015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016 * See the License for the specific language governing permissions and
017 * limitations under the License.
018 */
019package org.apache.hadoop.hbase.filter;
020
021import java.nio.charset.Charset;
022import java.nio.charset.IllegalCharsetNameException;
023import java.util.Arrays;
024import java.util.regex.Pattern;
025
026import org.apache.yetus.audience.InterfaceAudience;
027import org.apache.hadoop.hbase.exceptions.DeserializationException;
028import org.apache.hadoop.hbase.shaded.protobuf.generated.ComparatorProtos;
029import org.apache.hadoop.hbase.util.Bytes;
030import org.jcodings.Encoding;
031import org.jcodings.EncodingDB;
032import org.jcodings.specific.UTF8Encoding;
033import org.joni.Matcher;
034import org.joni.Option;
035import org.joni.Regex;
036import org.joni.Syntax;
037import org.slf4j.Logger;
038import org.slf4j.LoggerFactory;
039import org.apache.hbase.thirdparty.com.google.protobuf.InvalidProtocolBufferException;
040
041/**
042 * This comparator is for use with {@link CompareFilter} implementations, such
043 * as {@link RowFilter}, {@link QualifierFilter}, and {@link ValueFilter}, for
044 * filtering based on the value of a given column. Use it to test if a given
045 * regular expression matches a cell value in the column.
046 * <p>
047 * Only EQUAL or NOT_EQUAL comparisons are valid with this comparator.
048 * <p>
049 * For example:
050 * <p>
051 * <pre>
052 * ValueFilter vf = new ValueFilter(CompareOp.EQUAL,
053 *     new RegexStringComparator(
054 *       // v4 IP address
055 *       "(((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3,3}" +
056 *         "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))(\\/[0-9]+)?" +
057 *         "|" +
058 *       // v6 IP address
059 *       "((([\\dA-Fa-f]{1,4}:){7}[\\dA-Fa-f]{1,4})(:([\\d]{1,3}.)" +
060 *         "{3}[\\d]{1,3})?)(\\/[0-9]+)?"));
061 * </pre>
062 * <p>
063 * Supports {@link java.util.regex.Pattern} flags as well:
064 * <p>
065 * <pre>
066 * ValueFilter vf = new ValueFilter(CompareOp.EQUAL,
067 *     new RegexStringComparator("regex", Pattern.CASE_INSENSITIVE | Pattern.DOTALL));
068 * </pre>
069 * @see java.util.regex.Pattern
070 */
071@InterfaceAudience.Public
072@SuppressWarnings("ComparableType") // Should this move to Comparator usage?
073public class RegexStringComparator extends ByteArrayComparable {
074
075  private static final Logger LOG = LoggerFactory.getLogger(RegexStringComparator.class);
076
077  private Engine engine;
078
079  /** Engine implementation type (default=JAVA) */
080  @InterfaceAudience.Public
081  public enum EngineType {
082    JAVA,
083    JONI
084  }
085
086  /**
087   * Constructor
088   * Adds Pattern.DOTALL to the underlying Pattern
089   * @param expr a valid regular expression
090   */
091  public RegexStringComparator(String expr) {
092    this(expr, Pattern.DOTALL);
093  }
094
095  /**
096   * Constructor
097   * Adds Pattern.DOTALL to the underlying Pattern
098   * @param expr a valid regular expression
099   * @param engine engine implementation type
100   */
101  public RegexStringComparator(String expr, EngineType engine) {
102    this(expr, Pattern.DOTALL, engine);
103  }
104
105  /**
106   * Constructor
107   * @param expr a valid regular expression
108   * @param flags java.util.regex.Pattern flags
109   */
110  public RegexStringComparator(String expr, int flags) {
111    this(expr, flags, EngineType.JAVA);
112  }
113
114  /**
115   * Constructor
116   * @param expr a valid regular expression
117   * @param flags java.util.regex.Pattern flags
118   * @param engine engine implementation type
119   */
120  public RegexStringComparator(String expr, int flags, EngineType engine) {
121    super(Bytes.toBytes(expr));
122    switch (engine) {
123      case JAVA:
124        this.engine = new JavaRegexEngine(expr, flags);
125        break;
126      case JONI:
127        this.engine = new JoniRegexEngine(expr, flags);
128        break;
129    }
130  }
131
132  /**
133   * Specifies the {@link Charset} to use to convert the row key to a String.
134   * <p>
135   * The row key needs to be converted to a String in order to be matched
136   * against the regular expression.  This method controls which charset is
137   * used to do this conversion.
138   * <p>
139   * If the row key is made of arbitrary bytes, the charset {@code ISO-8859-1}
140   * is recommended.
141   * @param charset The charset to use.
142   */
143  public void setCharset(final Charset charset) {
144    engine.setCharset(charset.name());
145  }
146
147  @Override
148  public int compareTo(byte[] value, int offset, int length) {
149    return engine.compareTo(value, offset, length);
150  }
151
152  /**
153   * @return The comparator serialized using pb
154   */
155  @Override
156  public byte [] toByteArray() {
157    return engine.toByteArray();
158  }
159
160  /**
161   * @param pbBytes A pb serialized {@link RegexStringComparator} instance
162   * @return An instance of {@link RegexStringComparator} made from <code>bytes</code>
163   * @throws DeserializationException
164   * @see #toByteArray
165   */
166  public static RegexStringComparator parseFrom(final byte [] pbBytes)
167  throws DeserializationException {
168    ComparatorProtos.RegexStringComparator proto;
169    try {
170      proto = ComparatorProtos.RegexStringComparator.parseFrom(pbBytes);
171    } catch (InvalidProtocolBufferException e) {
172      throw new DeserializationException(e);
173    }
174    RegexStringComparator comparator;
175    if (proto.hasEngine()) {
176      EngineType engine = EngineType.valueOf(proto.getEngine());
177      comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags(),
178        engine);
179    } else {
180      comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags());
181    }
182    String charset = proto.getCharset();
183    if (charset.length() > 0) {
184      try {
185        comparator.getEngine().setCharset(charset);
186      } catch (IllegalCharsetNameException e) {
187        LOG.error("invalid charset", e);
188      }
189    }
190    return comparator;
191  }
192
193  /**
194   * @param other
195   * @return true if and only if the fields of the comparator that are serialized
196   * are equal to the corresponding fields in other.  Used for testing.
197   */
198  @Override
199  boolean areSerializedFieldsEqual(ByteArrayComparable other) {
200    if (other == this) return true;
201    if (!(other instanceof RegexStringComparator)) return false;
202    RegexStringComparator comparator = (RegexStringComparator)other;
203    return super.areSerializedFieldsEqual(comparator)
204      && engine.getClass().isInstance(comparator.getEngine())
205      && engine.getPattern().equals(comparator.getEngine().getPattern())
206      && engine.getFlags() == comparator.getEngine().getFlags()
207      && engine.getCharset().equals(comparator.getEngine().getCharset());
208  }
209
210  Engine getEngine() {
211    return engine;
212  }
213
214  /**
215   * This is an internal interface for abstracting access to different regular
216   * expression matching engines.
217   */
218  static interface Engine {
219    /**
220     * Returns the string representation of the configured regular expression
221     * for matching
222     */
223    String getPattern();
224
225    /**
226     * Returns the set of configured match flags, a bit mask that may include
227     * {@link Pattern} flags
228     */
229    int getFlags();
230
231    /**
232     * Returns the name of the configured charset
233     */
234    String getCharset();
235
236    /**
237     * Set the charset used when matching
238     * @param charset the name of the desired charset for matching
239     */
240    void setCharset(final String charset);
241
242    /**
243     * Return the serialized form of the configured matcher
244     */
245    byte [] toByteArray();
246
247    /**
248     * Match the given input against the configured pattern
249     * @param value the data to be matched
250     * @param offset offset of the data to be matched
251     * @param length length of the data to be matched
252     * @return 0 if a match was made, 1 otherwise
253     */
254    int compareTo(byte[] value, int offset, int length);
255  }
256
257  /**
258   * Implementation of the Engine interface using Java's Pattern.
259   * <p>
260   * This is the default engine.
261   */
262  static class JavaRegexEngine implements Engine {
263    private Charset charset = Charset.forName("UTF-8");
264    private Pattern pattern;
265
266    public JavaRegexEngine(String regex, int flags) {
267      this.pattern = Pattern.compile(regex, flags);
268    }
269
270    @Override
271    public String getPattern() {
272      return pattern.toString();
273    }
274
275    @Override
276    public int getFlags() {
277      return pattern.flags();
278    }
279
280    @Override
281    public String getCharset() {
282      return charset.name();
283    }
284
285    @Override
286    public void setCharset(String charset) {
287      this.charset = Charset.forName(charset);
288    }
289
290    @Override
291    public int compareTo(byte[] value, int offset, int length) {
292      // Use find() for subsequence match instead of matches() (full sequence
293      // match) to adhere to the principle of least surprise.
294      String tmp;
295      if (length < value.length / 2) {
296        // See HBASE-9428. Make a copy of the relevant part of the byte[],
297        // or the JDK will copy the entire byte[] during String decode
298        tmp = new String(Arrays.copyOfRange(value, offset, offset + length), charset);
299      } else {
300        tmp = new String(value, offset, length, charset);
301      }
302      return pattern.matcher(tmp).find() ? 0 : 1;
303    }
304
305    @Override
306    public byte[] toByteArray() {
307      ComparatorProtos.RegexStringComparator.Builder builder =
308          ComparatorProtos.RegexStringComparator.newBuilder();
309      builder.setPattern(pattern.pattern());
310      builder.setPatternFlags(pattern.flags());
311      builder.setCharset(charset.name());
312      builder.setEngine(EngineType.JAVA.name());
313      return builder.build().toByteArray();
314    }
315  }
316
317  /**
318   * Implementation of the Engine interface using Jruby's joni regex engine.
319   * <p>
320   * This engine operates on byte arrays directly so is expected to be more GC
321   * friendly, and reportedly is twice as fast as Java's Pattern engine.
322   * <p>
323   * NOTE: Only the {@link Pattern} flags CASE_INSENSITIVE, DOTALL, and
324   * MULTILINE are supported.
325   */
326  static class JoniRegexEngine implements Engine {
327    private Encoding encoding = UTF8Encoding.INSTANCE;
328    private String regex;
329    private Regex pattern;
330
331    public JoniRegexEngine(String regex, int flags) {
332      this.regex = regex;
333      byte[] b = Bytes.toBytes(regex);
334      this.pattern = new Regex(b, 0, b.length, patternToJoniFlags(flags), encoding, Syntax.Java);
335    }
336
337    @Override
338    public String getPattern() {
339      return regex;
340    }
341
342    @Override
343    public int getFlags() {
344      return pattern.getOptions();
345    }
346
347    @Override
348    public String getCharset() {
349      return encoding.getCharsetName();
350    }
351
352    @Override
353    public void setCharset(String name) {
354      setEncoding(name);
355    }
356
357    @Override
358    public int compareTo(byte[] value, int offset, int length) {
359      // Use subsequence match instead of full sequence match to adhere to the
360      // principle of least surprise.
361      Matcher m = pattern.matcher(value);
362      return m.search(offset, length, pattern.getOptions()) < 0 ? 1 : 0;
363    }
364
365    @Override
366    public byte[] toByteArray() {
367      ComparatorProtos.RegexStringComparator.Builder builder =
368          ComparatorProtos.RegexStringComparator.newBuilder();
369        builder.setPattern(regex);
370        builder.setPatternFlags(joniToPatternFlags(pattern.getOptions()));
371        builder.setCharset(encoding.getCharsetName());
372        builder.setEngine(EngineType.JONI.name());
373        return builder.build().toByteArray();
374    }
375
376    private int patternToJoniFlags(int flags) {
377      int newFlags = 0;
378      if ((flags & Pattern.CASE_INSENSITIVE) != 0) {
379        newFlags |= Option.IGNORECASE;
380      }
381      if ((flags & Pattern.DOTALL) != 0) {
382        // This does NOT mean Pattern.MULTILINE
383        newFlags |= Option.MULTILINE;
384      }
385      if ((flags & Pattern.MULTILINE) != 0) {
386        // This is what Java 8's Nashorn engine does when using joni and
387        // translating Pattern's MULTILINE flag
388        newFlags &= ~Option.SINGLELINE;
389        newFlags |= Option.NEGATE_SINGLELINE;
390      }
391      return newFlags;
392    }
393
394    private int joniToPatternFlags(int flags) {
395      int newFlags = 0;
396      if ((flags & Option.IGNORECASE) != 0) {
397        newFlags |= Pattern.CASE_INSENSITIVE;
398      }
399      // This does NOT mean Pattern.MULTILINE, this is equivalent to Pattern.DOTALL
400      if ((flags & Option.MULTILINE) != 0) {
401        newFlags |= Pattern.DOTALL;
402      }
403      // This means Pattern.MULTILINE. Nice
404      if ((flags & Option.NEGATE_SINGLELINE) != 0) {
405        newFlags |= Pattern.MULTILINE;
406      }
407      return newFlags;
408    }
409
410    private void setEncoding(String name) {
411      EncodingDB.Entry e = EncodingDB.getEncodings().get(Bytes.toBytes(name));
412      if (e != null) {
413        encoding = e.getEncoding();
414      } else {
415        throw new IllegalCharsetNameException(name);
416      }
417    }
418  }
419}