View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.filter;
20  
21  import java.nio.charset.Charset;
22  import java.nio.charset.IllegalCharsetNameException;
23  import java.util.Arrays;
24  import java.util.regex.Pattern;
25  
26  import org.apache.commons.logging.Log;
27  import org.apache.commons.logging.LogFactory;
28  import org.apache.hadoop.hbase.classification.InterfaceAudience;
29  import org.apache.hadoop.hbase.classification.InterfaceStability;
30  import org.apache.hadoop.hbase.exceptions.DeserializationException;
31  import org.apache.hadoop.hbase.protobuf.generated.ComparatorProtos;
32  import org.apache.hadoop.hbase.util.Bytes;
33  import org.jcodings.Encoding;
34  import org.jcodings.EncodingDB;
35  import org.jcodings.specific.UTF8Encoding;
36  import org.joni.Matcher;
37  import org.joni.Option;
38  import org.joni.Regex;
39  import org.joni.Syntax;
40  
41  import com.google.protobuf.InvalidProtocolBufferException;
42  
43  /**
44   * This comparator is for use with {@link CompareFilter} implementations, such
45   * as {@link RowFilter}, {@link QualifierFilter}, and {@link ValueFilter}, for
46   * filtering based on the value of a given column. Use it to test if a given
47   * regular expression matches a cell value in the column.
48   * <p>
49   * Only EQUAL or NOT_EQUAL comparisons are valid with this comparator.
50   * <p>
51   * For example:
52   * <p>
53   * <pre>
54   * ValueFilter vf = new ValueFilter(CompareOp.EQUAL,
55   *     new RegexStringComparator(
56   *       // v4 IP address
57   *       "(((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3,3}" +
58   *         "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))(\\/[0-9]+)?" +
59   *         "|" +
60   *       // v6 IP address
61   *       "((([\\dA-Fa-f]{1,4}:){7}[\\dA-Fa-f]{1,4})(:([\\d]{1,3}.)" +
62   *         "{3}[\\d]{1,3})?)(\\/[0-9]+)?"));
63   * </pre>
64   * <p>
65   * Supports {@link java.util.regex.Pattern} flags as well:
66   * <p>
67   * <pre>
68   * ValueFilter vf = new ValueFilter(CompareOp.EQUAL,
69   *     new RegexStringComparator("regex", Pattern.CASE_INSENSITIVE | Pattern.DOTALL));
70   * </pre>
71   * @see java.util.regex.Pattern
72   */
73  @InterfaceAudience.Public
74  @InterfaceStability.Stable
75  public class RegexStringComparator extends ByteArrayComparable {
76  
77    private static final Log LOG = LogFactory.getLog(RegexStringComparator.class);
78  
79    private Engine engine;
80  
81    /** Engine implementation type (default=JAVA) */
82    @InterfaceAudience.Public
83    @InterfaceStability.Stable
84    public enum EngineType {
85      JAVA,
86      JONI
87    }
88  
89    /**
90     * Constructor
91     * Adds Pattern.DOTALL to the underlying Pattern
92     * @param expr a valid regular expression
93     */
94    public RegexStringComparator(String expr) {
95      this(expr, Pattern.DOTALL);
96    }
97  
98    /**
99     * Constructor
100    * Adds Pattern.DOTALL to the underlying Pattern
101    * @param expr a valid regular expression
102    * @param engine engine implementation type
103    */
104   public RegexStringComparator(String expr, EngineType engine) {
105     this(expr, Pattern.DOTALL, engine);
106   }
107 
108   /**
109    * Constructor
110    * @param expr a valid regular expression
111    * @param flags java.util.regex.Pattern flags
112    */
113   public RegexStringComparator(String expr, int flags) {
114     this(expr, flags, EngineType.JAVA);
115   }
116 
117   /**
118    * Constructor
119    * @param expr a valid regular expression
120    * @param flags java.util.regex.Pattern flags
121    * @param engine engine implementation type
122    */
123   public RegexStringComparator(String expr, int flags, EngineType engine) {
124     super(Bytes.toBytes(expr));
125     switch (engine) {
126       case JAVA:
127         this.engine = new JavaRegexEngine(expr, flags);
128         break;
129       case JONI:
130         this.engine = new JoniRegexEngine(expr, flags);
131         break;
132     }
133   }
134 
135   /**
136    * Specifies the {@link Charset} to use to convert the row key to a String.
137    * <p>
138    * The row key needs to be converted to a String in order to be matched
139    * against the regular expression.  This method controls which charset is
140    * used to do this conversion.
141    * <p>
142    * If the row key is made of arbitrary bytes, the charset {@code ISO-8859-1}
143    * is recommended.
144    * @param charset The charset to use.
145    */
146   public void setCharset(final Charset charset) {
147     engine.setCharset(charset.name());
148   }
149 
150   @Override
151   public int compareTo(byte[] value, int offset, int length) {
152     return engine.compareTo(value, offset, length);
153   }
154 
155   /**
156    * @return The comparator serialized using pb
157    */
158   @Override
159   public byte [] toByteArray() {
160     return engine.toByteArray();
161   }
162 
163   /**
164    * @param pbBytes A pb serialized {@link RegexStringComparator} instance
165    * @return An instance of {@link RegexStringComparator} made from <code>bytes</code>
166    * @throws DeserializationException
167    * @see #toByteArray
168    */
169   public static RegexStringComparator parseFrom(final byte [] pbBytes)
170   throws DeserializationException {
171     ComparatorProtos.RegexStringComparator proto;
172     try {
173       proto = ComparatorProtos.RegexStringComparator.parseFrom(pbBytes);
174     } catch (InvalidProtocolBufferException e) {
175       throw new DeserializationException(e);
176     }
177     RegexStringComparator comparator;
178     if (proto.hasEngine()) {
179       EngineType engine = EngineType.valueOf(proto.getEngine());
180       comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags(),
181         engine);
182     } else {
183       comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags());
184     }
185     String charset = proto.getCharset();
186     if (charset.length() > 0) {
187       try {
188         comparator.getEngine().setCharset(charset);
189       } catch (IllegalCharsetNameException e) {
190         LOG.error("invalid charset", e);
191       }
192     }
193     return comparator;
194   }
195 
196   /**
197    * @param other
198    * @return true if and only if the fields of the comparator that are serialized
199    * are equal to the corresponding fields in other.  Used for testing.
200    */
201   @Override
202   boolean areSerializedFieldsEqual(ByteArrayComparable other) {
203     if (other == this) return true;
204     if (!(other instanceof RegexStringComparator)) return false;
205     RegexStringComparator comparator = (RegexStringComparator)other;
206     return super.areSerializedFieldsEqual(comparator)
207       && engine.getClass().isInstance(comparator.getEngine())
208       && engine.getPattern().equals(comparator.getEngine().getPattern())
209       && engine.getFlags() == comparator.getEngine().getFlags()
210       && engine.getCharset().equals(comparator.getEngine().getCharset());
211   }
212 
213   Engine getEngine() {
214     return engine;
215   }
216 
217   /**
218    * This is an internal interface for abstracting access to different regular
219    * expression matching engines.
220    */
221   static interface Engine {
222     /**
223      * Returns the string representation of the configured regular expression
224      * for matching
225      */
226     String getPattern();
227 
228     /**
229      * Returns the set of configured match flags, a bit mask that may include
230      * {@link Pattern} flags
231      */
232     int getFlags();
233 
234     /**
235      * Returns the name of the configured charset
236      */
237     String getCharset();
238 
239     /**
240      * Set the charset used when matching
241      * @param charset the name of the desired charset for matching
242      */
243     void setCharset(final String charset);
244 
245     /**
246      * Return the serialized form of the configured matcher
247      */
248     byte [] toByteArray();
249 
250     /**
251      * Match the given input against the configured pattern
252      * @param value the data to be matched
253      * @param offset offset of the data to be matched
254      * @param length length of the data to be matched
255      * @return 0 if a match was made, 1 otherwise
256      */
257     int compareTo(byte[] value, int offset, int length);
258   }
259 
260   /**
261    * Implementation of the Engine interface using Java's Pattern.
262    * <p>
263    * This is the default engine.
264    */
265   static class JavaRegexEngine implements Engine {
266     private Charset charset = Charset.forName("UTF-8");
267     private Pattern pattern;
268 
269     public JavaRegexEngine(String regex, int flags) {
270       this.pattern = Pattern.compile(regex, flags);
271     }
272 
273     @Override
274     public String getPattern() {
275       return pattern.toString();
276     }
277 
278     @Override
279     public int getFlags() {
280       return pattern.flags();
281     }
282 
283     @Override
284     public String getCharset() {
285       return charset.name();
286     }
287 
288     @Override
289     public void setCharset(String charset) {
290       this.charset = Charset.forName(charset);
291     }
292 
293     @Override
294     public int compareTo(byte[] value, int offset, int length) {
295       // Use find() for subsequence match instead of matches() (full sequence
296       // match) to adhere to the principle of least surprise.
297       String tmp;
298       if (length < value.length / 2) {
299         // See HBASE-9428. Make a copy of the relevant part of the byte[],
300         // or the JDK will copy the entire byte[] during String decode
301         tmp = new String(Arrays.copyOfRange(value, offset, offset + length), charset);
302       } else {
303         tmp = new String(value, offset, length, charset);
304       }
305       return pattern.matcher(tmp).find() ? 0 : 1;
306     }
307 
308     @Override
309     public byte[] toByteArray() {
310       ComparatorProtos.RegexStringComparator.Builder builder =
311           ComparatorProtos.RegexStringComparator.newBuilder();
312       builder.setPattern(pattern.pattern());
313       builder.setPatternFlags(pattern.flags());
314       builder.setCharset(charset.name());
315       builder.setEngine(EngineType.JAVA.name());
316       return builder.build().toByteArray();
317     }
318   }
319 
320   /**
321    * Implementation of the Engine interface using Jruby's joni regex engine.
322    * <p>
323    * This engine operates on byte arrays directly so is expected to be more GC
324    * friendly, and reportedly is twice as fast as Java's Pattern engine.
325    * <p>
326    * NOTE: Only the {@link Pattern} flags CASE_INSENSITIVE, DOTALL, and
327    * MULTILINE are supported.
328    */
329   static class JoniRegexEngine implements Engine {
330     private Encoding encoding = UTF8Encoding.INSTANCE;
331     private String regex;
332     private Regex pattern;
333 
334     public JoniRegexEngine(String regex, int flags) {
335       this.regex = regex;
336       byte[] b = Bytes.toBytes(regex);
337       this.pattern = new Regex(b, 0, b.length, patternToJoniFlags(flags), encoding, Syntax.Java);
338     }
339 
340     @Override
341     public String getPattern() {
342       return regex;
343     }
344 
345     @Override
346     public int getFlags() {
347       return pattern.getOptions();
348     }
349 
350     @Override
351     public String getCharset() {
352       return encoding.getCharsetName();
353     }
354 
355     @Override
356     public void setCharset(String name) {
357       setEncoding(name);
358     }
359 
360     @Override
361     public int compareTo(byte[] value, int offset, int length) {
362       // Use subsequence match instead of full sequence match to adhere to the
363       // principle of least surprise.
364       Matcher m = pattern.matcher(value);
365       return m.search(offset, length, pattern.getOptions()) < 0 ? 1 : 0;
366     }
367 
368     @Override
369     public byte[] toByteArray() {
370       ComparatorProtos.RegexStringComparator.Builder builder =
371           ComparatorProtos.RegexStringComparator.newBuilder();
372         builder.setPattern(regex);
373         builder.setPatternFlags(joniToPatternFlags(pattern.getOptions()));
374         builder.setCharset(encoding.getCharsetName());
375         builder.setEngine(EngineType.JONI.name());
376         return builder.build().toByteArray();
377     }
378 
379     private int patternToJoniFlags(int flags) {
380       int newFlags = 0;
381       if ((flags & Pattern.CASE_INSENSITIVE) != 0) {
382         newFlags |= Option.IGNORECASE;
383       }
384       if ((flags & Pattern.DOTALL) != 0) {
385         // This does NOT mean Pattern.MULTILINE
386         newFlags |= Option.MULTILINE;
387       }
388       if ((flags & Pattern.MULTILINE) != 0) {
389         // This is what Java 8's Nashorn engine does when using joni and
390         // translating Pattern's MULTILINE flag
391         newFlags &= ~Option.SINGLELINE;
392         newFlags |= Option.NEGATE_SINGLELINE;
393       }
394       return newFlags;
395     }
396 
397     private int joniToPatternFlags(int flags) {
398       int newFlags = 0;
399       if ((flags & Option.IGNORECASE) != 0) {
400         newFlags |= Pattern.CASE_INSENSITIVE;
401       }
402       // This does NOT mean Pattern.MULTILINE, this is equivalent to Pattern.DOTALL
403       if ((flags & Option.MULTILINE) != 0) {
404         newFlags |= Pattern.DOTALL;
405       }
406       // This means Pattern.MULTILINE. Nice
407       if ((flags & Option.NEGATE_SINGLELINE) != 0) {
408         newFlags |= Pattern.MULTILINE;
409       }
410       return newFlags;
411     }
412 
413     private void setEncoding(String name) {
414       EncodingDB.Entry e = EncodingDB.getEncodings().get(Bytes.toBytes(name));
415       if (e != null) {
416         encoding = e.getEncoding();
417       } else {
418         throw new IllegalCharsetNameException(name);
419       }
420     }
421   }
422 }