View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.filter;
20  
21  import com.google.protobuf.InvalidProtocolBufferException;
22  
23  import java.nio.charset.Charset;
24  import java.nio.charset.IllegalCharsetNameException;
25  import java.util.Arrays;
26  import java.util.regex.Pattern;
27  
28  import org.apache.commons.logging.Log;
29  import org.apache.commons.logging.LogFactory;
30  import org.apache.hadoop.hbase.classification.InterfaceAudience;
31  import org.apache.hadoop.hbase.classification.InterfaceStability;
32  import org.apache.hadoop.hbase.exceptions.DeserializationException;
33  import org.apache.hadoop.hbase.protobuf.generated.ComparatorProtos;
34  import org.apache.hadoop.hbase.util.Bytes;
35  
36  import org.jcodings.Encoding;
37  import org.jcodings.EncodingDB;
38  import org.jcodings.specific.UTF8Encoding;
39  import org.joni.Matcher;
40  import org.joni.Option;
41  import org.joni.Regex;
42  import org.joni.Syntax;
43  
44  /**
45   * This comparator is for use with {@link CompareFilter} implementations, such
46   * as {@link RowFilter}, {@link QualifierFilter}, and {@link ValueFilter}, for
47   * filtering based on the value of a given column. Use it to test if a given
48   * regular expression matches a cell value in the column.
49   * <p>
50   * Only EQUAL or NOT_EQUAL comparisons are valid with this comparator.
51   * <p>
52   * For example:
53   * <p>
54   * <pre>
55   * ValueFilter vf = new ValueFilter(CompareOp.EQUAL,
56   *     new RegexStringComparator(
57   *       // v4 IP address
58   *       "(((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3,3}" +
59   *         "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))(\\/[0-9]+)?" +
60   *         "|" +
61   *       // v6 IP address
62   *       "((([\\dA-Fa-f]{1,4}:){7}[\\dA-Fa-f]{1,4})(:([\\d]{1,3}.)" +
63   *         "{3}[\\d]{1,3})?)(\\/[0-9]+)?"));
64   * </pre>
65   * <p>
66   * Supports {@link java.util.regex.Pattern} flags as well:
67   * <p>
68   * <pre>
69   * ValueFilter vf = new ValueFilter(CompareOp.EQUAL,
70   *     new RegexStringComparator("regex", Pattern.CASE_INSENSITIVE | Pattern.DOTALL));
71   * </pre>
72   * @see java.util.regex.Pattern
73   */
74  @InterfaceAudience.Public
75  @InterfaceStability.Stable
76  public class RegexStringComparator extends ByteArrayComparable {
77  
78    private static final Log LOG = LogFactory.getLog(RegexStringComparator.class);
79  
80    private Engine engine;
81  
82    /** Engine implementation type (default=JAVA) */
83    public enum EngineType {
84      JAVA,
85      JONI
86    }
87  
88    /**
89     * Constructor
90     * Adds Pattern.DOTALL to the underlying Pattern
91     * @param expr a valid regular expression
92     */
93    public RegexStringComparator(String expr) {
94      this(expr, Pattern.DOTALL);
95    }
96  
97    /**
98     * Constructor
99     * Adds Pattern.DOTALL to the underlying Pattern
100    * @param expr a valid regular expression
101    * @param engine engine implementation type
102    */
103   public RegexStringComparator(String expr, EngineType engine) {
104     this(expr, Pattern.DOTALL, engine);
105   }
106 
107   /**
108    * Constructor
109    * @param expr a valid regular expression
110    * @param flags java.util.regex.Pattern flags
111    */
112   public RegexStringComparator(String expr, int flags) {
113     this(expr, flags, EngineType.JAVA);
114   }
115 
116   /**
117    * Constructor
118    * @param expr a valid regular expression
119    * @param flags java.util.regex.Pattern flags
120    * @param engine engine implementation type
121    */
122   public RegexStringComparator(String expr, int flags, EngineType engine) {
123     super(Bytes.toBytes(expr));
124     switch (engine) {
125       case JAVA:
126         this.engine = new JavaRegexEngine(expr, flags);
127         break;
128       case JONI:
129         this.engine = new JoniRegexEngine(expr, flags);
130         break;
131     }
132   }
133 
134   /**
135    * Specifies the {@link Charset} to use to convert the row key to a String.
136    * <p>
137    * The row key needs to be converted to a String in order to be matched
138    * against the regular expression.  This method controls which charset is
139    * used to do this conversion.
140    * <p>
141    * If the row key is made of arbitrary bytes, the charset {@code ISO-8859-1}
142    * is recommended.
143    * @param charset The charset to use.
144    */
145   public void setCharset(final Charset charset) {
146     engine.setCharset(charset.name());
147   }
148 
149   @Override
150   public int compareTo(byte[] value, int offset, int length) {
151     return engine.compareTo(value, offset, length);
152   }
153 
154   /**
155    * @return The comparator serialized using pb
156    */
157   public byte [] toByteArray() {
158     return engine.toByteArray();
159   }
160 
161   /**
162    * @param pbBytes A pb serialized {@link RegexStringComparator} instance
163    * @return An instance of {@link RegexStringComparator} made from <code>bytes</code>
164    * @throws DeserializationException
165    * @see #toByteArray
166    */
167   public static RegexStringComparator parseFrom(final byte [] pbBytes)
168   throws DeserializationException {
169     ComparatorProtos.RegexStringComparator proto;
170     try {
171       proto = ComparatorProtos.RegexStringComparator.parseFrom(pbBytes);
172     } catch (InvalidProtocolBufferException e) {
173       throw new DeserializationException(e);
174     }
175     RegexStringComparator comparator;
176     if (proto.hasEngine()) {
177       EngineType engine = EngineType.valueOf(proto.getEngine());
178       comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags(),
179         engine);      
180     } else {
181       comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags());
182     }
183     String charset = proto.getCharset();
184     if (charset.length() > 0) {
185       try {
186         comparator.getEngine().setCharset(charset);
187       } catch (IllegalCharsetNameException e) {
188         LOG.error("invalid charset", e);
189       }
190     }
191     return comparator;
192   }
193 
194   /**
195    * @param other
196    * @return true if and only if the fields of the comparator that are serialized
197    * are equal to the corresponding fields in other.  Used for testing.
198    */
199   boolean areSerializedFieldsEqual(ByteArrayComparable other) {
200     if (other == this) return true;
201     if (!(other instanceof RegexStringComparator)) return false;
202     RegexStringComparator comparator = (RegexStringComparator)other;
203     return super.areSerializedFieldsEqual(comparator)
204       && engine.getClass().isInstance(comparator.getEngine())
205       && engine.getPattern().equals(comparator.getEngine().getPattern())
206       && engine.getFlags() == comparator.getEngine().getFlags()
207       && engine.getCharset().equals(comparator.getEngine().getCharset());
208   }
209 
210   Engine getEngine() {
211     return engine;
212   }
213 
214   /**
215    * This is an internal interface for abstracting access to different regular
216    * expression matching engines. 
217    */
218   static interface Engine {
219     /**
220      * Returns the string representation of the configured regular expression
221      * for matching
222      */
223     String getPattern();
224     
225     /**
226      * Returns the set of configured match flags, a bit mask that may include
227      * {@link Pattern} flags
228      */
229     int getFlags();
230 
231     /**
232      * Returns the name of the configured charset
233      */
234     String getCharset();
235 
236     /**
237      * Set the charset used when matching
238      * @param charset the name of the desired charset for matching
239      */
240     void setCharset(final String charset);
241 
242     /**
243      * Return the serialized form of the configured matcher
244      */
245     byte [] toByteArray();
246 
247     /**
248      * Match the given input against the configured pattern
249      * @param value the data to be matched
250      * @param offset offset of the data to be matched
251      * @param length length of the data to be matched
252      * @return 0 if a match was made, 1 otherwise
253      */
254     int compareTo(byte[] value, int offset, int length);
255   }
256 
257   /**
258    * Implementation of the Engine interface using Java's Pattern.
259    * <p>
260    * This is the default engine.
261    */
262   static class JavaRegexEngine implements Engine {
263     private Charset charset = Charset.forName("UTF-8");
264     private Pattern pattern;
265 
266     public JavaRegexEngine(String regex, int flags) {
267       this.pattern = Pattern.compile(regex, flags);
268     }
269 
270     @Override
271     public String getPattern() {
272       return pattern.toString();
273     }
274 
275     @Override
276     public int getFlags() {
277       return pattern.flags();
278     }
279 
280     @Override
281     public String getCharset() {
282       return charset.name();
283     }
284 
285     @Override
286     public void setCharset(String charset) {
287       this.charset = Charset.forName(charset);
288     }
289 
290     @Override
291     public int compareTo(byte[] value, int offset, int length) {
292       // Use find() for subsequence match instead of matches() (full sequence
293       // match) to adhere to the principle of least surprise.
294       String tmp;
295       if (length < value.length / 2) {
296         // See HBASE-9428. Make a copy of the relevant part of the byte[],
297         // or the JDK will copy the entire byte[] during String decode
298         tmp = new String(Arrays.copyOfRange(value, offset, offset + length), charset);
299       } else {
300         tmp = new String(value, offset, length, charset);
301       }
302       return pattern.matcher(tmp).find() ? 0 : 1;
303     }
304 
305     @Override
306     public byte[] toByteArray() {
307       ComparatorProtos.RegexStringComparator.Builder builder =
308           ComparatorProtos.RegexStringComparator.newBuilder();
309       builder.setPattern(pattern.pattern());
310       builder.setPatternFlags(pattern.flags());
311       builder.setCharset(charset.name());
312       builder.setEngine(EngineType.JAVA.name());
313       return builder.build().toByteArray();
314     }
315   }
316 
317   /**
318    * Implementation of the Engine interface using Jruby's joni regex engine.
319    * <p>
320    * This engine operates on byte arrays directly so is expected to be more GC
321    * friendly, and reportedly is twice as fast as Java's Pattern engine.
322    * <p>
323    * NOTE: Only the {@link Pattern} flags CASE_INSENSITIVE, DOTALL, and
324    * MULTILINE are supported.
325    */
326   static class JoniRegexEngine implements Engine {
327     private Encoding encoding = UTF8Encoding.INSTANCE;
328     private String regex;
329     private Regex pattern;
330 
331     public JoniRegexEngine(String regex, int flags) {
332       this.regex = regex;
333       byte[] b = Bytes.toBytes(regex);
334       this.pattern = new Regex(b, 0, b.length, patternToJoniFlags(flags), encoding, Syntax.Java);
335     }
336 
337     @Override
338     public String getPattern() {
339       return regex;
340     }
341 
342     @Override
343     public int getFlags() {
344       return pattern.getOptions();
345     }
346 
347     @Override
348     public String getCharset() {
349       return encoding.getCharsetName();
350     }
351 
352     @Override
353     public void setCharset(String name) {
354       setEncoding(name);
355     }
356 
357     @Override
358     public int compareTo(byte[] value, int offset, int length) {
359       // Use subsequence match instead of full sequence match to adhere to the
360       // principle of least surprise.
361       Matcher m = pattern.matcher(value);
362       return m.search(offset, length, pattern.getOptions()) < 0 ? 1 : 0;
363     }
364 
365     @Override
366     public byte[] toByteArray() {
367       ComparatorProtos.RegexStringComparator.Builder builder =
368           ComparatorProtos.RegexStringComparator.newBuilder();
369         builder.setPattern(regex);
370         builder.setPatternFlags(joniToPatternFlags(pattern.getOptions()));
371         builder.setCharset(encoding.getCharsetName());
372         builder.setEngine(EngineType.JONI.name());
373         return builder.build().toByteArray();
374     }
375 
376     private int patternToJoniFlags(int flags) {
377       int newFlags = 0;
378       if ((flags & Pattern.CASE_INSENSITIVE) != 0) {
379         newFlags |= Option.IGNORECASE;
380       }
381       if ((flags & Pattern.DOTALL) != 0) {
382         // This does NOT mean Pattern.MULTILINE
383         newFlags |= Option.MULTILINE;
384       }
385       if ((flags & Pattern.MULTILINE) != 0) {
386         // This is what Java 8's Nashorn engine does when using joni and
387         // translating Pattern's MULTILINE flag
388         newFlags &= ~Option.SINGLELINE;
389         newFlags |= Option.NEGATE_SINGLELINE;
390       }
391       return newFlags;
392     }
393 
394     private int joniToPatternFlags(int flags) {
395       int newFlags = 0;
396       if ((flags & Option.IGNORECASE) != 0) {
397         newFlags |= Pattern.CASE_INSENSITIVE;
398       }
399       // This does NOT mean Pattern.MULTILINE, this is equivalent to Pattern.DOTALL
400       if ((flags & Option.MULTILINE) != 0) {
401         newFlags |= Pattern.DOTALL;
402       }
403       // This means Pattern.MULTILINE. Nice
404       if ((flags & Option.NEGATE_SINGLELINE) != 0) {
405         newFlags |= Pattern.MULTILINE;
406       }
407       return newFlags;
408     }
409 
410     private void setEncoding(String name) {
411       EncodingDB.Entry e = EncodingDB.getEncodings().get(Bytes.toBytes(name));
412       if (e != null) {
413         encoding = e.getEncoding();
414       } else {
415         throw new IllegalCharsetNameException(name);
416       }    
417     }
418   }
419 }