001/** 002 * 003 * Licensed to the Apache Software Foundation (ASF) under one 004 * or more contributor license agreements. See the NOTICE file 005 * distributed with this work for additional information 006 * regarding copyright ownership. The ASF licenses this file 007 * to you under the Apache License, Version 2.0 (the 008 * "License"); you may not use this file except in compliance 009 * with the License. You may obtain a copy of the License at 010 * 011 * http://www.apache.org/licenses/LICENSE-2.0 012 * 013 * Unless required by applicable law or agreed to in writing, software 014 * distributed under the License is distributed on an "AS IS" BASIS, 015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 016 * See the License for the specific language governing permissions and 017 * limitations under the License. 018 */ 019package org.apache.hadoop.hbase.filter; 020 021import java.nio.charset.Charset; 022import java.nio.charset.IllegalCharsetNameException; 023import java.util.Arrays; 024import java.util.regex.Pattern; 025 026import org.apache.yetus.audience.InterfaceAudience; 027import org.apache.hadoop.hbase.exceptions.DeserializationException; 028import org.apache.hadoop.hbase.shaded.protobuf.generated.ComparatorProtos; 029import org.apache.hadoop.hbase.util.Bytes; 030import org.jcodings.Encoding; 031import org.jcodings.EncodingDB; 032import org.jcodings.specific.UTF8Encoding; 033import org.joni.Matcher; 034import org.joni.Option; 035import org.joni.Regex; 036import org.joni.Syntax; 037import org.slf4j.Logger; 038import org.slf4j.LoggerFactory; 039import org.apache.hbase.thirdparty.com.google.protobuf.InvalidProtocolBufferException; 040 041/** 042 * This comparator is for use with {@link CompareFilter} implementations, such 043 * as {@link RowFilter}, {@link QualifierFilter}, and {@link ValueFilter}, for 044 * filtering based on the value of a given column. Use it to test if a given 045 * regular expression matches a cell value in the column. 046 * <p> 047 * Only EQUAL or NOT_EQUAL comparisons are valid with this comparator. 048 * <p> 049 * For example: 050 * <p> 051 * <pre> 052 * ValueFilter vf = new ValueFilter(CompareOp.EQUAL, 053 * new RegexStringComparator( 054 * // v4 IP address 055 * "(((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3,3}" + 056 * "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))(\\/[0-9]+)?" + 057 * "|" + 058 * // v6 IP address 059 * "((([\\dA-Fa-f]{1,4}:){7}[\\dA-Fa-f]{1,4})(:([\\d]{1,3}.)" + 060 * "{3}[\\d]{1,3})?)(\\/[0-9]+)?")); 061 * </pre> 062 * <p> 063 * Supports {@link java.util.regex.Pattern} flags as well: 064 * <p> 065 * <pre> 066 * ValueFilter vf = new ValueFilter(CompareOp.EQUAL, 067 * new RegexStringComparator("regex", Pattern.CASE_INSENSITIVE | Pattern.DOTALL)); 068 * </pre> 069 * @see java.util.regex.Pattern 070 */ 071@InterfaceAudience.Public 072@SuppressWarnings("ComparableType") // Should this move to Comparator usage? 073public class RegexStringComparator extends ByteArrayComparable { 074 075 private static final Logger LOG = LoggerFactory.getLogger(RegexStringComparator.class); 076 077 private Engine engine; 078 079 /** Engine implementation type (default=JAVA) */ 080 @InterfaceAudience.Public 081 public enum EngineType { 082 JAVA, 083 JONI 084 } 085 086 /** 087 * Constructor 088 * Adds Pattern.DOTALL to the underlying Pattern 089 * @param expr a valid regular expression 090 */ 091 public RegexStringComparator(String expr) { 092 this(expr, Pattern.DOTALL); 093 } 094 095 /** 096 * Constructor 097 * Adds Pattern.DOTALL to the underlying Pattern 098 * @param expr a valid regular expression 099 * @param engine engine implementation type 100 */ 101 public RegexStringComparator(String expr, EngineType engine) { 102 this(expr, Pattern.DOTALL, engine); 103 } 104 105 /** 106 * Constructor 107 * @param expr a valid regular expression 108 * @param flags java.util.regex.Pattern flags 109 */ 110 public RegexStringComparator(String expr, int flags) { 111 this(expr, flags, EngineType.JAVA); 112 } 113 114 /** 115 * Constructor 116 * @param expr a valid regular expression 117 * @param flags java.util.regex.Pattern flags 118 * @param engine engine implementation type 119 */ 120 public RegexStringComparator(String expr, int flags, EngineType engine) { 121 super(Bytes.toBytes(expr)); 122 switch (engine) { 123 case JAVA: 124 this.engine = new JavaRegexEngine(expr, flags); 125 break; 126 case JONI: 127 this.engine = new JoniRegexEngine(expr, flags); 128 break; 129 } 130 } 131 132 /** 133 * Specifies the {@link Charset} to use to convert the row key to a String. 134 * <p> 135 * The row key needs to be converted to a String in order to be matched 136 * against the regular expression. This method controls which charset is 137 * used to do this conversion. 138 * <p> 139 * If the row key is made of arbitrary bytes, the charset {@code ISO-8859-1} 140 * is recommended. 141 * @param charset The charset to use. 142 */ 143 public void setCharset(final Charset charset) { 144 engine.setCharset(charset.name()); 145 } 146 147 @Override 148 public int compareTo(byte[] value, int offset, int length) { 149 return engine.compareTo(value, offset, length); 150 } 151 152 /** 153 * @return The comparator serialized using pb 154 */ 155 @Override 156 public byte [] toByteArray() { 157 return engine.toByteArray(); 158 } 159 160 /** 161 * @param pbBytes A pb serialized {@link RegexStringComparator} instance 162 * @return An instance of {@link RegexStringComparator} made from <code>bytes</code> 163 * @throws DeserializationException 164 * @see #toByteArray 165 */ 166 public static RegexStringComparator parseFrom(final byte [] pbBytes) 167 throws DeserializationException { 168 ComparatorProtos.RegexStringComparator proto; 169 try { 170 proto = ComparatorProtos.RegexStringComparator.parseFrom(pbBytes); 171 } catch (InvalidProtocolBufferException e) { 172 throw new DeserializationException(e); 173 } 174 RegexStringComparator comparator; 175 if (proto.hasEngine()) { 176 EngineType engine = EngineType.valueOf(proto.getEngine()); 177 comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags(), 178 engine); 179 } else { 180 comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags()); 181 } 182 String charset = proto.getCharset(); 183 if (charset.length() > 0) { 184 try { 185 comparator.getEngine().setCharset(charset); 186 } catch (IllegalCharsetNameException e) { 187 LOG.error("invalid charset", e); 188 } 189 } 190 return comparator; 191 } 192 193 /** 194 * @param other 195 * @return true if and only if the fields of the comparator that are serialized 196 * are equal to the corresponding fields in other. Used for testing. 197 */ 198 @Override 199 boolean areSerializedFieldsEqual(ByteArrayComparable other) { 200 if (other == this) return true; 201 if (!(other instanceof RegexStringComparator)) return false; 202 RegexStringComparator comparator = (RegexStringComparator)other; 203 return super.areSerializedFieldsEqual(comparator) 204 && engine.getClass().isInstance(comparator.getEngine()) 205 && engine.getPattern().equals(comparator.getEngine().getPattern()) 206 && engine.getFlags() == comparator.getEngine().getFlags() 207 && engine.getCharset().equals(comparator.getEngine().getCharset()); 208 } 209 210 Engine getEngine() { 211 return engine; 212 } 213 214 /** 215 * This is an internal interface for abstracting access to different regular 216 * expression matching engines. 217 */ 218 static interface Engine { 219 /** 220 * Returns the string representation of the configured regular expression 221 * for matching 222 */ 223 String getPattern(); 224 225 /** 226 * Returns the set of configured match flags, a bit mask that may include 227 * {@link Pattern} flags 228 */ 229 int getFlags(); 230 231 /** 232 * Returns the name of the configured charset 233 */ 234 String getCharset(); 235 236 /** 237 * Set the charset used when matching 238 * @param charset the name of the desired charset for matching 239 */ 240 void setCharset(final String charset); 241 242 /** 243 * Return the serialized form of the configured matcher 244 */ 245 byte [] toByteArray(); 246 247 /** 248 * Match the given input against the configured pattern 249 * @param value the data to be matched 250 * @param offset offset of the data to be matched 251 * @param length length of the data to be matched 252 * @return 0 if a match was made, 1 otherwise 253 */ 254 int compareTo(byte[] value, int offset, int length); 255 } 256 257 /** 258 * Implementation of the Engine interface using Java's Pattern. 259 * <p> 260 * This is the default engine. 261 */ 262 static class JavaRegexEngine implements Engine { 263 private Charset charset = Charset.forName("UTF-8"); 264 private Pattern pattern; 265 266 public JavaRegexEngine(String regex, int flags) { 267 this.pattern = Pattern.compile(regex, flags); 268 } 269 270 @Override 271 public String getPattern() { 272 return pattern.toString(); 273 } 274 275 @Override 276 public int getFlags() { 277 return pattern.flags(); 278 } 279 280 @Override 281 public String getCharset() { 282 return charset.name(); 283 } 284 285 @Override 286 public void setCharset(String charset) { 287 this.charset = Charset.forName(charset); 288 } 289 290 @Override 291 public int compareTo(byte[] value, int offset, int length) { 292 // Use find() for subsequence match instead of matches() (full sequence 293 // match) to adhere to the principle of least surprise. 294 String tmp; 295 if (length < value.length / 2) { 296 // See HBASE-9428. Make a copy of the relevant part of the byte[], 297 // or the JDK will copy the entire byte[] during String decode 298 tmp = new String(Arrays.copyOfRange(value, offset, offset + length), charset); 299 } else { 300 tmp = new String(value, offset, length, charset); 301 } 302 return pattern.matcher(tmp).find() ? 0 : 1; 303 } 304 305 @Override 306 public byte[] toByteArray() { 307 ComparatorProtos.RegexStringComparator.Builder builder = 308 ComparatorProtos.RegexStringComparator.newBuilder(); 309 builder.setPattern(pattern.pattern()); 310 builder.setPatternFlags(pattern.flags()); 311 builder.setCharset(charset.name()); 312 builder.setEngine(EngineType.JAVA.name()); 313 return builder.build().toByteArray(); 314 } 315 } 316 317 /** 318 * Implementation of the Engine interface using Jruby's joni regex engine. 319 * <p> 320 * This engine operates on byte arrays directly so is expected to be more GC 321 * friendly, and reportedly is twice as fast as Java's Pattern engine. 322 * <p> 323 * NOTE: Only the {@link Pattern} flags CASE_INSENSITIVE, DOTALL, and 324 * MULTILINE are supported. 325 */ 326 static class JoniRegexEngine implements Engine { 327 private Encoding encoding = UTF8Encoding.INSTANCE; 328 private String regex; 329 private Regex pattern; 330 331 public JoniRegexEngine(String regex, int flags) { 332 this.regex = regex; 333 byte[] b = Bytes.toBytes(regex); 334 this.pattern = new Regex(b, 0, b.length, patternToJoniFlags(flags), encoding, Syntax.Java); 335 } 336 337 @Override 338 public String getPattern() { 339 return regex; 340 } 341 342 @Override 343 public int getFlags() { 344 return pattern.getOptions(); 345 } 346 347 @Override 348 public String getCharset() { 349 return encoding.getCharsetName(); 350 } 351 352 @Override 353 public void setCharset(String name) { 354 setEncoding(name); 355 } 356 357 @Override 358 public int compareTo(byte[] value, int offset, int length) { 359 // Use subsequence match instead of full sequence match to adhere to the 360 // principle of least surprise. 361 Matcher m = pattern.matcher(value); 362 return m.search(offset, length, pattern.getOptions()) < 0 ? 1 : 0; 363 } 364 365 @Override 366 public byte[] toByteArray() { 367 ComparatorProtos.RegexStringComparator.Builder builder = 368 ComparatorProtos.RegexStringComparator.newBuilder(); 369 builder.setPattern(regex); 370 builder.setPatternFlags(joniToPatternFlags(pattern.getOptions())); 371 builder.setCharset(encoding.getCharsetName()); 372 builder.setEngine(EngineType.JONI.name()); 373 return builder.build().toByteArray(); 374 } 375 376 private int patternToJoniFlags(int flags) { 377 int newFlags = 0; 378 if ((flags & Pattern.CASE_INSENSITIVE) != 0) { 379 newFlags |= Option.IGNORECASE; 380 } 381 if ((flags & Pattern.DOTALL) != 0) { 382 // This does NOT mean Pattern.MULTILINE 383 newFlags |= Option.MULTILINE; 384 } 385 if ((flags & Pattern.MULTILINE) != 0) { 386 // This is what Java 8's Nashorn engine does when using joni and 387 // translating Pattern's MULTILINE flag 388 newFlags &= ~Option.SINGLELINE; 389 newFlags |= Option.NEGATE_SINGLELINE; 390 } 391 return newFlags; 392 } 393 394 private int joniToPatternFlags(int flags) { 395 int newFlags = 0; 396 if ((flags & Option.IGNORECASE) != 0) { 397 newFlags |= Pattern.CASE_INSENSITIVE; 398 } 399 // This does NOT mean Pattern.MULTILINE, this is equivalent to Pattern.DOTALL 400 if ((flags & Option.MULTILINE) != 0) { 401 newFlags |= Pattern.DOTALL; 402 } 403 // This means Pattern.MULTILINE. Nice 404 if ((flags & Option.NEGATE_SINGLELINE) != 0) { 405 newFlags |= Pattern.MULTILINE; 406 } 407 return newFlags; 408 } 409 410 private void setEncoding(String name) { 411 EncodingDB.Entry e = EncodingDB.getEncodings().get(Bytes.toBytes(name)); 412 if (e != null) { 413 encoding = e.getEncoding(); 414 } else { 415 throw new IllegalCharsetNameException(name); 416 } 417 } 418 } 419}