/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.filter;

import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.Arrays;
import java.util.regex.Pattern;
import org.apache.hadoop.hbase.exceptions.DeserializationException;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.yetus.audience.InterfaceAudience;
import org.jcodings.Encoding;
import org.jcodings.EncodingDB;
import org.jcodings.specific.NonStrictUTF8Encoding;
import org.joni.Matcher;
import org.joni.Option;
import org.joni.Regex;
import org.joni.Syntax;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.protobuf.InvalidProtocolBufferException;

import org.apache.hadoop.hbase.shaded.protobuf.generated.ComparatorProtos;

/**
 * This comparator is for use with {@link CompareFilter} implementations, such as {@link RowFilter},
 * {@link QualifierFilter}, and {@link ValueFilter}, for filtering based on the value of a given
 * column. Use it to test if a given regular expression matches a cell value in the column.
 * <p>
 * Only EQUAL or NOT_EQUAL comparisons are valid with this comparator.
 * <p>
 * For example:
 * <p>
 *
 * <pre>
 * ValueFilter vf = new ValueFilter(CompareOp.EQUAL, new RegexStringComparator(
 *   // v4 IP address
 *   "(((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3,3}"
 *     + "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))(\\/[0-9]+)?" + "|" +
 *   // v6 IP address
 *   "((([\\dA-Fa-f]{1,4}:){7}[\\dA-Fa-f]{1,4})(:([\\d]{1,3}.)"
 *     + "{3}[\\d]{1,3})?)(\\/[0-9]+)?"));
 * </pre>
 * <p>
 * Supports {@link java.util.regex.Pattern} flags as well:
 * <p>
 *
 * <pre>
 * ValueFilter vf = new ValueFilter(CompareOp.EQUAL,
 *   new RegexStringComparator("regex", Pattern.CASE_INSENSITIVE | Pattern.DOTALL));
 * </pre>
 *
 * @see java.util.regex.Pattern
 */
@InterfaceAudience.Public
@SuppressWarnings("ComparableType") // Should this move to Comparator usage?
073public class RegexStringComparator extends ByteArrayComparable { 074 075 private static final Logger LOG = LoggerFactory.getLogger(RegexStringComparator.class); 076 077 private Engine engine; 078 079 /** Engine implementation type (default=JAVA) */ 080 @InterfaceAudience.Public 081 public enum EngineType { 082 JAVA, 083 JONI 084 } 085 086 /** 087 * Constructor Adds Pattern.DOTALL to the underlying Pattern 088 * @param expr a valid regular expression 089 */ 090 public RegexStringComparator(String expr) { 091 this(expr, Pattern.DOTALL); 092 } 093 094 /** 095 * Constructor Adds Pattern.DOTALL to the underlying Pattern 096 * @param expr a valid regular expression 097 * @param engine engine implementation type 098 */ 099 public RegexStringComparator(String expr, EngineType engine) { 100 this(expr, Pattern.DOTALL, engine); 101 } 102 103 /** 104 * Constructor 105 * @param expr a valid regular expression 106 * @param flags java.util.regex.Pattern flags 107 */ 108 public RegexStringComparator(String expr, int flags) { 109 this(expr, flags, EngineType.JAVA); 110 } 111 112 /** 113 * Constructor 114 * @param expr a valid regular expression 115 * @param flags java.util.regex.Pattern flags 116 * @param engine engine implementation type 117 */ 118 public RegexStringComparator(String expr, int flags, EngineType engine) { 119 super(Bytes.toBytes(expr)); 120 switch (engine) { 121 case JAVA: 122 this.engine = new JavaRegexEngine(expr, flags); 123 break; 124 case JONI: 125 this.engine = new JoniRegexEngine(expr, flags); 126 break; 127 } 128 } 129 130 /** 131 * Specifies the {@link Charset} to use to convert the row key to a String. 132 * <p> 133 * The row key needs to be converted to a String in order to be matched against the regular 134 * expression. This method controls which charset is used to do this conversion. 135 * <p> 136 * If the row key is made of arbitrary bytes, the charset {@code ISO-8859-1} is recommended. 137 * @param charset The charset to use. 
138 */ 139 public void setCharset(final Charset charset) { 140 engine.setCharset(charset.name()); 141 } 142 143 @Override 144 public int compareTo(byte[] value, int offset, int length) { 145 return engine.compareTo(value, offset, length); 146 } 147 148 /** Returns The comparator serialized using pb */ 149 @Override 150 public byte[] toByteArray() { 151 return engine.toByteArray(); 152 } 153 154 /** 155 * Parse a serialized representation of {@link RegexStringComparator} 156 * @param pbBytes A pb serialized {@link RegexStringComparator} instance 157 * @return An instance of {@link RegexStringComparator} made from <code>bytes</code> 158 * @throws DeserializationException if an error occurred 159 * @see #toByteArray 160 */ 161 public static RegexStringComparator parseFrom(final byte[] pbBytes) 162 throws DeserializationException { 163 ComparatorProtos.RegexStringComparator proto; 164 try { 165 proto = ComparatorProtos.RegexStringComparator.parseFrom(pbBytes); 166 } catch (InvalidProtocolBufferException e) { 167 throw new DeserializationException(e); 168 } 169 RegexStringComparator comparator; 170 if (proto.hasEngine()) { 171 EngineType engine = EngineType.valueOf(proto.getEngine()); 172 comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags(), engine); 173 } else { 174 comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags()); 175 } 176 String charset = proto.getCharset(); 177 if (charset.length() > 0) { 178 try { 179 comparator.getEngine().setCharset(charset); 180 } catch (IllegalCharsetNameException e) { 181 LOG.error("invalid charset", e); 182 } 183 } 184 return comparator; 185 } 186 187 /** 188 * Returns true if and only if the fields of the comparator that are serialized are equal to the 189 * corresponding fields in other. Used for testing. 
190 */ 191 @Override 192 boolean areSerializedFieldsEqual(ByteArrayComparable other) { 193 if (other == this) { 194 return true; 195 } 196 if (!(other instanceof RegexStringComparator)) { 197 return false; 198 } 199 RegexStringComparator comparator = (RegexStringComparator) other; 200 return super.areSerializedFieldsEqual(comparator) 201 && engine.getClass().isInstance(comparator.getEngine()) 202 && engine.getPattern().equals(comparator.getEngine().getPattern()) 203 && engine.getFlags() == comparator.getEngine().getFlags() 204 && engine.getCharset().equals(comparator.getEngine().getCharset()); 205 } 206 207 Engine getEngine() { 208 return engine; 209 } 210 211 /** 212 * This is an internal interface for abstracting access to different regular expression matching 213 * engines. 214 */ 215 static interface Engine { 216 /** 217 * Returns the string representation of the configured regular expression for matching 218 */ 219 String getPattern(); 220 221 /** 222 * Returns the set of configured match flags, a bit mask that may include {@link Pattern} flags 223 */ 224 int getFlags(); 225 226 /** 227 * Returns the name of the configured charset 228 */ 229 String getCharset(); 230 231 /** 232 * Set the charset used when matching 233 * @param charset the name of the desired charset for matching 234 */ 235 void setCharset(final String charset); 236 237 /** 238 * Return the serialized form of the configured matcher 239 */ 240 byte[] toByteArray(); 241 242 /** 243 * Match the given input against the configured pattern 244 * @param value the data to be matched 245 * @param offset offset of the data to be matched 246 * @param length length of the data to be matched 247 * @return 0 if a match was made, 1 otherwise 248 */ 249 int compareTo(byte[] value, int offset, int length); 250 } 251 252 /** 253 * Implementation of the Engine interface using Java's Pattern. 254 * <p> 255 * This is the default engine. 
256 */ 257 static class JavaRegexEngine implements Engine { 258 private Charset charset = Charset.forName("UTF-8"); 259 private Pattern pattern; 260 261 public JavaRegexEngine(String regex, int flags) { 262 this.pattern = Pattern.compile(regex, flags); 263 } 264 265 @Override 266 public String getPattern() { 267 return pattern.toString(); 268 } 269 270 @Override 271 public int getFlags() { 272 return pattern.flags(); 273 } 274 275 @Override 276 public String getCharset() { 277 return charset.name(); 278 } 279 280 @Override 281 public void setCharset(String charset) { 282 this.charset = Charset.forName(charset); 283 } 284 285 @Override 286 public int compareTo(byte[] value, int offset, int length) { 287 // Use find() for subsequence match instead of matches() (full sequence 288 // match) to adhere to the principle of least surprise. 289 String tmp; 290 if (length < value.length / 2) { 291 // See HBASE-9428. Make a copy of the relevant part of the byte[], 292 // or the JDK will copy the entire byte[] during String decode 293 tmp = new String(Arrays.copyOfRange(value, offset, offset + length), charset); 294 } else { 295 tmp = new String(value, offset, length, charset); 296 } 297 return pattern.matcher(tmp).find() ? 0 : 1; 298 } 299 300 @Override 301 public byte[] toByteArray() { 302 ComparatorProtos.RegexStringComparator.Builder builder = 303 ComparatorProtos.RegexStringComparator.newBuilder(); 304 builder.setPattern(pattern.pattern()); 305 builder.setPatternFlags(pattern.flags()); 306 builder.setCharset(charset.name()); 307 builder.setEngine(EngineType.JAVA.name()); 308 return builder.build().toByteArray(); 309 } 310 } 311 312 /** 313 * Implementation of the Engine interface using Jruby's joni regex engine. 314 * <p> 315 * This engine operates on byte arrays directly so is expected to be more GC friendly, and 316 * reportedly is twice as fast as Java's Pattern engine. 
317 * <p> 318 * NOTE: Only the {@link Pattern} flags CASE_INSENSITIVE, DOTALL, and MULTILINE are supported. 319 */ 320 static class JoniRegexEngine implements Engine { 321 // When using UTF8Encoding, an infinite loop can occur if an invalid UTF8 is encountered. 322 // Use NonStrictUTF8Encoding instead of UTF8Encoding to avoid the issue. 323 private Encoding encoding = NonStrictUTF8Encoding.INSTANCE; 324 private String regex; 325 private Regex pattern; 326 327 public JoniRegexEngine(String regex, int flags) { 328 this.regex = regex; 329 byte[] b = Bytes.toBytes(regex); 330 this.pattern = new Regex(b, 0, b.length, patternToJoniFlags(flags), encoding, Syntax.Java); 331 } 332 333 @Override 334 public String getPattern() { 335 return regex; 336 } 337 338 @Override 339 public int getFlags() { 340 return pattern.getOptions(); 341 } 342 343 @Override 344 public String getCharset() { 345 return encoding.getCharsetName(); 346 } 347 348 @Override 349 public void setCharset(String name) { 350 setEncoding(name); 351 } 352 353 @Override 354 public int compareTo(byte[] value, int offset, int length) { 355 // Use subsequence match instead of full sequence match to adhere to the 356 // principle of least surprise. 357 Matcher m = pattern.matcher(value); 358 return m.search(offset, length, pattern.getOptions()) < 0 ? 
1 : 0; 359 } 360 361 @Override 362 public byte[] toByteArray() { 363 ComparatorProtos.RegexStringComparator.Builder builder = 364 ComparatorProtos.RegexStringComparator.newBuilder(); 365 builder.setPattern(regex); 366 builder.setPatternFlags(joniToPatternFlags(pattern.getOptions())); 367 builder.setCharset(encoding.getCharsetName()); 368 builder.setEngine(EngineType.JONI.name()); 369 return builder.build().toByteArray(); 370 } 371 372 private int patternToJoniFlags(int flags) { 373 int newFlags = 0; 374 if ((flags & Pattern.CASE_INSENSITIVE) != 0) { 375 newFlags |= Option.IGNORECASE; 376 } 377 if ((flags & Pattern.DOTALL) != 0) { 378 // This does NOT mean Pattern.MULTILINE 379 newFlags |= Option.MULTILINE; 380 } 381 if ((flags & Pattern.MULTILINE) != 0) { 382 // This is what Java 8's Nashorn engine does when using joni and 383 // translating Pattern's MULTILINE flag 384 newFlags &= ~Option.SINGLELINE; 385 newFlags |= Option.NEGATE_SINGLELINE; 386 } 387 return newFlags; 388 } 389 390 private int joniToPatternFlags(int flags) { 391 int newFlags = 0; 392 if ((flags & Option.IGNORECASE) != 0) { 393 newFlags |= Pattern.CASE_INSENSITIVE; 394 } 395 // This does NOT mean Pattern.MULTILINE, this is equivalent to Pattern.DOTALL 396 if ((flags & Option.MULTILINE) != 0) { 397 newFlags |= Pattern.DOTALL; 398 } 399 // This means Pattern.MULTILINE. Nice 400 if ((flags & Option.NEGATE_SINGLELINE) != 0) { 401 newFlags |= Pattern.MULTILINE; 402 } 403 return newFlags; 404 } 405 406 private void setEncoding(String name) { 407 EncodingDB.Entry e = EncodingDB.getEncodings().get(Bytes.toBytes(name)); 408 if (e != null) { 409 encoding = e.getEncoding(); 410 } else { 411 throw new IllegalCharsetNameException(name); 412 } 413 } 414 } 415}