001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.filter; 019 020import java.nio.charset.Charset; 021import java.nio.charset.IllegalCharsetNameException; 022import java.util.Arrays; 023import java.util.regex.Pattern; 024import org.apache.hadoop.hbase.exceptions.DeserializationException; 025import org.apache.hadoop.hbase.util.Bytes; 026import org.apache.yetus.audience.InterfaceAudience; 027import org.jcodings.Encoding; 028import org.jcodings.EncodingDB; 029import org.jcodings.specific.UTF8Encoding; 030import org.joni.Matcher; 031import org.joni.Option; 032import org.joni.Regex; 033import org.joni.Syntax; 034import org.slf4j.Logger; 035import org.slf4j.LoggerFactory; 036 037import org.apache.hbase.thirdparty.com.google.protobuf.InvalidProtocolBufferException; 038 039import org.apache.hadoop.hbase.shaded.protobuf.generated.ComparatorProtos; 040 041/** 042 * This comparator is for use with {@link CompareFilter} implementations, such as {@link RowFilter}, 043 * {@link QualifierFilter}, and {@link ValueFilter}, for filtering based on the value of a given 044 * column. Use it to test if a given regular expression matches a cell value in the column. 045 * <p> 046 * Only EQUAL or NOT_EQUAL comparisons are valid with this comparator. 047 * <p> 048 * For example: 049 * <p> 050 * 051 * <pre> 052 * ValueFilter vf = new ValueFilter(CompareOp.EQUAL, new RegexStringComparator( 053 * // v4 IP address 054 * "(((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3,3}" 055 * + "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))(\\/[0-9]+)?" + "|" + 056 * // v6 IP address 057 * "((([\\dA-Fa-f]{1,4}:){7}[\\dA-Fa-f]{1,4})(:([\\d]{1,3}.)" 058 * + "{3}[\\d]{1,3})?)(\\/[0-9]+)?")); 059 * </pre> 060 * <p> 061 * Supports {@link java.util.regex.Pattern} flags as well: 062 * <p> 063 * 064 * <pre> 065 * ValueFilter vf = new ValueFilter(CompareOp.EQUAL, 066 * new RegexStringComparator("regex", Pattern.CASE_INSENSITIVE | Pattern.DOTALL)); 067 * </pre> 068 * 069 * @see java.util.regex.Pattern 070 */ 071@InterfaceAudience.Public 072@SuppressWarnings("ComparableType") // Should this move to Comparator usage? 073public class RegexStringComparator extends ByteArrayComparable { 074 075 private static final Logger LOG = LoggerFactory.getLogger(RegexStringComparator.class); 076 077 private Engine engine; 078 079 /** Engine implementation type (default=JAVA) */ 080 @InterfaceAudience.Public 081 public enum EngineType { 082 JAVA, 083 JONI 084 } 085 086 /** 087 * Constructor Adds Pattern.DOTALL to the underlying Pattern 088 * @param expr a valid regular expression 089 */ 090 public RegexStringComparator(String expr) { 091 this(expr, Pattern.DOTALL); 092 } 093 094 /** 095 * Constructor Adds Pattern.DOTALL to the underlying Pattern 096 * @param expr a valid regular expression 097 * @param engine engine implementation type 098 */ 099 public RegexStringComparator(String expr, EngineType engine) { 100 this(expr, Pattern.DOTALL, engine); 101 } 102 103 /** 104 * Constructor 105 * @param expr a valid regular expression 106 * @param flags java.util.regex.Pattern flags 107 */ 108 public RegexStringComparator(String expr, int flags) { 109 this(expr, flags, EngineType.JAVA); 110 } 111 112 /** 113 * Constructor 114 * @param expr a valid regular expression 115 * @param flags java.util.regex.Pattern flags 116 * @param engine engine implementation type 117 */ 118 public RegexStringComparator(String expr, int flags, EngineType engine) { 119 super(Bytes.toBytes(expr)); 120 switch (engine) { 121 case JAVA: 122 this.engine = new JavaRegexEngine(expr, flags); 123 break; 124 case JONI: 125 this.engine = new JoniRegexEngine(expr, flags); 126 break; 127 } 128 } 129 130 /** 131 * Specifies the {@link Charset} to use to convert the row key to a String. 132 * <p> 133 * The row key needs to be converted to a String in order to be matched against the regular 134 * expression. This method controls which charset is used to do this conversion. 135 * <p> 136 * If the row key is made of arbitrary bytes, the charset {@code ISO-8859-1} is recommended. 137 * @param charset The charset to use. 138 */ 139 public void setCharset(final Charset charset) { 140 engine.setCharset(charset.name()); 141 } 142 143 @Override 144 public int compareTo(byte[] value, int offset, int length) { 145 return engine.compareTo(value, offset, length); 146 } 147 148 /** 149 * @return The comparator serialized using pb 150 */ 151 @Override 152 public byte[] toByteArray() { 153 return engine.toByteArray(); 154 } 155 156 /** 157 * @param pbBytes A pb serialized {@link RegexStringComparator} instance 158 * @return An instance of {@link RegexStringComparator} made from <code>bytes</code> n * @see 159 * #toByteArray 160 */ 161 public static RegexStringComparator parseFrom(final byte[] pbBytes) 162 throws DeserializationException { 163 ComparatorProtos.RegexStringComparator proto; 164 try { 165 proto = ComparatorProtos.RegexStringComparator.parseFrom(pbBytes); 166 } catch (InvalidProtocolBufferException e) { 167 throw new DeserializationException(e); 168 } 169 RegexStringComparator comparator; 170 if (proto.hasEngine()) { 171 EngineType engine = EngineType.valueOf(proto.getEngine()); 172 comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags(), engine); 173 } else { 174 comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags()); 175 } 176 String charset = proto.getCharset(); 177 if (charset.length() > 0) { 178 try { 179 comparator.getEngine().setCharset(charset); 180 } catch (IllegalCharsetNameException e) { 181 LOG.error("invalid charset", e); 182 } 183 } 184 return comparator; 185 } 186 187 /** 188 * n * @return true if and only if the fields of the comparator that are serialized are equal to 189 * the corresponding fields in other. Used for testing. 190 */ 191 @Override 192 boolean areSerializedFieldsEqual(ByteArrayComparable other) { 193 if (other == this) return true; 194 if (!(other instanceof RegexStringComparator)) return false; 195 RegexStringComparator comparator = (RegexStringComparator) other; 196 return super.areSerializedFieldsEqual(comparator) 197 && engine.getClass().isInstance(comparator.getEngine()) 198 && engine.getPattern().equals(comparator.getEngine().getPattern()) 199 && engine.getFlags() == comparator.getEngine().getFlags() 200 && engine.getCharset().equals(comparator.getEngine().getCharset()); 201 } 202 203 Engine getEngine() { 204 return engine; 205 } 206 207 /** 208 * This is an internal interface for abstracting access to different regular expression matching 209 * engines. 210 */ 211 static interface Engine { 212 /** 213 * Returns the string representation of the configured regular expression for matching 214 */ 215 String getPattern(); 216 217 /** 218 * Returns the set of configured match flags, a bit mask that may include {@link Pattern} flags 219 */ 220 int getFlags(); 221 222 /** 223 * Returns the name of the configured charset 224 */ 225 String getCharset(); 226 227 /** 228 * Set the charset used when matching 229 * @param charset the name of the desired charset for matching 230 */ 231 void setCharset(final String charset); 232 233 /** 234 * Return the serialized form of the configured matcher 235 */ 236 byte[] toByteArray(); 237 238 /** 239 * Match the given input against the configured pattern 240 * @param value the data to be matched 241 * @param offset offset of the data to be matched 242 * @param length length of the data to be matched 243 * @return 0 if a match was made, 1 otherwise 244 */ 245 int compareTo(byte[] value, int offset, int length); 246 } 247 248 /** 249 * Implementation of the Engine interface using Java's Pattern. 250 * <p> 251 * This is the default engine. 252 */ 253 static class JavaRegexEngine implements Engine { 254 private Charset charset = Charset.forName("UTF-8"); 255 private Pattern pattern; 256 257 public JavaRegexEngine(String regex, int flags) { 258 this.pattern = Pattern.compile(regex, flags); 259 } 260 261 @Override 262 public String getPattern() { 263 return pattern.toString(); 264 } 265 266 @Override 267 public int getFlags() { 268 return pattern.flags(); 269 } 270 271 @Override 272 public String getCharset() { 273 return charset.name(); 274 } 275 276 @Override 277 public void setCharset(String charset) { 278 this.charset = Charset.forName(charset); 279 } 280 281 @Override 282 public int compareTo(byte[] value, int offset, int length) { 283 // Use find() for subsequence match instead of matches() (full sequence 284 // match) to adhere to the principle of least surprise. 285 String tmp; 286 if (length < value.length / 2) { 287 // See HBASE-9428. Make a copy of the relevant part of the byte[], 288 // or the JDK will copy the entire byte[] during String decode 289 tmp = new String(Arrays.copyOfRange(value, offset, offset + length), charset); 290 } else { 291 tmp = new String(value, offset, length, charset); 292 } 293 return pattern.matcher(tmp).find() ? 0 : 1; 294 } 295 296 @Override 297 public byte[] toByteArray() { 298 ComparatorProtos.RegexStringComparator.Builder builder = 299 ComparatorProtos.RegexStringComparator.newBuilder(); 300 builder.setPattern(pattern.pattern()); 301 builder.setPatternFlags(pattern.flags()); 302 builder.setCharset(charset.name()); 303 builder.setEngine(EngineType.JAVA.name()); 304 return builder.build().toByteArray(); 305 } 306 } 307 308 /** 309 * Implementation of the Engine interface using Jruby's joni regex engine. 310 * <p> 311 * This engine operates on byte arrays directly so is expected to be more GC friendly, and 312 * reportedly is twice as fast as Java's Pattern engine. 313 * <p> 314 * NOTE: Only the {@link Pattern} flags CASE_INSENSITIVE, DOTALL, and MULTILINE are supported. 315 */ 316 static class JoniRegexEngine implements Engine { 317 private Encoding encoding = UTF8Encoding.INSTANCE; 318 private String regex; 319 private Regex pattern; 320 321 public JoniRegexEngine(String regex, int flags) { 322 this.regex = regex; 323 byte[] b = Bytes.toBytes(regex); 324 this.pattern = new Regex(b, 0, b.length, patternToJoniFlags(flags), encoding, Syntax.Java); 325 } 326 327 @Override 328 public String getPattern() { 329 return regex; 330 } 331 332 @Override 333 public int getFlags() { 334 return pattern.getOptions(); 335 } 336 337 @Override 338 public String getCharset() { 339 return encoding.getCharsetName(); 340 } 341 342 @Override 343 public void setCharset(String name) { 344 setEncoding(name); 345 } 346 347 @Override 348 public int compareTo(byte[] value, int offset, int length) { 349 // Use subsequence match instead of full sequence match to adhere to the 350 // principle of least surprise. 351 Matcher m = pattern.matcher(value); 352 return m.search(offset, length, pattern.getOptions()) < 0 ? 1 : 0; 353 } 354 355 @Override 356 public byte[] toByteArray() { 357 ComparatorProtos.RegexStringComparator.Builder builder = 358 ComparatorProtos.RegexStringComparator.newBuilder(); 359 builder.setPattern(regex); 360 builder.setPatternFlags(joniToPatternFlags(pattern.getOptions())); 361 builder.setCharset(encoding.getCharsetName()); 362 builder.setEngine(EngineType.JONI.name()); 363 return builder.build().toByteArray(); 364 } 365 366 private int patternToJoniFlags(int flags) { 367 int newFlags = 0; 368 if ((flags & Pattern.CASE_INSENSITIVE) != 0) { 369 newFlags |= Option.IGNORECASE; 370 } 371 if ((flags & Pattern.DOTALL) != 0) { 372 // This does NOT mean Pattern.MULTILINE 373 newFlags |= Option.MULTILINE; 374 } 375 if ((flags & Pattern.MULTILINE) != 0) { 376 // This is what Java 8's Nashorn engine does when using joni and 377 // translating Pattern's MULTILINE flag 378 newFlags &= ~Option.SINGLELINE; 379 newFlags |= Option.NEGATE_SINGLELINE; 380 } 381 return newFlags; 382 } 383 384 private int joniToPatternFlags(int flags) { 385 int newFlags = 0; 386 if ((flags & Option.IGNORECASE) != 0) { 387 newFlags |= Pattern.CASE_INSENSITIVE; 388 } 389 // This does NOT mean Pattern.MULTILINE, this is equivalent to Pattern.DOTALL 390 if ((flags & Option.MULTILINE) != 0) { 391 newFlags |= Pattern.DOTALL; 392 } 393 // This means Pattern.MULTILINE. Nice 394 if ((flags & Option.NEGATE_SINGLELINE) != 0) { 395 newFlags |= Pattern.MULTILINE; 396 } 397 return newFlags; 398 } 399 400 private void setEncoding(String name) { 401 EncodingDB.Entry e = EncodingDB.getEncodings().get(Bytes.toBytes(name)); 402 if (e != null) { 403 encoding = e.getEncoding(); 404 } else { 405 throw new IllegalCharsetNameException(name); 406 } 407 } 408 } 409}