View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.filter;
19  
20  import java.util.ArrayList;
21  import java.util.Arrays;
22  import java.util.List;
23  
24  import org.apache.hadoop.hbase.Cell;
25  import org.apache.hadoop.hbase.KeyValueUtil;
26  import org.apache.hadoop.hbase.classification.InterfaceAudience;
27  import org.apache.hadoop.hbase.classification.InterfaceStability;
28  import org.apache.hadoop.hbase.exceptions.DeserializationException;
29  import org.apache.hadoop.hbase.protobuf.generated.FilterProtos;
30  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.BytesBytesPair;
31  import org.apache.hadoop.hbase.util.ByteStringer;
32  import org.apache.hadoop.hbase.util.Bytes;
33  import org.apache.hadoop.hbase.util.Pair;
34  
35  import com.google.common.annotations.VisibleForTesting;
36  import com.google.protobuf.InvalidProtocolBufferException;
37  
38  /**
39   * Filters data based on fuzzy row key. Performs fast-forwards during scanning.
40   * It takes pairs (row key, fuzzy info) to match row keys. Where fuzzy info is
41   * a byte array with 0 or 1 as its values:
42   * <ul>
43   *   <li>
44   *     0 - means that this byte in provided row key is fixed, i.e. row key's byte at same position
45   *         must match
46   *   </li>
47   *   <li>
48   *     1 - means that this byte in provided row key is NOT fixed, i.e. row key's byte at this
49   *         position can be different from the one in provided row key
50   *   </li>
51   * </ul>
52   *
53   *
54   * Example:
55   * Let's assume row key format is userId_actionId_year_month. Length of userId is fixed
56   * and is 4, length of actionId is 2 and year and month are 4 and 2 bytes long respectively.
57   *
58   * Let's assume that we need to fetch all users that performed certain action (encoded as "99")
59   * in Jan of any year. Then the pair (row key, fuzzy info) would be the following:
60   * row key = "????_99_????_01" (one can use any value instead of "?")
61   * fuzzy info = "\x01\x01\x01\x01\x00\x00\x00\x00\x01\x01\x01\x01\x00\x00\x00"
62   *
63   * I.e. fuzzy info tells the matching mask is "????_99_????_01", where at ? can be any value.
64   *
65   */
66  @InterfaceAudience.Public
67  @InterfaceStability.Evolving
68  public class FuzzyRowFilter extends FilterBase {
69    private List<Pair<byte[], byte[]>> fuzzyKeysData;
70    private boolean done = false;
71  
72    public FuzzyRowFilter(List<Pair<byte[], byte[]>> fuzzyKeysData) {
73      Pair<byte[], byte[]> p;
74      for (int i = 0; i < fuzzyKeysData.size(); i++) {
75        p = fuzzyKeysData.get(i);
76        if (p.getFirst().length != p.getSecond().length) {
77          Pair<String, String> readable = new Pair<String, String>(
78            Bytes.toStringBinary(p.getFirst()),
79            Bytes.toStringBinary(p.getSecond()));
80          throw new IllegalArgumentException("Fuzzy pair lengths do not match: " + readable);
81        }
82      }
83      this.fuzzyKeysData = fuzzyKeysData;
84    }
85  
86    // TODO: possible improvement: save which fuzzy row key to use when providing a hint
87    @Override
88    public ReturnCode filterKeyValue(Cell c) {
89      // assigning "worst" result first and looking for better options
90      SatisfiesCode bestOption = SatisfiesCode.NO_NEXT;
91      for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
92        SatisfiesCode satisfiesCode = satisfies(isReversed(), c.getRowArray(), c.getRowOffset(),
93          c.getRowLength(), fuzzyData.getFirst(), fuzzyData.getSecond());
94        if (satisfiesCode == SatisfiesCode.YES) {
95          return ReturnCode.INCLUDE;
96        }
97  
98        if (satisfiesCode == SatisfiesCode.NEXT_EXISTS) {
99          bestOption = SatisfiesCode.NEXT_EXISTS;
100       }
101     }
102 
103     if (bestOption == SatisfiesCode.NEXT_EXISTS) {
104       return ReturnCode.SEEK_NEXT_USING_HINT;
105     }
106 
107     // the only unhandled SatisfiesCode is NO_NEXT, i.e. we are done
108     done = true;
109     return ReturnCode.NEXT_ROW;
110   }
111 
112   @Override
113   public Cell getNextCellHint(Cell currentCell) {
114     byte[] nextRowKey = null;
115     // Searching for the "smallest" row key that satisfies at least one fuzzy row key
116     for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
117       byte[] nextRowKeyCandidate = getNextForFuzzyRule(isReversed(), currentCell.getRowArray(),
118           currentCell.getRowOffset(), currentCell.getRowLength(), fuzzyData.getFirst(),
119         fuzzyData.getSecond());
120       if (nextRowKeyCandidate == null) {
121         continue;
122       }
123       if (nextRowKey == null ||
124         (reversed && Bytes.compareTo(nextRowKeyCandidate, nextRowKey) > 0) ||
125         (!reversed && Bytes.compareTo(nextRowKeyCandidate, nextRowKey) < 0)) {
126         nextRowKey = nextRowKeyCandidate;
127       }
128     }
129 
130     if (!reversed && nextRowKey == null) {
131       // Should never happen for forward scanners; logic in filterKeyValue should return NO_NEXT.
132       // Can happen in reversed scanner when currentKV is just before the next possible match; in
133       // this case, fall back on scanner simply calling KeyValueHeap.next()
134       // TODO: is there a better way than throw exception? (stop the scanner?)
135       throw new IllegalStateException("No next row key that satisfies fuzzy exists when"
136           + " getNextKeyHint() is invoked." + " Filter: " + this.toString() + " currentKV: "
137           + currentCell);
138     }
139 
140     return nextRowKey == null ? null : KeyValueUtil.createFirstOnRow(nextRowKey);
141   }
142 
143   @Override
144   public boolean filterAllRemaining() {
145     return done;
146   }
147 
148   /**
149    * @return The filter serialized using pb
150    */
151   public byte [] toByteArray() {
152     FilterProtos.FuzzyRowFilter.Builder builder =
153       FilterProtos.FuzzyRowFilter.newBuilder();
154     for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
155       BytesBytesPair.Builder bbpBuilder = BytesBytesPair.newBuilder();
156       bbpBuilder.setFirst(ByteStringer.wrap(fuzzyData.getFirst()));
157       bbpBuilder.setSecond(ByteStringer.wrap(fuzzyData.getSecond()));
158       builder.addFuzzyKeysData(bbpBuilder);
159     }
160     return builder.build().toByteArray();
161   }
162 
163   /**
164    * @param pbBytes A pb serialized {@link FuzzyRowFilter} instance
165    * @return An instance of {@link FuzzyRowFilter} made from <code>bytes</code>
166    * @throws DeserializationException
167    * @see #toByteArray
168    */
169   public static FuzzyRowFilter parseFrom(final byte [] pbBytes)
170   throws DeserializationException {
171     FilterProtos.FuzzyRowFilter proto;
172     try {
173       proto = FilterProtos.FuzzyRowFilter.parseFrom(pbBytes);
174     } catch (InvalidProtocolBufferException e) {
175       throw new DeserializationException(e);
176     }
177     int count = proto.getFuzzyKeysDataCount();
178     ArrayList<Pair<byte[], byte[]>> fuzzyKeysData= new ArrayList<Pair<byte[], byte[]>>(count);
179     for (int i = 0; i < count; ++i) {
180       BytesBytesPair current = proto.getFuzzyKeysData(i);
181       byte[] keyBytes = current.getFirst().toByteArray();
182       byte[] keyMeta = current.getSecond().toByteArray();
183       fuzzyKeysData.add(new Pair<byte[], byte[]>(keyBytes, keyMeta));
184     }
185     return new FuzzyRowFilter(fuzzyKeysData);
186   }
187 
188   @Override
189   public String toString() {
190     final StringBuilder sb = new StringBuilder();
191     sb.append("FuzzyRowFilter");
192     sb.append("{fuzzyKeysData=");
193     for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
194       sb.append('{').append(Bytes.toStringBinary(fuzzyData.getFirst())).append(":");
195       sb.append(Bytes.toStringBinary(fuzzyData.getSecond())).append('}');
196     }
197     sb.append("}, ");
198     return sb.toString();
199   }
200 
201   // Utility methods
202 
203   static enum SatisfiesCode {
204     /** row satisfies fuzzy rule */
205     YES,
206     /** row doesn't satisfy fuzzy rule, but there's possible greater row that does */
207     NEXT_EXISTS,
208     /** row doesn't satisfy fuzzy rule and there's no greater row that does */
209     NO_NEXT
210   }
211 
212   @VisibleForTesting
213   static SatisfiesCode satisfies(byte[] row, byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
214     return satisfies(false, row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
215   }
216 
217   @VisibleForTesting
218   static SatisfiesCode satisfies(boolean reverse, byte[] row, byte[] fuzzyKeyBytes,
219                                  byte[] fuzzyKeyMeta) {
220     return satisfies(reverse, row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
221   }
222 
223   private static SatisfiesCode satisfies(boolean reverse, byte[] row, int offset, int length,
224                                          byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
225     if (row == null) {
226       // do nothing, let scan to proceed
227       return SatisfiesCode.YES;
228     }
229 
230     Order order = Order.orderFor(reverse);
231     boolean nextRowKeyCandidateExists = false;
232 
233     for (int i = 0; i < fuzzyKeyMeta.length && i < length; i++) {
234       // First, checking if this position is fixed and not equals the given one
235       boolean byteAtPositionFixed = fuzzyKeyMeta[i] == 0;
236       boolean fixedByteIncorrect = byteAtPositionFixed && fuzzyKeyBytes[i] != row[i + offset];
237       if (fixedByteIncorrect) {
238         // in this case there's another row that satisfies fuzzy rule and bigger than this row
239         if (nextRowKeyCandidateExists) {
240           return SatisfiesCode.NEXT_EXISTS;
241         }
242 
243         // If this row byte is less than fixed then there's a byte array bigger than
244         // this row and which satisfies the fuzzy rule. Otherwise there's no such byte array:
245         // this row is simply bigger than any byte array that satisfies the fuzzy rule
246         boolean rowByteLessThanFixed = (row[i + offset] & 0xFF) < (fuzzyKeyBytes[i] & 0xFF);
247         if (rowByteLessThanFixed && !reverse) {
248           return SatisfiesCode.NEXT_EXISTS;
249         } else if (!rowByteLessThanFixed && reverse) {
250           return SatisfiesCode.NEXT_EXISTS;
251         } else {
252           return SatisfiesCode.NO_NEXT;
253         }
254       }
255 
256       // Second, checking if this position is not fixed and byte value is not the biggest. In this
257       // case there's a byte array bigger than this row and which satisfies the fuzzy rule. To get
258       // bigger byte array that satisfies the rule we need to just increase this byte
259       // (see the code of getNextForFuzzyRule below) by one.
260       // Note: if non-fixed byte is already at biggest value, this doesn't allow us to say there's
261       //       bigger one that satisfies the rule as it can't be increased.
262       if (fuzzyKeyMeta[i] == 1 && !order.isMax(fuzzyKeyBytes[i])) {
263         nextRowKeyCandidateExists = true;
264       }
265     }
266 
267     return SatisfiesCode.YES;
268   }
269 
270   @VisibleForTesting
271   static byte[] getNextForFuzzyRule(byte[] row, byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
272     return getNextForFuzzyRule(false, row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
273   }
274 
275   @VisibleForTesting
276   static byte[] getNextForFuzzyRule(boolean reverse, byte[] row, byte[] fuzzyKeyBytes,
277                                     byte[] fuzzyKeyMeta) {
278     return getNextForFuzzyRule(reverse, row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
279   }
280 
281   /** Abstracts directional comparisons based on scan direction. */
282   private enum Order {
283     ASC {
284       public boolean lt(int lhs, int rhs) {
285         return lhs < rhs;
286       }
287       public boolean gt(int lhs, int rhs) {
288         return lhs > rhs;
289       }
290       public byte inc(byte val) {
291         // TODO: what about over/underflow?
292         return (byte) (val + 1);
293       }
294       public boolean isMax(byte val) {
295         return val == (byte) 0xff;
296       }
297       public byte min() {
298         return 0;
299       }
300     },
301     DESC {
302       public boolean lt(int lhs, int rhs) {
303         return lhs > rhs;
304       }
305       public boolean gt(int lhs, int rhs) {
306         return lhs < rhs;
307       }
308       public byte inc(byte val) {
309         // TODO: what about over/underflow?
310         return (byte) (val - 1);
311       }
312       public boolean isMax(byte val) {
313         return val == 0;
314       }
315       public byte min() {
316         return (byte) 0xFF;
317       }
318     };
319 
320     public static Order orderFor(boolean reverse) {
321       return reverse ? DESC : ASC;
322     }
323 
324     /** Returns true when {@code lhs < rhs}. */
325     public abstract boolean lt(int lhs, int rhs);
326     /** Returns true when {@code lhs > rhs}. */
327     public abstract boolean gt(int lhs, int rhs);
328     /** Returns {@code val} incremented by 1. */
329     public abstract byte inc(byte val);
330     /** Return true when {@code val} is the maximum value */
331     public abstract boolean isMax(byte val);
332     /** Return the minimum value according to this ordering scheme. */
333     public abstract byte min();
334   }
335 
336   /**
337    * @return greater byte array than given (row) which satisfies the fuzzy rule if it exists,
338    *         null otherwise
339    */
340   private static byte[] getNextForFuzzyRule(boolean reverse, byte[] row, int offset, int length,
341                                             byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
342     // To find out the next "smallest" byte array that satisfies fuzzy rule and "greater" than
343     // the given one we do the following:
344     // 1. setting values on all "fixed" positions to the values from fuzzyKeyBytes
345     // 2. if during the first step given row did not increase, then we increase the value at
346     //    the first "non-fixed" position (where it is not maximum already)
347 
348     // It is easier to perform this by using fuzzyKeyBytes copy and setting "non-fixed" position
349     // values than otherwise.
350     byte[] result = Arrays.copyOf(fuzzyKeyBytes,
351                                   length > fuzzyKeyBytes.length ? length : fuzzyKeyBytes.length);
352     if (reverse && length > fuzzyKeyBytes.length) {
353       // we need trailing 0xff's instead of trailing 0x00's
354       for (int i = fuzzyKeyBytes.length; i < result.length; i++) {
355         result[i] = (byte) 0xFF;
356       }
357     }
358     int toInc = -1;
359     final Order order = Order.orderFor(reverse);
360 
361     boolean increased = false;
362     for (int i = 0; i < result.length; i++) {
363       if (i >= fuzzyKeyMeta.length || fuzzyKeyMeta[i] == 1) {
364         result[i] = row[offset + i];
365         if (!order.isMax(row[i])) {
366           // this is "non-fixed" position and is not at max value, hence we can increase it
367           toInc = i;
368         }
369       } else if (i < fuzzyKeyMeta.length && fuzzyKeyMeta[i] == 0) {
370         if (order.lt((row[i + offset] & 0xFF), (fuzzyKeyBytes[i] & 0xFF))) {
371           // if setting value for any fixed position increased the original array,
372           // we are OK
373           increased = true;
374           break;
375         }
376 
377         if (order.gt((row[i + offset] & 0xFF), (fuzzyKeyBytes[i] & 0xFF))) {
378           // if setting value for any fixed position makes array "smaller", then just stop:
379           // in case we found some non-fixed position to increase we will do it, otherwise
380           // there's no "next" row key that satisfies fuzzy rule and "greater" than given row
381           break;
382         }
383       }
384     }
385 
386     if (!increased) {
387       if (toInc < 0) {
388         return null;
389       }
390       result[toInc] = order.inc(result[toInc]);
391 
392       // Setting all "non-fixed" positions to zeroes to the right of the one we increased so
393       // that found "next" row key is the smallest possible
394       for (int i = toInc + 1; i < result.length; i++) {
395         if (i >= fuzzyKeyMeta.length || fuzzyKeyMeta[i] == 1) {
396           result[i] = order.min();
397         }
398       }
399     }
400 
401     return result;
402   }
403 
404   /**
405    * @return true if and only if the fields of the filter that are serialized
406    * are equal to the corresponding fields in other.  Used for testing.
407    */
408   boolean areSerializedFieldsEqual(Filter o) {
409     if (o == this) return true;
410     if (!(o instanceof FuzzyRowFilter)) return false;
411 
412     FuzzyRowFilter other = (FuzzyRowFilter)o;
413     if (this.fuzzyKeysData.size() != other.fuzzyKeysData.size()) return false;
414     for (int i = 0; i < fuzzyKeysData.size(); ++i) {
415       Pair<byte[], byte[]> thisData = this.fuzzyKeysData.get(i);
416       Pair<byte[], byte[]> otherData = other.fuzzyKeysData.get(i);
417       if (!(Bytes.equals(thisData.getFirst(), otherData.getFirst())
418         && Bytes.equals(thisData.getSecond(), otherData.getSecond()))) {
419         return false;
420       }
421     }
422     return true;
423   }
424 }