View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.filter;
19  
20  import java.util.ArrayList;
21  import java.util.Arrays;
22  import java.util.List;
23  
24  import org.apache.hadoop.classification.InterfaceAudience;
25  import org.apache.hadoop.classification.InterfaceStability;
26  import org.apache.hadoop.hbase.Cell;
27  import org.apache.hadoop.hbase.KeyValue;
28  import org.apache.hadoop.hbase.KeyValueUtil;
29  import org.apache.hadoop.hbase.exceptions.DeserializationException;
30  import org.apache.hadoop.hbase.protobuf.generated.FilterProtos;
31  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.BytesBytesPair;
32  import org.apache.hadoop.hbase.util.Bytes;
33  import org.apache.hadoop.hbase.util.Pair;
34  
35  import com.google.protobuf.HBaseZeroCopyByteString;
36  import com.google.protobuf.InvalidProtocolBufferException;
37  
38  /**
39   * Filters data based on fuzzy row key. Performs fast-forwards during scanning.
40   * It takes pairs (row key, fuzzy info) to match row keys. Where fuzzy info is
41   * a byte array with 0 or 1 as its values:
42   * <ul>
43   *   <li>
44   *     0 - means that this byte in provided row key is fixed, i.e. row key's byte at same position
45   *         must match
46   *   </li>
47   *   <li>
48   *     1 - means that this byte in provided row key is NOT fixed, i.e. row key's byte at this
49   *         position can be different from the one in provided row key
50   *   </li>
51   * </ul>
52   *
53   *
54   * Example:
55   * Let's assume row key format is userId_actionId_year_month. Length of userId is fixed
56   * and is 4, length of actionId is 2 and year and month are 4 and 2 bytes long respectively.
57   *
58   * Let's assume that we need to fetch all users that performed certain action (encoded as "99")
59   * in Jan of any year. Then the pair (row key, fuzzy info) would be the following:
60   * row key = "????_99_????_01" (one can use any value instead of "?")
61   * fuzzy info = "\x01\x01\x01\x01\x00\x00\x00\x00\x01\x01\x01\x01\x00\x00\x00"
62   *
63   * I.e. fuzzy info tells the matching mask is "????_99_????_01", where at ? can be any value.
64   *
65   */
66  @InterfaceAudience.Public
67  @InterfaceStability.Evolving
68  public class FuzzyRowFilter extends FilterBase {
69    private List<Pair<byte[], byte[]>> fuzzyKeysData;
70    private boolean done = false;
71  
72    public FuzzyRowFilter(List<Pair<byte[], byte[]>> fuzzyKeysData) {
73      this.fuzzyKeysData = fuzzyKeysData;
74    }
75  
76    // TODO: possible improvement: save which fuzzy row key to use when providing a hint
77    @Override
78    public ReturnCode filterKeyValue(Cell kv) {
79      // TODO add getRow() equivalent to Cell or change satisfies to take b[],o,l style args.
80      KeyValue v = KeyValueUtil.ensureKeyValue(kv);
81  
82      byte[] rowKey = v.getRow();
83      // assigning "worst" result first and looking for better options
84      SatisfiesCode bestOption = SatisfiesCode.NO_NEXT;
85      for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
86        SatisfiesCode satisfiesCode =
87                satisfies(rowKey, fuzzyData.getFirst(), fuzzyData.getSecond());
88        if (satisfiesCode == SatisfiesCode.YES) {
89          return ReturnCode.INCLUDE;
90        }
91  
92        if (satisfiesCode == SatisfiesCode.NEXT_EXISTS) {
93          bestOption = SatisfiesCode.NEXT_EXISTS;
94        }
95      }
96  
97      if (bestOption == SatisfiesCode.NEXT_EXISTS) {
98        return ReturnCode.SEEK_NEXT_USING_HINT;
99      }
100 
101     // the only unhandled SatisfiesCode is NO_NEXT, i.e. we are done
102     done = true;
103     return ReturnCode.NEXT_ROW;
104   }
105 
106   @Override
107   public Cell getNextCellHint(Cell currentKV) {
108     // TODO make matching Column a cell method or CellUtil method.
109     KeyValue v = KeyValueUtil.ensureKeyValue(currentKV);
110 
111     byte[] rowKey = v.getRow();
112     byte[] nextRowKey = null;
113     // Searching for the "smallest" row key that satisfies at least one fuzzy row key
114     for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
115       byte[] nextRowKeyCandidate = getNextForFuzzyRule(rowKey,
116               fuzzyData.getFirst(), fuzzyData.getSecond());
117       if (nextRowKeyCandidate == null) {
118         continue;
119       }
120       if (nextRowKey == null || Bytes.compareTo(nextRowKeyCandidate, nextRowKey) < 0) {
121         nextRowKey = nextRowKeyCandidate;
122       }
123     }
124 
125     if (nextRowKey == null) {
126       // SHOULD NEVER happen
127       // TODO: is there a better way than throw exception? (stop the scanner?)
128       throw new IllegalStateException("No next row key that satisfies fuzzy exists when" +
129                                          " getNextKeyHint() is invoked." +
130                                          " Filter: " + this.toString() +
131                                          " currentKV: " + currentKV.toString());
132     }
133 
134     return KeyValueUtil.createFirstOnRow(nextRowKey);
135   }
136 
137   @Override
138   public boolean filterAllRemaining() {
139     return done;
140   }
141 
142   /**
143    * @return The filter serialized using pb
144    */
145   public byte [] toByteArray() {
146     FilterProtos.FuzzyRowFilter.Builder builder =
147       FilterProtos.FuzzyRowFilter.newBuilder();
148     for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
149       BytesBytesPair.Builder bbpBuilder = BytesBytesPair.newBuilder();
150       bbpBuilder.setFirst(HBaseZeroCopyByteString.wrap(fuzzyData.getFirst()));
151       bbpBuilder.setSecond(HBaseZeroCopyByteString.wrap(fuzzyData.getSecond()));
152       builder.addFuzzyKeysData(bbpBuilder);
153     }
154     return builder.build().toByteArray();
155   }
156 
157   /**
158    * @param pbBytes A pb serialized {@link FuzzyRowFilter} instance
159    * @return An instance of {@link FuzzyRowFilter} made from <code>bytes</code>
160    * @throws DeserializationException
161    * @see #toByteArray
162    */
163   public static FuzzyRowFilter parseFrom(final byte [] pbBytes)
164   throws DeserializationException {
165     FilterProtos.FuzzyRowFilter proto;
166     try {
167       proto = FilterProtos.FuzzyRowFilter.parseFrom(pbBytes);
168     } catch (InvalidProtocolBufferException e) {
169       throw new DeserializationException(e);
170     }
171     int count = proto.getFuzzyKeysDataCount();
172     ArrayList<Pair<byte[], byte[]>> fuzzyKeysData= new ArrayList<Pair<byte[], byte[]>>(count);
173     for (int i = 0; i < count; ++i) {
174       BytesBytesPair current = proto.getFuzzyKeysData(i);
175       byte[] keyBytes = current.getFirst().toByteArray();
176       byte[] keyMeta = current.getSecond().toByteArray();
177       fuzzyKeysData.add(new Pair<byte[], byte[]>(keyBytes, keyMeta));
178     }
179     return new FuzzyRowFilter(fuzzyKeysData);
180   }
181 
182   @Override
183   public String toString() {
184     final StringBuilder sb = new StringBuilder();
185     sb.append("FuzzyRowFilter");
186     sb.append("{fuzzyKeysData=");
187     for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
188       sb.append('{').append(Bytes.toStringBinary(fuzzyData.getFirst())).append(":");
189       sb.append(Bytes.toStringBinary(fuzzyData.getSecond())).append('}');
190     }
191     sb.append("}, ");
192     return sb.toString();
193   }
194 
195   // Utility methods
196 
197   static enum SatisfiesCode {
198     // row satisfies fuzzy rule
199     YES,
200     // row doesn't satisfy fuzzy rule, but there's possible greater row that does
201     NEXT_EXISTS,
202     // row doesn't satisfy fuzzy rule and there's no greater row that does
203     NO_NEXT
204   }
205 
206   static SatisfiesCode satisfies(byte[] row,
207                                          byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
208     return satisfies(row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
209   }
210 
211   private static SatisfiesCode satisfies(byte[] row, int offset, int length,
212                                          byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
213     if (row == null) {
214       // do nothing, let scan to proceed
215       return SatisfiesCode.YES;
216     }
217 
218     boolean nextRowKeyCandidateExists = false;
219 
220     for (int i = 0; i < fuzzyKeyMeta.length && i < length; i++) {
221       // First, checking if this position is fixed and not equals the given one
222       boolean byteAtPositionFixed = fuzzyKeyMeta[i] == 0;
223       boolean fixedByteIncorrect = byteAtPositionFixed && fuzzyKeyBytes[i] != row[i + offset];
224       if (fixedByteIncorrect) {
225         // in this case there's another row that satisfies fuzzy rule and bigger than this row
226         if (nextRowKeyCandidateExists) {
227           return SatisfiesCode.NEXT_EXISTS;
228         }
229 
230         // If this row byte is less than fixed then there's a byte array bigger than
231         // this row and which satisfies the fuzzy rule. Otherwise there's no such byte array:
232         // this row is simply bigger than any byte array that satisfies the fuzzy rule
233         boolean rowByteLessThanFixed = (row[i + offset] & 0xFF) < (fuzzyKeyBytes[i] & 0xFF);
234         return  rowByteLessThanFixed ? SatisfiesCode.NEXT_EXISTS : SatisfiesCode.NO_NEXT;
235       }
236 
237       // Second, checking if this position is not fixed and byte value is not the biggest. In this
238       // case there's a byte array bigger than this row and which satisfies the fuzzy rule. To get
239       // bigger byte array that satisfies the rule we need to just increase this byte
240       // (see the code of getNextForFuzzyRule below) by one.
241       // Note: if non-fixed byte is already at biggest value, this doesn't allow us to say there's
242       //       bigger one that satisfies the rule as it can't be increased.
243       if (fuzzyKeyMeta[i] == 1 && !isMax(fuzzyKeyBytes[i])) {
244         nextRowKeyCandidateExists = true;
245       }
246     }
247 
248     return SatisfiesCode.YES;
249   }
250 
251   private static boolean isMax(byte fuzzyKeyByte) {
252     return (fuzzyKeyByte & 0xFF) == 255;
253   }
254 
255   static byte[] getNextForFuzzyRule(byte[] row, byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
256     return getNextForFuzzyRule(row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
257   }
258 
259   /**
260    * @return greater byte array than given (row) which satisfies the fuzzy rule if it exists,
261    *         null otherwise
262    */
263   private static byte[] getNextForFuzzyRule(byte[] row, int offset, int length,
264                                             byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
265     // To find out the next "smallest" byte array that satisfies fuzzy rule and "greater" than
266     // the given one we do the following:
267     // 1. setting values on all "fixed" positions to the values from fuzzyKeyBytes
268     // 2. if during the first step given row did not increase, then we increase the value at
269     //    the first "non-fixed" position (where it is not maximum already)
270 
271     // It is easier to perform this by using fuzzyKeyBytes copy and setting "non-fixed" position
272     // values than otherwise.
273     byte[] result = Arrays.copyOf(fuzzyKeyBytes,
274                                   length > fuzzyKeyBytes.length ? length : fuzzyKeyBytes.length);
275     int toInc = -1;
276 
277     boolean increased = false;
278     for (int i = 0; i < result.length; i++) {
279       if (i >= fuzzyKeyMeta.length || fuzzyKeyMeta[i] == 1) {
280         result[i] = row[offset + i];
281         if (!isMax(row[i])) {
282           // this is "non-fixed" position and is not at max value, hence we can increase it
283           toInc = i;
284         }
285       } else if (i < fuzzyKeyMeta.length && fuzzyKeyMeta[i] == 0) {
286         if ((row[i + offset] & 0xFF) < (fuzzyKeyBytes[i] & 0xFF)) {
287           // if setting value for any fixed position increased the original array,
288           // we are OK
289           increased = true;
290           break;
291         }
292         if ((row[i + offset] & 0xFF) > (fuzzyKeyBytes[i] & 0xFF)) {
293           // if setting value for any fixed position makes array "smaller", then just stop:
294           // in case we found some non-fixed position to increase we will do it, otherwise
295           // there's no "next" row key that satisfies fuzzy rule and "greater" than given row
296           break;
297         }
298       }
299     }
300 
301     if (!increased) {
302       if (toInc < 0) {
303         return null;
304       }
305       result[toInc]++;
306 
307       // Setting all "non-fixed" positions to zeroes to the right of the one we increased so
308       // that found "next" row key is the smallest possible
309       for (int i = toInc + 1; i < result.length; i++) {
310         if (i >= fuzzyKeyMeta.length || fuzzyKeyMeta[i] == 1) {
311           result[i] = 0;
312         }
313       }
314     }
315 
316     return result;
317   }
318 
319   /**
320    * @param other
321    * @return true if and only if the fields of the filter that are serialized
322    * are equal to the corresponding fields in other.  Used for testing.
323    */
324   boolean areSerializedFieldsEqual(Filter o) {
325     if (o == this) return true;
326     if (!(o instanceof FuzzyRowFilter)) return false;
327 
328     FuzzyRowFilter other = (FuzzyRowFilter)o;
329     if (this.fuzzyKeysData.size() != other.fuzzyKeysData.size()) return false;
330     for (int i = 0; i < fuzzyKeysData.size(); ++i) {
331       Pair<byte[], byte[]> thisData = this.fuzzyKeysData.get(i);
332       Pair<byte[], byte[]> otherData = other.fuzzyKeysData.get(i);
333       if (!(Bytes.equals(thisData.getFirst(), otherData.getFirst())
334         && Bytes.equals(thisData.getSecond(), otherData.getSecond()))) {
335         return false;
336       }
337     }
338     return true;
339   }
340 
341 }