View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.filter;
19  
20  import java.util.ArrayList;
21  import java.util.Arrays;
22  import java.util.List;
23  
24  import org.apache.hadoop.classification.InterfaceAudience;
25  import org.apache.hadoop.classification.InterfaceStability;
26  import org.apache.hadoop.hbase.Cell;
27  import org.apache.hadoop.hbase.KeyValueUtil;
28  import org.apache.hadoop.hbase.exceptions.DeserializationException;
29  import org.apache.hadoop.hbase.protobuf.generated.FilterProtos;
30  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.BytesBytesPair;
31  import org.apache.hadoop.hbase.util.ByteStringer;
32  import org.apache.hadoop.hbase.util.Bytes;
33  import org.apache.hadoop.hbase.util.Pair;
34  
35  import com.google.protobuf.InvalidProtocolBufferException;
36  
37  /**
38   * Filters data based on fuzzy row key. Performs fast-forwards during scanning.
39   * It takes pairs (row key, fuzzy info) to match row keys. Where fuzzy info is
40   * a byte array with 0 or 1 as its values:
41   * <ul>
42   *   <li>
43   *     0 - means that this byte in provided row key is fixed, i.e. row key's byte at same position
44   *         must match
45   *   </li>
46   *   <li>
47   *     1 - means that this byte in provided row key is NOT fixed, i.e. row key's byte at this
48   *         position can be different from the one in provided row key
49   *   </li>
50   * </ul>
51   *
52   *
53   * Example:
54   * Let's assume row key format is userId_actionId_year_month. Length of userId is fixed
55   * and is 4, length of actionId is 2 and year and month are 4 and 2 bytes long respectively.
56   *
57   * Let's assume that we need to fetch all users that performed certain action (encoded as "99")
58   * in Jan of any year. Then the pair (row key, fuzzy info) would be the following:
59   * row key = "????_99_????_01" (one can use any value instead of "?")
60   * fuzzy info = "\x01\x01\x01\x01\x00\x00\x00\x00\x01\x01\x01\x01\x00\x00\x00"
61   *
62   * I.e. fuzzy info tells the matching mask is "????_99_????_01", where at ? can be any value.
63   *
64   */
65  @InterfaceAudience.Public
66  @InterfaceStability.Evolving
67  public class FuzzyRowFilter extends FilterBase {
68    private List<Pair<byte[], byte[]>> fuzzyKeysData;
69    private boolean done = false;
70  
71    public FuzzyRowFilter(List<Pair<byte[], byte[]>> fuzzyKeysData) {
72      this.fuzzyKeysData = fuzzyKeysData;
73    }
74  
75    // TODO: possible improvement: save which fuzzy row key to use when providing a hint
76    @Override
77    public ReturnCode filterKeyValue(Cell c) {
78      // assigning "worst" result first and looking for better options
79      SatisfiesCode bestOption = SatisfiesCode.NO_NEXT;
80      for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
81        SatisfiesCode satisfiesCode = satisfies(c.getRowArray(), c.getRowOffset(),
82            c.getRowLength(), fuzzyData.getFirst(), fuzzyData.getSecond());
83        if (satisfiesCode == SatisfiesCode.YES) {
84          return ReturnCode.INCLUDE;
85        }
86  
87        if (satisfiesCode == SatisfiesCode.NEXT_EXISTS) {
88          bestOption = SatisfiesCode.NEXT_EXISTS;
89        }
90      }
91  
92      if (bestOption == SatisfiesCode.NEXT_EXISTS) {
93        return ReturnCode.SEEK_NEXT_USING_HINT;
94      }
95  
96      // the only unhandled SatisfiesCode is NO_NEXT, i.e. we are done
97      done = true;
98      return ReturnCode.NEXT_ROW;
99    }
100 
101   @Override
102   public Cell getNextCellHint(Cell currentCell) {
103     byte[] nextRowKey = null;
104     // Searching for the "smallest" row key that satisfies at least one fuzzy row key
105     for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
106       byte[] nextRowKeyCandidate = getNextForFuzzyRule(currentCell.getRowArray(),
107           currentCell.getRowOffset(), currentCell.getRowLength(), fuzzyData.getFirst(),
108           fuzzyData.getSecond());
109       if (nextRowKeyCandidate == null) {
110         continue;
111       }
112       if (nextRowKey == null || Bytes.compareTo(nextRowKeyCandidate, nextRowKey) < 0) {
113         nextRowKey = nextRowKeyCandidate;
114       }
115     }
116 
117     if (nextRowKey == null) {
118       // SHOULD NEVER happen
119       // TODO: is there a better way than throw exception? (stop the scanner?)
120       throw new IllegalStateException("No next row key that satisfies fuzzy exists when"
121           + " getNextKeyHint() is invoked." + " Filter: " + this.toString() + " currentKV: "
122           + KeyValueUtil.ensureKeyValue(currentCell).toString());
123     }
124 
125     return KeyValueUtil.createFirstOnRow(nextRowKey);
126   }
127 
128   @Override
129   public boolean filterAllRemaining() {
130     return done;
131   }
132 
133   /**
134    * @return The filter serialized using pb
135    */
136   public byte [] toByteArray() {
137     FilterProtos.FuzzyRowFilter.Builder builder =
138       FilterProtos.FuzzyRowFilter.newBuilder();
139     for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
140       BytesBytesPair.Builder bbpBuilder = BytesBytesPair.newBuilder();
141       bbpBuilder.setFirst(ByteStringer.wrap(fuzzyData.getFirst()));
142       bbpBuilder.setSecond(ByteStringer.wrap(fuzzyData.getSecond()));
143       builder.addFuzzyKeysData(bbpBuilder);
144     }
145     return builder.build().toByteArray();
146   }
147 
148   /**
149    * @param pbBytes A pb serialized {@link FuzzyRowFilter} instance
150    * @return An instance of {@link FuzzyRowFilter} made from <code>bytes</code>
151    * @throws DeserializationException
152    * @see #toByteArray
153    */
154   public static FuzzyRowFilter parseFrom(final byte [] pbBytes)
155   throws DeserializationException {
156     FilterProtos.FuzzyRowFilter proto;
157     try {
158       proto = FilterProtos.FuzzyRowFilter.parseFrom(pbBytes);
159     } catch (InvalidProtocolBufferException e) {
160       throw new DeserializationException(e);
161     }
162     int count = proto.getFuzzyKeysDataCount();
163     ArrayList<Pair<byte[], byte[]>> fuzzyKeysData= new ArrayList<Pair<byte[], byte[]>>(count);
164     for (int i = 0; i < count; ++i) {
165       BytesBytesPair current = proto.getFuzzyKeysData(i);
166       byte[] keyBytes = current.getFirst().toByteArray();
167       byte[] keyMeta = current.getSecond().toByteArray();
168       fuzzyKeysData.add(new Pair<byte[], byte[]>(keyBytes, keyMeta));
169     }
170     return new FuzzyRowFilter(fuzzyKeysData);
171   }
172 
173   @Override
174   public String toString() {
175     final StringBuilder sb = new StringBuilder();
176     sb.append("FuzzyRowFilter");
177     sb.append("{fuzzyKeysData=");
178     for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
179       sb.append('{').append(Bytes.toStringBinary(fuzzyData.getFirst())).append(":");
180       sb.append(Bytes.toStringBinary(fuzzyData.getSecond())).append('}');
181     }
182     sb.append("}, ");
183     return sb.toString();
184   }
185 
186   // Utility methods
187 
188   static enum SatisfiesCode {
189     // row satisfies fuzzy rule
190     YES,
191     // row doesn't satisfy fuzzy rule, but there's possible greater row that does
192     NEXT_EXISTS,
193     // row doesn't satisfy fuzzy rule and there's no greater row that does
194     NO_NEXT
195   }
196 
197   static SatisfiesCode satisfies(byte[] row,
198                                          byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
199     return satisfies(row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
200   }
201 
202   private static SatisfiesCode satisfies(byte[] row, int offset, int length,
203                                          byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
204     if (row == null) {
205       // do nothing, let scan to proceed
206       return SatisfiesCode.YES;
207     }
208 
209     boolean nextRowKeyCandidateExists = false;
210 
211     for (int i = 0; i < fuzzyKeyMeta.length && i < length; i++) {
212       // First, checking if this position is fixed and not equals the given one
213       boolean byteAtPositionFixed = fuzzyKeyMeta[i] == 0;
214       boolean fixedByteIncorrect = byteAtPositionFixed && fuzzyKeyBytes[i] != row[i + offset];
215       if (fixedByteIncorrect) {
216         // in this case there's another row that satisfies fuzzy rule and bigger than this row
217         if (nextRowKeyCandidateExists) {
218           return SatisfiesCode.NEXT_EXISTS;
219         }
220 
221         // If this row byte is less than fixed then there's a byte array bigger than
222         // this row and which satisfies the fuzzy rule. Otherwise there's no such byte array:
223         // this row is simply bigger than any byte array that satisfies the fuzzy rule
224         boolean rowByteLessThanFixed = (row[i + offset] & 0xFF) < (fuzzyKeyBytes[i] & 0xFF);
225         return  rowByteLessThanFixed ? SatisfiesCode.NEXT_EXISTS : SatisfiesCode.NO_NEXT;
226       }
227 
228       // Second, checking if this position is not fixed and byte value is not the biggest. In this
229       // case there's a byte array bigger than this row and which satisfies the fuzzy rule. To get
230       // bigger byte array that satisfies the rule we need to just increase this byte
231       // (see the code of getNextForFuzzyRule below) by one.
232       // Note: if non-fixed byte is already at biggest value, this doesn't allow us to say there's
233       //       bigger one that satisfies the rule as it can't be increased.
234       if (fuzzyKeyMeta[i] == 1 && !isMax(fuzzyKeyBytes[i])) {
235         nextRowKeyCandidateExists = true;
236       }
237     }
238 
239     return SatisfiesCode.YES;
240   }
241 
242   private static boolean isMax(byte fuzzyKeyByte) {
243     return (fuzzyKeyByte & 0xFF) == 255;
244   }
245 
246   static byte[] getNextForFuzzyRule(byte[] row, byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
247     return getNextForFuzzyRule(row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
248   }
249 
250   /**
251    * @return greater byte array than given (row) which satisfies the fuzzy rule if it exists,
252    *         null otherwise
253    */
254   private static byte[] getNextForFuzzyRule(byte[] row, int offset, int length,
255                                             byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
256     // To find out the next "smallest" byte array that satisfies fuzzy rule and "greater" than
257     // the given one we do the following:
258     // 1. setting values on all "fixed" positions to the values from fuzzyKeyBytes
259     // 2. if during the first step given row did not increase, then we increase the value at
260     //    the first "non-fixed" position (where it is not maximum already)
261 
262     // It is easier to perform this by using fuzzyKeyBytes copy and setting "non-fixed" position
263     // values than otherwise.
264     byte[] result = Arrays.copyOf(fuzzyKeyBytes,
265                                   length > fuzzyKeyBytes.length ? length : fuzzyKeyBytes.length);
266     int toInc = -1;
267 
268     boolean increased = false;
269     for (int i = 0; i < result.length; i++) {
270       if (i >= fuzzyKeyMeta.length || fuzzyKeyMeta[i] == 1) {
271         result[i] = row[offset + i];
272         if (!isMax(row[i])) {
273           // this is "non-fixed" position and is not at max value, hence we can increase it
274           toInc = i;
275         }
276       } else if (i < fuzzyKeyMeta.length && fuzzyKeyMeta[i] == 0) {
277         if ((row[i + offset] & 0xFF) < (fuzzyKeyBytes[i] & 0xFF)) {
278           // if setting value for any fixed position increased the original array,
279           // we are OK
280           increased = true;
281           break;
282         }
283         if ((row[i + offset] & 0xFF) > (fuzzyKeyBytes[i] & 0xFF)) {
284           // if setting value for any fixed position makes array "smaller", then just stop:
285           // in case we found some non-fixed position to increase we will do it, otherwise
286           // there's no "next" row key that satisfies fuzzy rule and "greater" than given row
287           break;
288         }
289       }
290     }
291 
292     if (!increased) {
293       if (toInc < 0) {
294         return null;
295       }
296       result[toInc]++;
297 
298       // Setting all "non-fixed" positions to zeroes to the right of the one we increased so
299       // that found "next" row key is the smallest possible
300       for (int i = toInc + 1; i < result.length; i++) {
301         if (i >= fuzzyKeyMeta.length || fuzzyKeyMeta[i] == 1) {
302           result[i] = 0;
303         }
304       }
305     }
306 
307     return result;
308   }
309 
310   /**
311    * @param other
312    * @return true if and only if the fields of the filter that are serialized
313    * are equal to the corresponding fields in other.  Used for testing.
314    */
315   boolean areSerializedFieldsEqual(Filter o) {
316     if (o == this) return true;
317     if (!(o instanceof FuzzyRowFilter)) return false;
318 
319     FuzzyRowFilter other = (FuzzyRowFilter)o;
320     if (this.fuzzyKeysData.size() != other.fuzzyKeysData.size()) return false;
321     for (int i = 0; i < fuzzyKeysData.size(); ++i) {
322       Pair<byte[], byte[]> thisData = this.fuzzyKeysData.get(i);
323       Pair<byte[], byte[]> otherData = other.fuzzyKeysData.get(i);
324       if (!(Bytes.equals(thisData.getFirst(), otherData.getFirst())
325         && Bytes.equals(thisData.getSecond(), otherData.getSecond()))) {
326         return false;
327       }
328     }
329     return true;
330   }
331 
332 }