View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.filter;
19  
20  import java.io.DataInput;
21  import java.io.DataOutput;
22  import java.io.IOException;
23  import java.util.ArrayList;
24  import java.util.Arrays;
25  import java.util.List;
26  
27  import org.apache.hadoop.hbase.KeyValue;
28  import org.apache.hadoop.hbase.util.Bytes;
29  import org.apache.hadoop.hbase.util.Pair;
30  
31  /**
32   * Filters data based on fuzzy row key. Performs fast-forwards during scanning.
33   * It takes pairs (row key, fuzzy info) to match row keys. Where fuzzy info is
34   * a byte array with 0 or 1 as its values:
35   * <ul>
36   *   <li>
37   *     0 - means that this byte in provided row key is fixed, i.e. row key's byte at same position
38   *         must match
39   *   </li>
40   *   <li>
41   *     1 - means that this byte in provided row key is NOT fixed, i.e. row key's byte at this
42   *         position can be different from the one in provided row key
43   *   </li>
44   * </ul>
45   *
46   *
47   * Example:
48   * Let's assume row key format is userId_actionId_year_month. Length of userId is fixed
49   * and is 4, length of actionId is 2 and year and month are 4 and 2 bytes long respectively.
50   *
51   * Let's assume that we need to fetch all users that performed certain action (encoded as "99")
52   * in Jan of any year. Then the pair (row key, fuzzy info) would be the following:
53   * row key = "????_99_????_01" (one can use any value instead of "?")
54   * fuzzy info = "\x01\x01\x01\x01\x00\x00\x00\x00\x01\x01\x01\x01\x00\x00\x00"
55   *
56   * I.e. fuzzy info tells the matching mask is "????_99_????_01", where at ? can be any value.
57   *
58   */
59  public class FuzzyRowFilter extends FilterBase {
60    private List<Pair<byte[], byte[]>> fuzzyKeysData;
61    private boolean done = false;
62  
63    /**
64     * Used internally for reflection, do NOT use it directly
65     */
66    public FuzzyRowFilter() {
67    }
68  
69    public FuzzyRowFilter(List<Pair<byte[], byte[]>> fuzzyKeysData) {
70      this.fuzzyKeysData = fuzzyKeysData;
71    }
72  
73    // TODO: possible improvement: save which fuzzy row key to use when providing a hint
74    @Override
75    public ReturnCode filterKeyValue(KeyValue kv) {
76      byte[] rowKey = kv.getRow();
77      // assigning "worst" result first and looking for better options
78      SatisfiesCode bestOption = SatisfiesCode.NO_NEXT;
79      for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
80        SatisfiesCode satisfiesCode =
81                satisfies(rowKey, fuzzyData.getFirst(), fuzzyData.getSecond());
82        if (satisfiesCode == SatisfiesCode.YES) {
83          return ReturnCode.INCLUDE;
84        }
85  
86        if (satisfiesCode == SatisfiesCode.NEXT_EXISTS) {
87          bestOption = SatisfiesCode.NEXT_EXISTS;
88        }
89      }
90  
91      if (bestOption == SatisfiesCode.NEXT_EXISTS) {
92        return ReturnCode.SEEK_NEXT_USING_HINT;
93      }
94  
95      // the only unhandled SatisfiesCode is NO_NEXT, i.e. we are done
96      done = true;
97      return ReturnCode.NEXT_ROW;
98    }
99  
100   @Override
101   public KeyValue getNextKeyHint(KeyValue currentKV) {
102     byte[] rowKey = currentKV.getRow();
103     byte[] nextRowKey = null;
104     // Searching for the "smallest" row key that satisfies at least one fuzzy row key
105     for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
106       byte[] nextRowKeyCandidate = getNextForFuzzyRule(rowKey,
107               fuzzyData.getFirst(), fuzzyData.getSecond());
108       if (nextRowKeyCandidate == null) {
109         continue;
110       }
111       if (nextRowKey == null || Bytes.compareTo(nextRowKeyCandidate, nextRowKey) < 0) {
112         nextRowKey = nextRowKeyCandidate;
113       }
114     }
115 
116     if (nextRowKey == null) {
117       // SHOULD NEVER happen
118       // TODO: is there a better way than throw exception? (stop the scanner?)
119       throw new IllegalStateException("No next row key that satisfies fuzzy exists when" +
120                                          " getNextKeyHint() is invoked." +
121                                          " Filter: " + this.toString() +
122                                          " currentKV: " + currentKV.toString());
123     }
124 
125     return KeyValue.createFirstOnRow(nextRowKey);
126   }
127 
128   @Override
129   public boolean filterAllRemaining() {
130     return done;
131   }
132 
133   @Override
134   public void write(DataOutput dataOutput) throws IOException {
135     dataOutput.writeInt(this.fuzzyKeysData.size());
136     for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
137       Bytes.writeByteArray(dataOutput, fuzzyData.getFirst());
138       Bytes.writeByteArray(dataOutput, fuzzyData.getSecond());
139     }
140   }
141 
142   @Override
143   public void readFields(DataInput dataInput) throws IOException {
144     int count = dataInput.readInt();
145     this.fuzzyKeysData = new ArrayList<Pair<byte[], byte[]>>(count);
146     for (int i = 0; i < count; i++) {
147       byte[] keyBytes = Bytes.readByteArray(dataInput);
148       byte[] keyMeta = Bytes.readByteArray(dataInput);
149       this.fuzzyKeysData.add(new Pair<byte[], byte[]>(keyBytes, keyMeta));
150     }
151   }
152 
153   @Override
154   public String toString() {
155     final StringBuilder sb = new StringBuilder();
156     sb.append("FuzzyRowFilter");
157     sb.append("{fuzzyKeysData=");
158     for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
159       sb.append('{').append(Bytes.toStringBinary(fuzzyData.getFirst())).append(":");
160       sb.append(Bytes.toStringBinary(fuzzyData.getSecond())).append('}');
161     }
162     sb.append("}, ");
163     return sb.toString();
164   }
165 
166   // Utility methods
167 
168   static enum SatisfiesCode {
169     // row satisfies fuzzy rule
170     YES,
171     // row doesn't satisfy fuzzy rule, but there's possible greater row that does
172     NEXT_EXISTS,
173     // row doesn't satisfy fuzzy rule and there's no greater row that does
174     NO_NEXT
175   }
176 
177   static SatisfiesCode satisfies(byte[] row,
178                                          byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
179     return satisfies(row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
180   }
181 
182   private static SatisfiesCode satisfies(byte[] row, int offset, int length,
183                                          byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
184     if (row == null) {
185       // do nothing, let scan to proceed
186       return SatisfiesCode.YES;
187     }
188 
189     boolean nextRowKeyCandidateExists = false;
190 
191     for (int i = 0; i < fuzzyKeyMeta.length && i < length; i++) {
192       // First, checking if this position is fixed and not equals the given one
193       boolean byteAtPositionFixed = fuzzyKeyMeta[i] == 0;
194       boolean fixedByteIncorrect = byteAtPositionFixed && fuzzyKeyBytes[i] != row[i + offset];
195       if (fixedByteIncorrect) {
196         // in this case there's another row that satisfies fuzzy rule and bigger than this row
197         if (nextRowKeyCandidateExists) {
198           return SatisfiesCode.NEXT_EXISTS;
199         }
200 
201         // If this row byte is less than fixed then there's a byte array bigger than
202         // this row and which satisfies the fuzzy rule. Otherwise there's no such byte array:
203         // this row is simply bigger than any byte array that satisfies the fuzzy rule
204         boolean rowByteLessThanFixed = (row[i + offset] & 0xFF) < (fuzzyKeyBytes[i] & 0xFF);
205         return  rowByteLessThanFixed ? SatisfiesCode.NEXT_EXISTS : SatisfiesCode.NO_NEXT;
206       }
207 
208       // Second, checking if this position is not fixed and byte value is not the biggest. In this
209       // case there's a byte array bigger than this row and which satisfies the fuzzy rule. To get
210       // bigger byte array that satisfies the rule we need to just increase this byte
211       // (see the code of getNextForFuzzyRule below) by one.
212       // Note: if non-fixed byte is already at biggest value, this doesn't allow us to say there's
213       //       bigger one that satisfies the rule as it can't be increased.
214       if (fuzzyKeyMeta[i] == 1 && !isMax(fuzzyKeyBytes[i])) {
215         nextRowKeyCandidateExists = true;
216       }
217     }
218 
219     return SatisfiesCode.YES;
220   }
221 
222   private static boolean isMax(byte fuzzyKeyByte) {
223     return (fuzzyKeyByte & 0xFF) == 255;
224   }
225 
226   static byte[] getNextForFuzzyRule(byte[] row, byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
227     return getNextForFuzzyRule(row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
228   }
229 
230   /**
231    * @return greater byte array than given (row) which satisfies the fuzzy rule if it exists,
232    *         null otherwise
233    */
234   private static byte[] getNextForFuzzyRule(byte[] row, int offset, int length,
235                                             byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
236     // To find out the next "smallest" byte array that satisfies fuzzy rule and "greater" than
237     // the given one we do the following:
238     // 1. setting values on all "fixed" positions to the values from fuzzyKeyBytes
239     // 2. if during the first step given row did not increase, then we increase the value at
240     //    the first "non-fixed" position (where it is not maximum already)
241 
242     // It is easier to perform this by using fuzzyKeyBytes copy and setting "non-fixed" position
243     // values than otherwise.
244     byte[] result = Arrays.copyOf(fuzzyKeyBytes,
245                                   length > fuzzyKeyBytes.length ? length : fuzzyKeyBytes.length);
246     int toInc = -1;
247 
248     boolean increased = false;
249     for (int i = 0; i < result.length; i++) {
250       if (i >= fuzzyKeyMeta.length || fuzzyKeyMeta[i] == 1) {
251         result[i] = row[offset + i];
252         if (!isMax(row[i])) {
253           // this is "non-fixed" position and is not at max value, hence we can increase it
254           toInc = i;
255         }
256       } else if (i < fuzzyKeyMeta.length && fuzzyKeyMeta[i] == 0) {
257         if ((row[i + offset] & 0xFF) < (fuzzyKeyBytes[i] & 0xFF)) {
258           // if setting value for any fixed position increased the original array,
259           // we are OK
260           increased = true;
261           break;
262         }
263         if ((row[i + offset] & 0xFF) > (fuzzyKeyBytes[i] & 0xFF)) {
264           // if setting value for any fixed position makes array "smaller", then just stop:
265           // in case we found some non-fixed position to increase we will do it, otherwise
266           // there's no "next" row key that satisfies fuzzy rule and "greater" than given row
267           break;
268         }
269       }
270     }
271 
272     if (!increased) {
273       if (toInc < 0) {
274         return null;
275       }
276       result[toInc]++;
277 
278       // Setting all "non-fixed" positions to zeroes to the right of the one we increased so
279       // that found "next" row key is the smallest possible
280       for (int i = toInc + 1; i < result.length; i++) {
281         if (i >= fuzzyKeyMeta.length || fuzzyKeyMeta[i] == 1) {
282           result[i] = 0;
283         }
284       }
285     }
286 
287     return result;
288   }
289 }