View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  package org.apache.hadoop.hbase.codec.prefixtree.encode.tokenize;
20  
21  import java.util.ArrayList;
22  import java.util.List;
23  
24  import org.apache.hadoop.hbase.classification.InterfaceAudience;
25  import org.apache.hadoop.hbase.util.ByteRange;
26  import org.apache.hadoop.hbase.util.ByteRangeUtils;
27  import org.apache.hadoop.hbase.util.Bytes;
28  import org.apache.hadoop.hbase.util.CollectionUtils;
29  import org.apache.hadoop.hbase.util.SimpleMutableByteRange;
30  import org.apache.hadoop.hbase.util.Strings;
31  
32  import com.google.common.collect.Lists;
33  
34  /**
35   * Individual node in a Trie structure.  Each node is one of 3 types:
36   * <ul>
37   * <li>Branch: an internal trie node that may have a token and must have multiple children, but does
38   * not represent an actual input byte[], hence its numOccurrences is 0
39   * <li>Leaf: a node with no children and where numOccurrences is &gt;= 1.  It's token represents the
40   * last bytes in the input byte[]s.
41   * <li>Nub: a combination of a branch and leaf.  Its token represents the last bytes of input
42   * byte[]s and has numOccurrences &gt;= 1, but it also has child nodes which represent input byte[]s
43   * that add bytes to this nodes input byte[].
44   * </ul>
45   * <br><br>
46   * Example inputs (numInputs=7):
47   * 0: AAA
48   * 1: AAA
49   * 2: AAB
50   * 3: AAB
51   * 4: AAB
52   * 5: AABQQ
53   * 6: AABQQ
54   * <br><br>
55   * Resulting TokenizerNodes:
56   * AA &lt;- branch, numOccurrences=0, tokenStartOffset=0, token.length=2
57   * A  &lt;- leaf, numOccurrences=2, tokenStartOffset=2, token.length=1
58   * B  &lt;- nub, numOccurrences=3, tokenStartOffset=2, token.length=1
59   * QQ &lt;- leaf, numOccurrences=2, tokenStartOffset=3, token.length=2
60   * <br><br>
61   * numInputs == 7 == sum(numOccurrences) == 0 + 2 + 3 + 2
62   */
63  @InterfaceAudience.Private
64  public class TokenizerNode{
65  
66    /*
67     * Ref to data structure wrapper
68     */
69    protected Tokenizer builder;
70  
71    /****************************************************************** 
72     * Tree content/structure used during tokenization 
73     * ****************************************************************/
74  
75    /*
76     * ref to parent trie node
77     */
78    protected TokenizerNode parent;
79  
80    /*
81     * node depth in trie, irrespective of each node's token length
82     */
83    protected int nodeDepth;
84  
85    /*
86     * start index of this token in original byte[]
87     */
88    protected int tokenStartOffset;
89  
90    /*
91     * bytes for this trie node.  can be length 0 in root node
92     */
93    protected ByteRange token;
94  
95    /*
96     * A count of occurrences in the input byte[]s, not the trie structure. 0 for branch nodes, 1+ for
97     * nubs and leaves. If the same byte[] is added to the trie multiple times, this is the only thing
98     * that changes in the tokenizer. As a result, duplicate byte[]s are very inexpensive to encode.
99     */
100   protected int numOccurrences;
101 
102   /*
103    * The maximum fan-out of a byte[] trie is 256, so there are a maximum of 256
104    * child nodes.
105    */
106   protected ArrayList<TokenizerNode> children;
107 
108 
109   /*
110    * Fields used later in the encoding process for sorting the nodes into the order they'll be
111    * written to the output byte[].  With these fields, the TokenizerNode and therefore Tokenizer
112    * are not generic data structures but instead are specific to HBase PrefixTree encoding. 
113    */
114 
115   /*
116    * unique id assigned to each TokenizerNode
117    */
118   protected long id;
119 
120   /*
121    * set >=0 for nubs and leaves
122    */
123   protected int firstInsertionIndex = -1;
124 
125   /*
126    * A positive value indicating how many bytes before the end of the block this node will start. If
127    * the section is 55 bytes and negativeOffset is 9, then the node will start at 46.
128    */
129   protected int negativeIndex = 0;
130 
131   /*
132    * The offset in the output array at which to start writing this node's token bytes.  Influenced
133    * by the lengths of all tokens sorted before this one.
134    */
135   protected int outputArrayOffset = -1;
136 
137 
138   /*********************** construct *****************************/
139 
140   public TokenizerNode(Tokenizer builder, TokenizerNode parent, int nodeDepth,
141       int tokenStartOffset, int tokenOffset, int tokenLength) {
142     this.token = new SimpleMutableByteRange();
143     reconstruct(builder, parent, nodeDepth, tokenStartOffset, tokenOffset, tokenLength);
144     this.children = Lists.newArrayList();
145   }
146 
147   /*
148    * Sub-constructor for initializing all fields without allocating a new object.  Used by the
149    * regular constructor.
150    */
151   public void reconstruct(Tokenizer builder, TokenizerNode parent, int nodeDepth,
152       int tokenStartOffset, int tokenOffset, int tokenLength) {
153     this.builder = builder;
154     this.id = builder.nextNodeId();
155     this.parent = parent;
156     this.nodeDepth = nodeDepth;
157     builder.submitMaxNodeDepthCandidate(nodeDepth);
158     this.tokenStartOffset = tokenStartOffset;
159     this.token.set(builder.tokens, tokenOffset, tokenLength);
160     this.numOccurrences = 1;
161   }
162 
163   /*
164    * Clear the state of this node so that it looks like it was just allocated.
165    */
166   public void reset() {
167     builder = null;
168     parent = null;
169     nodeDepth = 0;
170     tokenStartOffset = 0;
171     token.unset();
172     numOccurrences = 0;
173     children.clear();// branches & nubs
174 
175     // ids/offsets. used during writing to byte[]
176     id = 0;
177     firstInsertionIndex = -1;// set >=0 for nubs and leaves
178     negativeIndex = 0;
179     outputArrayOffset = -1;
180   }
181 
182 
183   /************************* building *********************************/
184 
185   /*
186    * <li>Only public method used during the tokenization process
187    * <li>Requires that the input ByteRange sort after the previous, and therefore after all previous
188    * inputs
189    * <li>Only looks at bytes of the input array that align with this node's token
190    */
191   public void addSorted(final ByteRange bytes) {// recursively build the tree
192 
193     /*
194      * Recurse deeper into the existing trie structure
195      */
196     if (matchesToken(bytes) && CollectionUtils.notEmpty(children)) {
197       TokenizerNode lastChild = CollectionUtils.getLast(children);
198       if (lastChild.partiallyMatchesToken(bytes)) {
199         lastChild.addSorted(bytes);
200         return;
201       }
202     }
203 
204     /*
205      * Recursion ended.  We must either
206      * <li>1: increment numOccurrences if this input was equal to the previous
207      * <li>2: convert this node from a leaf to a nub, and add a new child leaf
208      * <li>3: split this node into a branch and leaf, and then add a second leaf
209      */
210 
211     // add it as a child of this node
212     int numIdenticalTokenBytes = numIdenticalBytes(bytes);// should be <= token.length
213     int tailOffset = tokenStartOffset + numIdenticalTokenBytes;
214     int tailLength = bytes.getLength() - tailOffset;
215 
216     if (numIdenticalTokenBytes == token.getLength()) {
217       if (tailLength == 0) {// identical to this node (case 1)
218         incrementNumOccurrences(1);
219       } else {// identical to this node, but with a few extra tailing bytes. (leaf -> nub) (case 2)
220         int childNodeDepth = nodeDepth + 1;
221         int childTokenStartOffset = tokenStartOffset + numIdenticalTokenBytes;
222         TokenizerNode newChildNode = builder.addNode(this, childNodeDepth, childTokenStartOffset,
223           bytes, tailOffset);
224         addChild(newChildNode);
225       }
226     } else {//numIdenticalBytes > 0, split into branch/leaf and then add second leaf (case 3)
227       split(numIdenticalTokenBytes, bytes);
228     }
229   }
230 
231 
232   protected void addChild(TokenizerNode node) {
233     node.setParent(this);
234     children.add(node);
235   }
236 
237 
238   /**
239    * Called when we need to convert a leaf node into a branch with 2 leaves. Comments inside the
240    * method assume we have token BAA starting at tokenStartOffset=0 and are adding BOO. The output
241    * will be 3 nodes:<br>
242    * <ul>
243    * <li>1: B &lt;- branch
244    * <li>2: AA &lt;- leaf
245    * <li>3: OO &lt;- leaf
246    * </ul>
247    *
248    * @param numTokenBytesToRetain =&gt; 1 (the B)
249    * @param bytes =&gt; BOO
250    */
251   protected void split(int numTokenBytesToRetain, final ByteRange bytes) {
252     int childNodeDepth = nodeDepth;
253     int childTokenStartOffset = tokenStartOffset + numTokenBytesToRetain;
254 
255     //create leaf AA
256     TokenizerNode firstChild = builder.addNode(this, childNodeDepth, childTokenStartOffset,
257       token, numTokenBytesToRetain);
258     firstChild.setNumOccurrences(numOccurrences);// do before clearing this node's numOccurrences
259     token.setLength(numTokenBytesToRetain);//shorten current token from BAA to B
260     numOccurrences = 0;//current node is now a branch
261 
262     moveChildrenToDifferentParent(firstChild);//point the new leaf (AA) to the new branch (B)
263     addChild(firstChild);//add the new leaf (AA) to the branch's (B's) children
264 
265     //create leaf OO
266     TokenizerNode secondChild = builder.addNode(this, childNodeDepth, childTokenStartOffset,
267       bytes, tokenStartOffset + numTokenBytesToRetain);
268     addChild(secondChild);//add the new leaf (00) to the branch's (B's) children
269 
270     // we inserted branch node B as a new level above/before the two children, so increment the
271     // depths of the children below
272     firstChild.incrementNodeDepthRecursively();
273     secondChild.incrementNodeDepthRecursively();
274   }
275 
276 
277   protected void incrementNodeDepthRecursively() {
278     ++nodeDepth;
279     builder.submitMaxNodeDepthCandidate(nodeDepth);
280     for (int i = 0; i < children.size(); ++i) {
281       children.get(i).incrementNodeDepthRecursively();
282     }
283   }
284 
285 
286   protected void moveChildrenToDifferentParent(TokenizerNode newParent) {
287     for (int i = 0; i < children.size(); ++i) {
288       TokenizerNode child = children.get(i);
289       child.setParent(newParent);
290       newParent.children.add(child);
291     }
292     children.clear();
293   }
294 
295 
296   /************************ byte[] utils *************************/
297 
298   protected boolean partiallyMatchesToken(ByteRange bytes) {
299     return numIdenticalBytes(bytes) > 0;
300   }
301 
302   protected boolean matchesToken(ByteRange bytes) {
303     return numIdenticalBytes(bytes) == getTokenLength();
304   }
305 
306   protected int numIdenticalBytes(ByteRange bytes) {
307     return ByteRangeUtils.numEqualPrefixBytes(token, bytes, tokenStartOffset);
308   }
309 
310 
311   /***************** moving nodes around ************************/
312 
313   public void appendNodesToExternalList(List<TokenizerNode> appendTo, boolean includeNonLeaves,
314       boolean includeLeaves) {
315     if (includeNonLeaves && !isLeaf() || includeLeaves && isLeaf()) {
316       appendTo.add(this);
317     }
318     for (int i = 0; i < children.size(); ++i) {
319       TokenizerNode child = children.get(i);
320       child.appendNodesToExternalList(appendTo, includeNonLeaves, includeLeaves);
321     }
322   }
323 
324   public int setInsertionIndexes(int nextIndex) {
325     int newNextIndex = nextIndex;
326     if (hasOccurrences()) {
327       setFirstInsertionIndex(nextIndex);
328       newNextIndex += numOccurrences;
329     }
330     for (int i = 0; i < children.size(); ++i) {
331       TokenizerNode child = children.get(i);
332       newNextIndex = child.setInsertionIndexes(newNextIndex);
333     }
334     return newNextIndex;
335   }
336 
337   public void appendOutputArrayOffsets(List<Integer> offsets) {
338     if (hasOccurrences()) {
339       offsets.add(outputArrayOffset);
340     }
341     for (int i = 0; i < children.size(); ++i) {
342       TokenizerNode child = children.get(i);
343       child.appendOutputArrayOffsets(offsets);
344     }
345   }
346 
347 
348   /***************** searching *********************************/
349 
350   /*
351    * Do a trie style search through the tokenizer.  One option for looking up families or qualifiers
352    * during encoding, but currently unused in favor of tracking this information as they are added.
353    *
354    * Keeping code pending further performance testing.
355    */
356   public void getNode(TokenizerRowSearchResult resultHolder, byte[] key, int keyOffset,
357       int keyLength) {
358     int thisNodeDepthPlusLength = tokenStartOffset + token.getLength();
359 
360     // quick check if the key is shorter than this node (may not work for binary search)
361     if (CollectionUtils.isEmpty(children)) {
362       if (thisNodeDepthPlusLength < keyLength) {// ran out of bytes
363         resultHolder.set(TokenizerRowSearchPosition.NO_MATCH, null);
364         return;
365       }
366     }
367 
368     // all token bytes must match
369     for (int i = 0; i < token.getLength(); ++i) {
370       if (key[tokenStartOffset + keyOffset + i] != token.get(i)) {
371         // TODO return whether it's before or after so we can binary search
372         resultHolder.set(TokenizerRowSearchPosition.NO_MATCH, null);
373         return;
374       }
375     }
376 
377     if (thisNodeDepthPlusLength == keyLength && numOccurrences > 0) {
378       resultHolder.set(TokenizerRowSearchPosition.MATCH, this);// MATCH
379       return;
380     }
381 
382     if (CollectionUtils.notEmpty(children)) {
383       // TODO binary search the children
384       for (int i = 0; i < children.size(); ++i) {
385         TokenizerNode child = children.get(i);
386         child.getNode(resultHolder, key, keyOffset, keyLength);
387         if (resultHolder.isMatch()) {
388           return;
389         } else if (resultHolder.getDifference() == TokenizerRowSearchPosition.BEFORE) {
390           // passed it, so it doesn't exist
391           resultHolder.set(TokenizerRowSearchPosition.NO_MATCH, null);
392           return;
393         }
394         // key is still AFTER the current node, so continue searching
395       }
396     }
397 
398     // checked all children (or there were no children), and didn't find it
399     resultHolder.set(TokenizerRowSearchPosition.NO_MATCH, null);
400     return;
401   }
402 
403 
404   /****************** writing back to byte[]'s *************************/
405 
406   public byte[] getNewByteArray() {
407     byte[] arrayToFill = new byte[tokenStartOffset + token.getLength()];
408     fillInBytes(arrayToFill);
409     return arrayToFill;
410   }
411 
412   public void fillInBytes(byte[] arrayToFill) {
413     for (int i = 0; i < token.getLength(); ++i) {
414       arrayToFill[tokenStartOffset + i] = token.get(i);
415     }
416     if (parent != null) {
417       parent.fillInBytes(arrayToFill);
418     }
419   }
420 
421 
422   /************************** printing ***********************/
423 
424   @Override
425   public String toString() {
426     String s = "";
427     if (parent == null) {
428       s += "R ";
429     } else {
430       s += getBnlIndicator(false) + " " + Bytes.toString(parent.getNewByteArray());
431     }
432     s += "[" + Bytes.toString(token.deepCopyToNewArray()) + "]";
433     if (numOccurrences > 0) {
434       s += "x" + numOccurrences;
435     }
436     return s;
437   }
438 
439   public String getPaddedTokenAndOccurrenceString() {
440     StringBuilder sb = new StringBuilder();
441     sb.append(getBnlIndicator(true));
442     sb.append(Strings.padFront(numOccurrences + "", ' ', 3));
443     sb.append(Strings.padFront(nodeDepth + "", ' ', 3));
444     if (outputArrayOffset >= 0) {
445       sb.append(Strings.padFront(outputArrayOffset + "", ' ', 3));
446     }
447     sb.append("  ");
448     for (int i = 0; i < tokenStartOffset; ++i) {
449       sb.append(" ");
450     }
451     sb.append(Bytes.toString(token.deepCopyToNewArray()).replaceAll(" ", "_"));
452     return sb.toString();
453   }
454 
455   public String getBnlIndicator(boolean indent) {
456     if (indent) {
457       if (isNub()) {
458         return " N ";
459       }
460       return isBranch() ? "B  " : "  L";
461     }
462     if (isNub()) {
463       return "N";
464     }
465     return isBranch() ? "B" : "L";
466   }
467 
468 
469   /********************** count different node types ********************/
470 
471   public int getNumBranchNodesIncludingThisNode() {
472     if (isLeaf()) {
473       return 0;
474     }
475     int totalFromThisPlusChildren = isBranch() ? 1 : 0;
476     for (int i = 0; i < children.size(); ++i) {
477       TokenizerNode child = children.get(i);
478       totalFromThisPlusChildren += child.getNumBranchNodesIncludingThisNode();
479     }
480     return totalFromThisPlusChildren;
481   }
482 
483   public int getNumNubNodesIncludingThisNode() {
484     if (isLeaf()) {
485       return 0;
486     }
487     int totalFromThisPlusChildren = isNub() ? 1 : 0;
488     for (int i = 0; i < children.size(); ++i) {
489       TokenizerNode child = children.get(i);
490       totalFromThisPlusChildren += child.getNumNubNodesIncludingThisNode();
491     }
492     return totalFromThisPlusChildren;
493   }
494 
495   public int getNumLeafNodesIncludingThisNode() {
496     if (isLeaf()) {
497       return 1;
498     }
499     int totalFromChildren = 0;
500     for (int i = 0; i < children.size(); ++i) {
501       TokenizerNode child = children.get(i);
502       totalFromChildren += child.getNumLeafNodesIncludingThisNode();
503     }
504     return totalFromChildren;
505   }
506 
507 
508   /*********************** simple read-only methods *******************************/
509 
510   public int getNodeDepth() {
511     return nodeDepth;
512   }
513 
514   public int getTokenLength() {
515     return token.getLength();
516   }
517 
518   public boolean hasOccurrences() {
519     return numOccurrences > 0;
520   }
521 
522   public boolean isRoot() {
523     return this.parent == null;
524   }
525 
526   public int getNumChildren() {
527     return CollectionUtils.nullSafeSize(children);
528   }
529 
530   public TokenizerNode getLastChild() {
531     if (CollectionUtils.isEmpty(children)) {
532       return null;
533     }
534     return CollectionUtils.getLast(children);
535   }
536 
537   public boolean isLeaf() {
538     return CollectionUtils.isEmpty(children) && hasOccurrences();
539   }
540 
541   public boolean isBranch() {
542     return CollectionUtils.notEmpty(children) && !hasOccurrences();
543   }
544 
545   public boolean isNub() {
546     return CollectionUtils.notEmpty(children) && hasOccurrences();
547   }
548 
549 
550   /********************** simple mutation methods *************************/
551 
552   /**
553    * Each occurrence &gt; 1 indicates a repeat of the previous entry.
554    * This can be called directly by
555    * an external class without going through the process of detecting a repeat if it is a known
556    * repeat by some external mechanism.  PtEncoder uses this when adding cells to a row if it knows
557    * the new cells are part of the current row.
558    * @param d increment by this amount
559    */
560   public void incrementNumOccurrences(int d) {
561     numOccurrences += d;
562   }
563 
564 
565   /************************* autogenerated get/set ******************/
566 
567   public int getTokenOffset() {
568     return tokenStartOffset;
569   }
570 
571   public TokenizerNode getParent() {
572     return parent;
573   }
574 
575   public ByteRange getToken() {
576     return token;
577   }
578 
579   public int getNumOccurrences() {
580     return numOccurrences;
581   }
582 
583   public void setParent(TokenizerNode parent) {
584     this.parent = parent;
585   }
586 
587   public void setNumOccurrences(int numOccurrences) {
588     this.numOccurrences = numOccurrences;
589   }
590 
591   public ArrayList<TokenizerNode> getChildren() {
592     return children;
593   }
594 
595   public long getId() {
596     return id;
597   }
598 
599   public int getFirstInsertionIndex() {
600     return firstInsertionIndex;
601   }
602 
603   public void setFirstInsertionIndex(int firstInsertionIndex) {
604     this.firstInsertionIndex = firstInsertionIndex;
605   }
606 
607   public int getNegativeIndex() {
608     return negativeIndex;
609   }
610 
611   public void setNegativeIndex(int negativeIndex) {
612     this.negativeIndex = negativeIndex;
613   }
614 
615   public int getOutputArrayOffset() {
616     return outputArrayOffset;
617   }
618 
619   public void setOutputArrayOffset(int outputArrayOffset) {
620     this.outputArrayOffset = outputArrayOffset;
621   }
622 
623   public void setId(long id) {
624     this.id = id;
625   }
626 
627   public void setBuilder(Tokenizer builder) {
628     this.builder = builder;
629   }
630 
631   public void setTokenOffset(int tokenOffset) {
632     this.tokenStartOffset = tokenOffset;
633   }
634 
635   public void setToken(ByteRange token) {
636     this.token = token;
637   }
638 
639 }