View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  package org.apache.hadoop.hbase.codec.prefixtree.encode.tokenize;
20  
21  import java.util.ArrayList;
22  import java.util.List;
23  
24  import org.apache.hadoop.hbase.classification.InterfaceAudience;
25  import org.apache.hadoop.hbase.util.ArrayUtils;
26  import org.apache.hadoop.hbase.util.ByteRange;
27  import org.apache.hadoop.hbase.util.Bytes;
28  import org.apache.hadoop.hbase.util.CollectionUtils;
29  
30  import com.google.common.collect.Lists;
31  
32  /**
33   * Data structure used in the first stage of PrefixTree encoding:
34   * <ul>
35   * <li>accepts a sorted stream of ByteRanges
36   * <li>splits them into a set of tokens, each held by a {@link TokenizerNode}
37   * <li>connects the TokenizerNodes via standard java references
38   * <li>keeps a pool of TokenizerNodes and a reusable byte[] for holding all token content
39   * </ul>
40   * <p><br>
41   * Mainly used for turning Cell rowKeys into a trie, but also used for family and qualifier
42   * encoding.
43   */
44  @InterfaceAudience.Private
45  public class Tokenizer{
46  
47    /***************** fields **************************/
48  
49    protected int numArraysAdded = 0;
50    protected long lastNodeId = -1;
51    protected ArrayList<TokenizerNode> nodes;
52    protected int numNodes;
53    protected TokenizerNode root;
54    protected byte[] tokens;
55    protected int tokensLength;
56  
57    protected int maxElementLength = 0;
58    // number of levels in the tree assuming root level is 0
59    protected int treeDepth = 0;
60  
61  
62    /******************* construct *******************/
63  
64    public Tokenizer() {
65      this.nodes = Lists.newArrayList();
66      this.tokens = new byte[0];
67    }
68  
69    public void reset() {
70      numArraysAdded = 0;
71      lastNodeId = -1;
72      numNodes = 0;
73      tokensLength = 0;
74      root = null;
75      maxElementLength = 0;
76      treeDepth = 0;
77    }
78  
79  
80    /***************** building *************************/
81  
82    public void addAll(ArrayList<ByteRange> sortedByteRanges) {
83      for (int i = 0; i < sortedByteRanges.size(); ++i) {
84        ByteRange byteRange = sortedByteRanges.get(i);
85        addSorted(byteRange);
86      }
87    }
88  
89    public void addSorted(final ByteRange bytes) {
90      ++numArraysAdded;
91      if (bytes.getLength() > maxElementLength) {
92        maxElementLength = bytes.getLength();
93      }
94      if (root == null) {
95        // nodeDepth of firstNode (non-root) is 1
96        root = addNode(null, 1, 0, bytes, 0);
97      } else {
98        root.addSorted(bytes);
99      }
100   }
101 
102   public void incrementNumOccurrencesOfLatestValue(){
103     CollectionUtils.getLast(nodes).incrementNumOccurrences(1);
104   }
105 
106   protected long nextNodeId() {
107     return ++lastNodeId;
108   }
109 
110   protected TokenizerNode addNode(TokenizerNode parent, int nodeDepth, int tokenStartOffset,
111       final ByteRange token, int inputTokenOffset) {
112     int inputTokenLength = token.getLength() - inputTokenOffset;
113     int tokenOffset = appendTokenAndRepointByteRange(token, inputTokenOffset);
114     TokenizerNode node = null;
115     if (nodes.size() <= numNodes) {
116       node = new TokenizerNode(this, parent, nodeDepth, tokenStartOffset, tokenOffset,
117           inputTokenLength);
118       nodes.add(node);
119     } else {
120       node = nodes.get(numNodes);
121       node.reset();
122       node.reconstruct(this, parent, nodeDepth, tokenStartOffset, tokenOffset, inputTokenLength);
123     }
124     ++numNodes;
125     return node;
126   }
127 
128   protected int appendTokenAndRepointByteRange(final ByteRange token, int inputTokenOffset) {
129     int newOffset = tokensLength;
130     int inputTokenLength = token.getLength() - inputTokenOffset;
131     int newMinimum = tokensLength + inputTokenLength;
132     tokens = ArrayUtils.growIfNecessary(tokens, newMinimum, 2 * newMinimum);
133     token.deepCopySubRangeTo(inputTokenOffset, inputTokenLength, tokens, tokensLength);
134     tokensLength += inputTokenLength;
135     return newOffset;
136   }
137 
138   protected void submitMaxNodeDepthCandidate(int nodeDepth) {
139     if (nodeDepth > treeDepth) {
140       treeDepth = nodeDepth;
141     }
142   }
143 
144 
145   /********************* read ********************/
146 
147   public int getNumAdded(){
148     return numArraysAdded;
149   }
150 
151   // for debugging
152   public ArrayList<TokenizerNode> getNodes(boolean includeNonLeaves, boolean includeLeaves) {
153     ArrayList<TokenizerNode> nodes = Lists.newArrayList();
154     root.appendNodesToExternalList(nodes, includeNonLeaves, includeLeaves);
155     return nodes;
156   }
157 
158   public void appendNodes(List<TokenizerNode> appendTo, boolean includeNonLeaves,
159       boolean includeLeaves) {
160     root.appendNodesToExternalList(appendTo, includeNonLeaves, includeLeaves);
161   }
162 
163   public List<byte[]> getArrays() {
164     List<TokenizerNode> nodes = new ArrayList<TokenizerNode>();
165     root.appendNodesToExternalList(nodes, true, true);
166     List<byte[]> byteArrays = Lists.newArrayListWithCapacity(CollectionUtils.nullSafeSize(nodes));
167     for (int i = 0; i < nodes.size(); ++i) {
168       TokenizerNode node = nodes.get(i);
169       for (int j = 0; j < node.getNumOccurrences(); ++j) {
170         byte[] byteArray = node.getNewByteArray();
171         byteArrays.add(byteArray);
172       }
173     }
174     return byteArrays;
175   }
176 
177   //currently unused, but working and possibly useful in the future
178   public void getNode(TokenizerRowSearchResult resultHolder, byte[] key, int keyOffset,
179       int keyLength) {
180     root.getNode(resultHolder, key, keyOffset, keyLength);
181   }
182 
183 
184   /********************** write ***************************/
185 
186   public Tokenizer setNodeFirstInsertionIndexes() {
187     root.setInsertionIndexes(0);
188     return this;
189   }
190 
191   public Tokenizer appendOutputArrayOffsets(List<Integer> offsets) {
192     root.appendOutputArrayOffsets(offsets);
193     return this;
194   }
195 
196 
197   /********************* print/debug ********************/
198 
199   protected static final Boolean INCLUDE_FULL_TREE_IN_TO_STRING = false;
200 
201   @Override
202   public String toString() {
203     StringBuilder sb = new StringBuilder();
204     sb.append(getStructuralString());
205     if (INCLUDE_FULL_TREE_IN_TO_STRING) {
206       for (byte[] bytes : getArrays()) {
207         if (sb.length() > 0) {
208           sb.append("\n");
209         }
210         sb.append(Bytes.toString(bytes));
211       }
212     }
213     return sb.toString();
214   }
215 
216   public String getStructuralString() {
217     List<TokenizerNode> nodes = getNodes(true, true);
218     StringBuilder sb = new StringBuilder();
219     for (TokenizerNode node : nodes) {
220       String line = node.getPaddedTokenAndOccurrenceString();
221       sb.append(line + "\n");
222     }
223     return sb.toString();
224   }
225 
226 
227   /****************** get/set ************************/
228 
229   public TokenizerNode getRoot() {
230     return root;
231   }
232 
233   public int getMaxElementLength() {
234     return maxElementLength;
235   }
236 
237   public int getTreeDepth() {
238     return treeDepth;
239   }
240 
241 }