/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.  See the License for the specific language governing
 * permissions and limitations under the License.
 */
package org.apache.hadoop.hbase.codec.prefixtree.encode.tokenize;

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.util.ArrayUtils;
import org.apache.hadoop.hbase.util.ByteRange;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CollectionUtils;

import com.google.common.collect.Lists;

/**
 * Data structure used in the first stage of PrefixTree encoding:
 * <li>accepts a sorted stream of ByteRanges
 * <li>splits them into a set of tokens, each held by a {@link TokenizerNode}
 * <li>connects the TokenizerNodes via standard java references
 * <li>keeps a pool of TokenizerNodes and a reusable byte[] for holding all token content
 * <p><br>
 * Mainly used for turning Cell rowKeys into a trie, but can also be used for family and
 * qualifier encoding.
 */
@InterfaceAudience.Private
public class Tokenizer {

  /***************** fields *************************/

  protected int numArraysAdded = 0;          // number of elements passed to addSorted() so far
  protected long lastNodeId = -1;            // id handed to the most recently created node
  protected ArrayList<TokenizerNode> nodes;  // pool of TokenizerNodes, reused across resets
  protected int numNodes;                    // how many nodes of the pool are currently in use
  protected TokenizerNode root;
  protected byte[] tokens;                   // shared byte[] holding the content of every token
  protected int tokensLength;                // used portion of the tokens array

  protected int maxElementLength = 0;        // length of the longest element added so far

  protected int treeDepth = 0;               // deepest node depth seen so far


  /***************** construct *********************/

  public Tokenizer() {
    this.nodes = Lists.newArrayList();
    this.tokens = new byte[0];
  }

  /**
   * Clear per-encoding state so the Tokenizer can be reused; the node pool and the tokens
   * byte[] are retained to avoid reallocation.
   */
  public void reset() {
    numArraysAdded = 0;
    lastNodeId = -1;
    numNodes = 0;
    tokensLength = 0;
    root = null;
    maxElementLength = 0;
    treeDepth = 0;
  }

  /***************** building *************************/

  public void addAll(ArrayList<ByteRange> sortedByteRanges) {
    for (int i = 0; i < sortedByteRanges.size(); ++i) {
      ByteRange byteRange = sortedByteRanges.get(i);
      addSorted(byteRange);
    }
  }

  public void addSorted(final ByteRange bytes) {
    ++numArraysAdded;
    if (bytes.getLength() > maxElementLength) {
      maxElementLength = bytes.getLength();
    }
    if (root == null) {
      // the first element added becomes the root node, at depth 1
      root = addNode(null, 1, 0, bytes, 0);
    } else {
      root.addSorted(bytes);
    }
  }
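
  // For illustration: adding the already-sorted elements "ab", "abc" and "abd" produces a root
  // node holding the shared token "ab" with two leaf children holding the remaining tokens
  // "c" and "d"; the standalone "ab" element is counted as an occurrence on the root node.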

  public void incrementNumOccurrencesOfLatestValue() {
    CollectionUtils.getLast(nodes).incrementNumOccurrences(1);
  }

  protected long nextNodeId() {
    return ++lastNodeId;
  }

  protected TokenizerNode addNode(TokenizerNode parent, int nodeDepth, int tokenStartOffset,
      final ByteRange token, int inputTokenOffset) {
    int inputTokenLength = token.getLength() - inputTokenOffset;
    int tokenOffset = appendTokenAndRepointByteRange(token, inputTokenOffset);
    TokenizerNode node = null;
    if (nodes.size() <= numNodes) {
      // the pool has no spare node; construct a new one and remember it for later reuse
      node = new TokenizerNode(this, parent, nodeDepth, tokenStartOffset, tokenOffset,
          inputTokenLength);
      nodes.add(node);
    } else {
      // recycle a pooled node left over from a previous encoding pass
      node = nodes.get(numNodes);
      node.reset();
      node.reconstruct(this, parent, nodeDepth, tokenStartOffset, tokenOffset, inputTokenLength);
    }
    ++numNodes;
    return node;
  }

  /**
   * Copy the remaining portion of the input token into the shared tokens array, growing the
   * array if necessary, and return the offset at which the copied bytes begin.
   */
  protected int appendTokenAndRepointByteRange(final ByteRange token, int inputTokenOffset) {
    int newOffset = tokensLength;
    int inputTokenLength = token.getLength() - inputTokenOffset;
    int newMinimum = tokensLength + inputTokenLength;
    tokens = ArrayUtils.growIfNecessary(tokens, newMinimum, 2 * newMinimum);
    token.deepCopySubRangeTo(inputTokenOffset, inputTokenLength, tokens, tokensLength);
    tokensLength += inputTokenLength;
    return newOffset;
  }

  protected void submitMaxNodeDepthCandidate(int nodeDepth) {
    if (nodeDepth > treeDepth) {
      treeDepth = nodeDepth;
    }
  }

  /********************** read **********************/

  public int getNumAdded() {
    return numArraysAdded;
  }

  public ArrayList<TokenizerNode> getNodes(boolean includeNonLeaves, boolean includeLeaves) {
    ArrayList<TokenizerNode> nodes = Lists.newArrayList();
    root.appendNodesToExternalList(nodes, includeNonLeaves, includeLeaves);
    return nodes;
  }

  public void appendNodes(List<TokenizerNode> appendTo, boolean includeNonLeaves,
      boolean includeLeaves) {
    root.appendNodesToExternalList(appendTo, includeNonLeaves, includeLeaves);
  }

  public List<byte[]> getArrays() {
    List<TokenizerNode> nodes = new ArrayList<TokenizerNode>();
    root.appendNodesToExternalList(nodes, true, true);
    List<byte[]> byteArrays = Lists.newArrayListWithCapacity(CollectionUtils.nullSafeSize(nodes));
    for (int i = 0; i < nodes.size(); ++i) {
      TokenizerNode node = nodes.get(i);
      for (int j = 0; j < node.getNumOccurrences(); ++j) {
        // an element that was added N times contributes N identical byte arrays
        byte[] byteArray = node.getNewByteArray();
        byteArrays.add(byteArray);
      }
    }
    return byteArrays;
  }

  /**
   * Walk the trie looking for the given key and record the outcome in resultHolder.
   */
  public void getNode(TokenizerRowSearchResult resultHolder, byte[] key, int keyOffset,
      int keyLength) {
    root.getNode(resultHolder, key, keyOffset, keyLength);
  }

  /********************** write ***************************/

  public Tokenizer setNodeFirstInsertionIndexes() {
    root.setInsertionIndexes(0);
    return this;
  }

  public Tokenizer appendOutputArrayOffsets(List<Integer> offsets) {
    root.appendOutputArrayOffsets(offsets);
    return this;
  }

  /********************** printing ***********************/

  protected static final boolean INCLUDE_FULL_TREE_IN_TO_STRING = false;

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append(getStructuralString());
    if (INCLUDE_FULL_TREE_IN_TO_STRING) {
      for (byte[] bytes : getArrays()) {
        if (sb.length() > 0) {
          sb.append("\n");
        }
        sb.append(Bytes.toString(bytes));
      }
    }
    return sb.toString();
  }

  public String getStructuralString() {
    List<TokenizerNode> nodes = getNodes(true, true);
    StringBuilder sb = new StringBuilder();
    for (TokenizerNode node : nodes) {
      String line = node.getPaddedTokenAndOccurrenceString();
      sb.append(line).append("\n");
    }
    return sb.toString();
  }

  /********************** get/set **********************/

  public TokenizerNode getRoot() {
    return root;
  }

  public int getMaxElementLength() {
    return maxElementLength;
  }

  public int getTreeDepth() {
    return treeDepth;
  }

}
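
/*
 * Minimal usage sketch, assuming SimpleMutableByteRange as the ByteRange implementation and
 * already-sorted input (Lists and Bytes as imported above). It shows the basic cycle of
 * feeding elements, reading the trie back, and resetting for reuse:
 *
 *   Tokenizer tokenizer = new Tokenizer();
 *   tokenizer.addAll(Lists.<ByteRange>newArrayList(
 *       new SimpleMutableByteRange(Bytes.toBytes("row1")),
 *       new SimpleMutableByteRange(Bytes.toBytes("row12")),
 *       new SimpleMutableByteRange(Bytes.toBytes("row2"))));
 *   System.out.println(tokenizer.getStructuralString()); // one padded token/occurrence line per node
 *   List<byte[]> roundTripped = tokenizer.getArrays();   // rebuilds the original sorted elements
 *   tokenizer.reset();                                    // keeps the node pool for the next pass
 */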