001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.mapreduce;
019
020import static org.junit.Assert.assertEquals;
021import static org.junit.Assert.assertFalse;
022import static org.junit.Assert.assertNull;
023import static org.junit.Assert.assertTrue;
024import static org.junit.Assert.fail;
025
026import java.util.ArrayList;
027import org.apache.hadoop.hbase.HBaseClassTestRule;
028import org.apache.hadoop.hbase.HConstants;
029import org.apache.hadoop.hbase.mapreduce.ImportTsv.TsvParser;
030import org.apache.hadoop.hbase.mapreduce.ImportTsv.TsvParser.BadTsvLineException;
031import org.apache.hadoop.hbase.mapreduce.ImportTsv.TsvParser.ParsedLine;
032import org.apache.hadoop.hbase.testclassification.MapReduceTests;
033import org.apache.hadoop.hbase.testclassification.SmallTests;
034import org.apache.hadoop.hbase.util.Bytes;
035import org.apache.hadoop.hbase.util.Pair;
036import org.junit.ClassRule;
037import org.junit.Test;
038import org.junit.experimental.categories.Category;
039
040import org.apache.hbase.thirdparty.com.google.common.base.Joiner;
041import org.apache.hbase.thirdparty.com.google.common.base.Splitter;
042import org.apache.hbase.thirdparty.com.google.common.collect.Iterables;
043
044/**
045 * Tests for {@link TsvParser}.
046 */
047@Category({MapReduceTests.class, SmallTests.class})
048public class TestImportTsvParser {
049
050  @ClassRule
051  public static final HBaseClassTestRule CLASS_RULE =
052      HBaseClassTestRule.forClass(TestImportTsvParser.class);
053
054  private void assertBytesEquals(byte[] a, byte[] b) {
055    assertEquals(Bytes.toStringBinary(a), Bytes.toStringBinary(b));
056  }
057
058  private void checkParsing(ParsedLine parsed, Iterable<String> expected) {
059    ArrayList<String> parsedCols = new ArrayList<>();
060    for (int i = 0; i < parsed.getColumnCount(); i++) {
061      parsedCols.add(Bytes.toString(parsed.getLineBytes(), parsed.getColumnOffset(i),
062          parsed.getColumnLength(i)));
063    }
064    if (!Iterables.elementsEqual(parsedCols, expected)) {
065      fail("Expected: " + Joiner.on(",").join(expected) + "\n" + "Got:"
066          + Joiner.on(",").join(parsedCols));
067    }
068  }
069
070  @Test
071  public void testTsvParserSpecParsing() {
072    TsvParser parser;
073
074    parser = new TsvParser("HBASE_ROW_KEY", "\t");
075    assertNull(parser.getFamily(0));
076    assertNull(parser.getQualifier(0));
077    assertEquals(0, parser.getRowKeyColumnIndex());
078    assertFalse(parser.hasTimestamp());
079
080    parser = new TsvParser("HBASE_ROW_KEY,col1:scol1", "\t");
081    assertNull(parser.getFamily(0));
082    assertNull(parser.getQualifier(0));
083    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(1));
084    assertBytesEquals(Bytes.toBytes("scol1"), parser.getQualifier(1));
085    assertEquals(0, parser.getRowKeyColumnIndex());
086    assertFalse(parser.hasTimestamp());
087
088    parser = new TsvParser("HBASE_ROW_KEY,col1:scol1,col1:scol2", "\t");
089    assertNull(parser.getFamily(0));
090    assertNull(parser.getQualifier(0));
091    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(1));
092    assertBytesEquals(Bytes.toBytes("scol1"), parser.getQualifier(1));
093    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(2));
094    assertBytesEquals(Bytes.toBytes("scol2"), parser.getQualifier(2));
095    assertEquals(0, parser.getRowKeyColumnIndex());
096    assertFalse(parser.hasTimestamp());
097
098    parser = new TsvParser("HBASE_ROW_KEY,col1:scol1,HBASE_TS_KEY,col1:scol2", "\t");
099    assertNull(parser.getFamily(0));
100    assertNull(parser.getQualifier(0));
101    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(1));
102    assertBytesEquals(Bytes.toBytes("scol1"), parser.getQualifier(1));
103    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(3));
104    assertBytesEquals(Bytes.toBytes("scol2"), parser.getQualifier(3));
105    assertEquals(0, parser.getRowKeyColumnIndex());
106    assertTrue(parser.hasTimestamp());
107    assertEquals(2, parser.getTimestampKeyColumnIndex());
108
109    parser = new TsvParser("HBASE_ROW_KEY,col1:scol1,HBASE_TS_KEY,col1:scol2,HBASE_ATTRIBUTES_KEY",
110        "\t");
111    assertNull(parser.getFamily(0));
112    assertNull(parser.getQualifier(0));
113    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(1));
114    assertBytesEquals(Bytes.toBytes("scol1"), parser.getQualifier(1));
115    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(3));
116    assertBytesEquals(Bytes.toBytes("scol2"), parser.getQualifier(3));
117    assertEquals(0, parser.getRowKeyColumnIndex());
118    assertTrue(parser.hasTimestamp());
119    assertEquals(2, parser.getTimestampKeyColumnIndex());
120    assertEquals(4, parser.getAttributesKeyColumnIndex());
121
122    parser = new TsvParser("HBASE_ATTRIBUTES_KEY,col1:scol1,HBASE_TS_KEY,col1:scol2,HBASE_ROW_KEY",
123        "\t");
124    assertNull(parser.getFamily(0));
125    assertNull(parser.getQualifier(0));
126    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(1));
127    assertBytesEquals(Bytes.toBytes("scol1"), parser.getQualifier(1));
128    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(3));
129    assertBytesEquals(Bytes.toBytes("scol2"), parser.getQualifier(3));
130    assertEquals(4, parser.getRowKeyColumnIndex());
131    assertTrue(parser.hasTimestamp());
132    assertEquals(2, parser.getTimestampKeyColumnIndex());
133    assertEquals(0, parser.getAttributesKeyColumnIndex());
134  }
135
136  @Test
137  public void testTsvParser() throws BadTsvLineException {
138    TsvParser parser = new TsvParser("col_a,col_b:qual,HBASE_ROW_KEY,col_d", "\t");
139    assertBytesEquals(Bytes.toBytes("col_a"), parser.getFamily(0));
140    assertBytesEquals(HConstants.EMPTY_BYTE_ARRAY, parser.getQualifier(0));
141    assertBytesEquals(Bytes.toBytes("col_b"), parser.getFamily(1));
142    assertBytesEquals(Bytes.toBytes("qual"), parser.getQualifier(1));
143    assertNull(parser.getFamily(2));
144    assertNull(parser.getQualifier(2));
145    assertEquals(2, parser.getRowKeyColumnIndex());
146
147    assertEquals(TsvParser.DEFAULT_TIMESTAMP_COLUMN_INDEX, parser.getTimestampKeyColumnIndex());
148
149    byte[] line = Bytes.toBytes("val_a\tval_b\tval_c\tval_d");
150    ParsedLine parsed = parser.parse(line, line.length);
151    checkParsing(parsed, Splitter.on("\t").split(Bytes.toString(line)));
152  }
153
154  @Test
155  public void testTsvParserWithTimestamp() throws BadTsvLineException {
156    TsvParser parser = new TsvParser("HBASE_ROW_KEY,HBASE_TS_KEY,col_a,", "\t");
157    assertNull(parser.getFamily(0));
158    assertNull(parser.getQualifier(0));
159    assertNull(parser.getFamily(1));
160    assertNull(parser.getQualifier(1));
161    assertBytesEquals(Bytes.toBytes("col_a"), parser.getFamily(2));
162    assertBytesEquals(HConstants.EMPTY_BYTE_ARRAY, parser.getQualifier(2));
163    assertEquals(0, parser.getRowKeyColumnIndex());
164    assertEquals(1, parser.getTimestampKeyColumnIndex());
165
166    byte[] line = Bytes.toBytes("rowkey\t1234\tval_a");
167    ParsedLine parsed = parser.parse(line, line.length);
168    assertEquals(1234l, parsed.getTimestamp(-1));
169    checkParsing(parsed, Splitter.on("\t").split(Bytes.toString(line)));
170  }
171
172  /**
173   * Test cases that throw BadTsvLineException
174   */
175  @Test(expected = BadTsvLineException.class)
176  public void testTsvParserBadTsvLineExcessiveColumns() throws BadTsvLineException {
177    TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a", "\t");
178    byte[] line = Bytes.toBytes("val_a\tval_b\tval_c");
179    parser.parse(line, line.length);
180  }
181
182  @Test(expected = BadTsvLineException.class)
183  public void testTsvParserBadTsvLineZeroColumn() throws BadTsvLineException {
184    TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a", "\t");
185    byte[] line = Bytes.toBytes("");
186    parser.parse(line, line.length);
187  }
188
189  @Test(expected = BadTsvLineException.class)
190  public void testTsvParserBadTsvLineOnlyKey() throws BadTsvLineException {
191    TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a", "\t");
192    byte[] line = Bytes.toBytes("key_only");
193    parser.parse(line, line.length);
194  }
195
196  @Test(expected = BadTsvLineException.class)
197  public void testTsvParserBadTsvLineNoRowKey() throws BadTsvLineException {
198    TsvParser parser = new TsvParser("col_a,HBASE_ROW_KEY", "\t");
199    byte[] line = Bytes.toBytes("only_cola_data_and_no_row_key");
200    parser.parse(line, line.length);
201  }
202
203  @Test(expected = BadTsvLineException.class)
204  public void testTsvParserInvalidTimestamp() throws BadTsvLineException {
205    TsvParser parser = new TsvParser("HBASE_ROW_KEY,HBASE_TS_KEY,col_a,", "\t");
206    assertEquals(1, parser.getTimestampKeyColumnIndex());
207    byte[] line = Bytes.toBytes("rowkey\ttimestamp\tval_a");
208    ParsedLine parsed = parser.parse(line, line.length);
209    assertEquals(-1, parsed.getTimestamp(-1));
210    checkParsing(parsed, Splitter.on("\t").split(Bytes.toString(line)));
211  }
212
213  @Test(expected = BadTsvLineException.class)
214  public void testTsvParserNoTimestampValue() throws BadTsvLineException {
215    TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a,HBASE_TS_KEY", "\t");
216    assertEquals(2, parser.getTimestampKeyColumnIndex());
217    byte[] line = Bytes.toBytes("rowkey\tval_a");
218    parser.parse(line, line.length);
219  }
220
221  @Test
222  public void testTsvParserParseRowKey() throws BadTsvLineException {
223    TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a,HBASE_TS_KEY", "\t");
224    assertEquals(0, parser.getRowKeyColumnIndex());
225    byte[] line = Bytes.toBytes("rowkey\tval_a\t1234");
226    Pair<Integer, Integer> rowKeyOffsets = parser.parseRowKey(line, line.length);
227    assertEquals(0, rowKeyOffsets.getFirst().intValue());
228    assertEquals(6, rowKeyOffsets.getSecond().intValue());
229    try {
230      line = Bytes.toBytes("\t\tval_a\t1234");
231      parser.parseRowKey(line, line.length);
232      fail("Should get BadTsvLineException on empty rowkey.");
233    } catch (BadTsvLineException b) {
234
235    }
236    parser = new TsvParser("col_a,HBASE_ROW_KEY,HBASE_TS_KEY", "\t");
237    assertEquals(1, parser.getRowKeyColumnIndex());
238    line = Bytes.toBytes("val_a\trowkey\t1234");
239    rowKeyOffsets = parser.parseRowKey(line, line.length);
240    assertEquals(6, rowKeyOffsets.getFirst().intValue());
241    assertEquals(6, rowKeyOffsets.getSecond().intValue());
242    try {
243      line = Bytes.toBytes("val_a");
244      rowKeyOffsets = parser.parseRowKey(line, line.length);
245      fail("Should get BadTsvLineException when number of columns less than rowkey position.");
246    } catch (BadTsvLineException b) {
247
248    }
249    parser = new TsvParser("col_a,HBASE_TS_KEY,HBASE_ROW_KEY", "\t");
250    assertEquals(2, parser.getRowKeyColumnIndex());
251    line = Bytes.toBytes("val_a\t1234\trowkey");
252    rowKeyOffsets = parser.parseRowKey(line, line.length);
253    assertEquals(11, rowKeyOffsets.getFirst().intValue());
254    assertEquals(6, rowKeyOffsets.getSecond().intValue());
255  }
256
257  @Test
258  public void testTsvParseAttributesKey() throws BadTsvLineException {
259    TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a,HBASE_TS_KEY,HBASE_ATTRIBUTES_KEY", "\t");
260    assertEquals(0, parser.getRowKeyColumnIndex());
261    byte[] line = Bytes.toBytes("rowkey\tval_a\t1234\tkey=>value");
262    ParsedLine parse = parser.parse(line, line.length);
263    assertEquals(18, parse.getAttributeKeyOffset());
264    assertEquals(3, parser.getAttributesKeyColumnIndex());
265    String attributes[] = parse.getIndividualAttributes();
266    assertEquals(attributes[0], "key=>value");
267    try {
268      line = Bytes.toBytes("rowkey\tval_a\t1234");
269      parser.parse(line, line.length);
270      fail("Should get BadTsvLineException on empty rowkey.");
271    } catch (BadTsvLineException b) {
272
273    }
274    parser = new TsvParser("HBASE_ATTRIBUTES_KEY,col_a,HBASE_ROW_KEY,HBASE_TS_KEY", "\t");
275    assertEquals(2, parser.getRowKeyColumnIndex());
276    line = Bytes.toBytes("key=>value\tval_a\trowkey\t1234");
277    parse = parser.parse(line, line.length);
278    assertEquals(0, parse.getAttributeKeyOffset());
279    assertEquals(0, parser.getAttributesKeyColumnIndex());
280    attributes = parse.getIndividualAttributes();
281    assertEquals(attributes[0], "key=>value");
282    try {
283      line = Bytes.toBytes("val_a");
284      ParsedLine parse2 = parser.parse(line, line.length);
285      fail("Should get BadTsvLineException when number of columns less than rowkey position.");
286    } catch (BadTsvLineException b) {
287
288    }
289    parser = new TsvParser("col_a,HBASE_ATTRIBUTES_KEY,HBASE_TS_KEY,HBASE_ROW_KEY", "\t");
290    assertEquals(3, parser.getRowKeyColumnIndex());
291    line = Bytes.toBytes("val_a\tkey0=>value0,key1=>value1,key2=>value2\t1234\trowkey");
292    parse = parser.parse(line, line.length);
293    assertEquals(1, parser.getAttributesKeyColumnIndex());
294    assertEquals(6, parse.getAttributeKeyOffset());
295    String[] attr = parse.getIndividualAttributes();
296    int i = 0;
297    for(String str :  attr) {
298      assertEquals(("key"+i+"=>"+"value"+i), str );
299      i++;
300    }
301  }
302
303  @Test
304  public void testTsvParserWithCellVisibilityCol() throws BadTsvLineException {
305    TsvParser parser = new TsvParser(
306        "HBASE_ROW_KEY,col_a,HBASE_TS_KEY,HBASE_ATTRIBUTES_KEY,HBASE_CELL_VISIBILITY", "\t");
307    assertEquals(0, parser.getRowKeyColumnIndex());
308    assertEquals(4, parser.getCellVisibilityColumnIndex());
309    byte[] line = Bytes.toBytes("rowkey\tval_a\t1234\tkey=>value\tPRIVATE&SECRET");
310    ParsedLine parse = parser.parse(line, line.length);
311    assertEquals(18, parse.getAttributeKeyOffset());
312    assertEquals(3, parser.getAttributesKeyColumnIndex());
313    String attributes[] = parse.getIndividualAttributes();
314    assertEquals(attributes[0], "key=>value");
315    assertEquals(29, parse.getCellVisibilityColumnOffset());
316  }
317
318}