001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.mapreduce;
019
020import static org.junit.Assert.assertEquals;
021import static org.junit.Assert.assertFalse;
022import static org.junit.Assert.assertNull;
023import static org.junit.Assert.assertTrue;
024import static org.junit.Assert.fail;
025
026import java.util.ArrayList;
027import org.apache.hadoop.hbase.HBaseClassTestRule;
028import org.apache.hadoop.hbase.HConstants;
029import org.apache.hadoop.hbase.mapreduce.ImportTsv.TsvParser;
030import org.apache.hadoop.hbase.mapreduce.ImportTsv.TsvParser.BadTsvLineException;
031import org.apache.hadoop.hbase.mapreduce.ImportTsv.TsvParser.ParsedLine;
032import org.apache.hadoop.hbase.testclassification.MapReduceTests;
033import org.apache.hadoop.hbase.testclassification.SmallTests;
034import org.apache.hadoop.hbase.util.Bytes;
035import org.apache.hadoop.hbase.util.Pair;
036import org.junit.ClassRule;
037import org.junit.Test;
038import org.junit.experimental.categories.Category;
039
040import org.apache.hbase.thirdparty.com.google.common.base.Joiner;
041import org.apache.hbase.thirdparty.com.google.common.base.Splitter;
042import org.apache.hbase.thirdparty.com.google.common.collect.Iterables;
043
044/**
045 * Tests for {@link TsvParser}.
046 */
047@Category({MapReduceTests.class, SmallTests.class})
048public class TestImportTsvParser {
049  @ClassRule
050  public static final HBaseClassTestRule CLASS_RULE =
051      HBaseClassTestRule.forClass(TestImportTsvParser.class);
052
053  private void assertBytesEquals(byte[] a, byte[] b) {
054    assertEquals(Bytes.toStringBinary(a), Bytes.toStringBinary(b));
055  }
056
057  private void checkParsing(ParsedLine parsed, Iterable<String> expected) {
058    ArrayList<String> parsedCols = new ArrayList<>();
059    for (int i = 0; i < parsed.getColumnCount(); i++) {
060      parsedCols.add(Bytes.toString(parsed.getLineBytes(), parsed.getColumnOffset(i),
061          parsed.getColumnLength(i)));
062    }
063    if (!Iterables.elementsEqual(parsedCols, expected)) {
064      fail("Expected: " + Joiner.on(",").join(expected) + "\n" + "Got:"
065          + Joiner.on(",").join(parsedCols));
066    }
067  }
068
069  @Test
070  public void testTsvParserSpecParsing() {
071    TsvParser parser;
072
073    parser = new TsvParser("HBASE_ROW_KEY", "\t");
074    assertNull(parser.getFamily(0));
075    assertNull(parser.getQualifier(0));
076    assertEquals(0, parser.getRowKeyColumnIndex());
077    assertFalse(parser.hasTimestamp());
078
079    parser = new TsvParser("HBASE_ROW_KEY,col1:scol1", "\t");
080    assertNull(parser.getFamily(0));
081    assertNull(parser.getQualifier(0));
082    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(1));
083    assertBytesEquals(Bytes.toBytes("scol1"), parser.getQualifier(1));
084    assertEquals(0, parser.getRowKeyColumnIndex());
085    assertFalse(parser.hasTimestamp());
086
087    parser = new TsvParser("HBASE_ROW_KEY,col1:scol1,col1:scol2", "\t");
088    assertNull(parser.getFamily(0));
089    assertNull(parser.getQualifier(0));
090    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(1));
091    assertBytesEquals(Bytes.toBytes("scol1"), parser.getQualifier(1));
092    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(2));
093    assertBytesEquals(Bytes.toBytes("scol2"), parser.getQualifier(2));
094    assertEquals(0, parser.getRowKeyColumnIndex());
095    assertFalse(parser.hasTimestamp());
096
097    parser = new TsvParser("HBASE_ROW_KEY,col1:scol1,HBASE_TS_KEY,col1:scol2", "\t");
098    assertNull(parser.getFamily(0));
099    assertNull(parser.getQualifier(0));
100    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(1));
101    assertBytesEquals(Bytes.toBytes("scol1"), parser.getQualifier(1));
102    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(3));
103    assertBytesEquals(Bytes.toBytes("scol2"), parser.getQualifier(3));
104    assertEquals(0, parser.getRowKeyColumnIndex());
105    assertTrue(parser.hasTimestamp());
106    assertEquals(2, parser.getTimestampKeyColumnIndex());
107
108    parser = new TsvParser("HBASE_ROW_KEY,col1:scol1,HBASE_TS_KEY,col1:scol2,HBASE_ATTRIBUTES_KEY",
109        "\t");
110    assertNull(parser.getFamily(0));
111    assertNull(parser.getQualifier(0));
112    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(1));
113    assertBytesEquals(Bytes.toBytes("scol1"), parser.getQualifier(1));
114    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(3));
115    assertBytesEquals(Bytes.toBytes("scol2"), parser.getQualifier(3));
116    assertEquals(0, parser.getRowKeyColumnIndex());
117    assertTrue(parser.hasTimestamp());
118    assertEquals(2, parser.getTimestampKeyColumnIndex());
119    assertEquals(4, parser.getAttributesKeyColumnIndex());
120
121    parser = new TsvParser("HBASE_ATTRIBUTES_KEY,col1:scol1,HBASE_TS_KEY,col1:scol2,HBASE_ROW_KEY",
122        "\t");
123    assertNull(parser.getFamily(0));
124    assertNull(parser.getQualifier(0));
125    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(1));
126    assertBytesEquals(Bytes.toBytes("scol1"), parser.getQualifier(1));
127    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(3));
128    assertBytesEquals(Bytes.toBytes("scol2"), parser.getQualifier(3));
129    assertEquals(4, parser.getRowKeyColumnIndex());
130    assertTrue(parser.hasTimestamp());
131    assertEquals(2, parser.getTimestampKeyColumnIndex());
132    assertEquals(0, parser.getAttributesKeyColumnIndex());
133  }
134
135  @Test
136  public void testTsvParser() throws BadTsvLineException {
137    TsvParser parser = new TsvParser("col_a,col_b:qual,HBASE_ROW_KEY,col_d", "\t");
138    assertBytesEquals(Bytes.toBytes("col_a"), parser.getFamily(0));
139    assertBytesEquals(HConstants.EMPTY_BYTE_ARRAY, parser.getQualifier(0));
140    assertBytesEquals(Bytes.toBytes("col_b"), parser.getFamily(1));
141    assertBytesEquals(Bytes.toBytes("qual"), parser.getQualifier(1));
142    assertNull(parser.getFamily(2));
143    assertNull(parser.getQualifier(2));
144    assertEquals(2, parser.getRowKeyColumnIndex());
145
146    assertEquals(TsvParser.DEFAULT_TIMESTAMP_COLUMN_INDEX, parser.getTimestampKeyColumnIndex());
147
148    byte[] line = Bytes.toBytes("val_a\tval_b\tval_c\tval_d");
149    ParsedLine parsed = parser.parse(line, line.length);
150    checkParsing(parsed, Splitter.on("\t").split(Bytes.toString(line)));
151  }
152
153  @Test
154  public void testTsvParserWithTimestamp() throws BadTsvLineException {
155    TsvParser parser = new TsvParser("HBASE_ROW_KEY,HBASE_TS_KEY,col_a,", "\t");
156    assertNull(parser.getFamily(0));
157    assertNull(parser.getQualifier(0));
158    assertNull(parser.getFamily(1));
159    assertNull(parser.getQualifier(1));
160    assertBytesEquals(Bytes.toBytes("col_a"), parser.getFamily(2));
161    assertBytesEquals(HConstants.EMPTY_BYTE_ARRAY, parser.getQualifier(2));
162    assertEquals(0, parser.getRowKeyColumnIndex());
163    assertEquals(1, parser.getTimestampKeyColumnIndex());
164
165    byte[] line = Bytes.toBytes("rowkey\t1234\tval_a");
166    ParsedLine parsed = parser.parse(line, line.length);
167    assertEquals(1234L, parsed.getTimestamp(-1));
168    checkParsing(parsed, Splitter.on("\t").split(Bytes.toString(line)));
169  }
170
171  /**
172   * Test cases that throw BadTsvLineException
173   */
174  @Test(expected = BadTsvLineException.class)
175  public void testTsvParserBadTsvLineExcessiveColumns() throws BadTsvLineException {
176    TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a", "\t");
177    byte[] line = Bytes.toBytes("val_a\tval_b\tval_c");
178    parser.parse(line, line.length);
179  }
180
181  @Test(expected = BadTsvLineException.class)
182  public void testTsvParserBadTsvLineZeroColumn() throws BadTsvLineException {
183    TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a", "\t");
184    byte[] line = Bytes.toBytes("");
185    parser.parse(line, line.length);
186  }
187
188  @Test(expected = BadTsvLineException.class)
189  public void testTsvParserBadTsvLineOnlyKey() throws BadTsvLineException {
190    TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a", "\t");
191    byte[] line = Bytes.toBytes("key_only");
192    parser.parse(line, line.length);
193  }
194
195  @Test(expected = BadTsvLineException.class)
196  public void testTsvParserBadTsvLineNoRowKey() throws BadTsvLineException {
197    TsvParser parser = new TsvParser("col_a,HBASE_ROW_KEY", "\t");
198    byte[] line = Bytes.toBytes("only_cola_data_and_no_row_key");
199    parser.parse(line, line.length);
200  }
201
202  @Test(expected = BadTsvLineException.class)
203  public void testTsvParserInvalidTimestamp() throws BadTsvLineException {
204    TsvParser parser = new TsvParser("HBASE_ROW_KEY,HBASE_TS_KEY,col_a,", "\t");
205    assertEquals(1, parser.getTimestampKeyColumnIndex());
206    byte[] line = Bytes.toBytes("rowkey\ttimestamp\tval_a");
207    ParsedLine parsed = parser.parse(line, line.length);
208    assertEquals(-1, parsed.getTimestamp(-1));
209    checkParsing(parsed, Splitter.on("\t").split(Bytes.toString(line)));
210  }
211
212  @Test(expected = BadTsvLineException.class)
213  public void testTsvParserNoTimestampValue() throws BadTsvLineException {
214    TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a,HBASE_TS_KEY", "\t");
215    assertEquals(2, parser.getTimestampKeyColumnIndex());
216    byte[] line = Bytes.toBytes("rowkey\tval_a");
217    parser.parse(line, line.length);
218  }
219
220  @Test
221  public void testTsvParserParseRowKey() throws BadTsvLineException {
222    TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a,HBASE_TS_KEY", "\t");
223    assertEquals(0, parser.getRowKeyColumnIndex());
224    byte[] line = Bytes.toBytes("rowkey\tval_a\t1234");
225    Pair<Integer, Integer> rowKeyOffsets = parser.parseRowKey(line, line.length);
226    assertEquals(0, rowKeyOffsets.getFirst().intValue());
227    assertEquals(6, rowKeyOffsets.getSecond().intValue());
228    try {
229      line = Bytes.toBytes("\t\tval_a\t1234");
230      parser.parseRowKey(line, line.length);
231      fail("Should get BadTsvLineException on empty rowkey.");
232    } catch (BadTsvLineException ignored) {
233    }
234
235    parser = new TsvParser("col_a,HBASE_ROW_KEY,HBASE_TS_KEY", "\t");
236    assertEquals(1, parser.getRowKeyColumnIndex());
237    line = Bytes.toBytes("val_a\trowkey\t1234");
238    rowKeyOffsets = parser.parseRowKey(line, line.length);
239    assertEquals(6, rowKeyOffsets.getFirst().intValue());
240    assertEquals(6, rowKeyOffsets.getSecond().intValue());
241    try {
242      line = Bytes.toBytes("val_a");
243      rowKeyOffsets = parser.parseRowKey(line, line.length);
244      fail("Should get BadTsvLineException when number of columns less than rowkey position.");
245    } catch (BadTsvLineException ignored) {
246    }
247
248    parser = new TsvParser("col_a,HBASE_TS_KEY,HBASE_ROW_KEY", "\t");
249    assertEquals(2, parser.getRowKeyColumnIndex());
250    line = Bytes.toBytes("val_a\t1234\trowkey");
251    rowKeyOffsets = parser.parseRowKey(line, line.length);
252    assertEquals(11, rowKeyOffsets.getFirst().intValue());
253    assertEquals(6, rowKeyOffsets.getSecond().intValue());
254  }
255
256  @Test
257  public void testTsvParseAttributesKey() throws BadTsvLineException {
258    TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a,HBASE_TS_KEY,HBASE_ATTRIBUTES_KEY", "\t");
259    assertEquals(0, parser.getRowKeyColumnIndex());
260    byte[] line = Bytes.toBytes("rowkey\tval_a\t1234\tkey=>value");
261    ParsedLine parse = parser.parse(line, line.length);
262    assertEquals(18, parse.getAttributeKeyOffset());
263    assertEquals(3, parser.getAttributesKeyColumnIndex());
264    String[] attributes = parse.getIndividualAttributes();
265    assertEquals("key=>value", attributes[0]);
266    try {
267      line = Bytes.toBytes("rowkey\tval_a\t1234");
268      parser.parse(line, line.length);
269      fail("Should get BadTsvLineException on empty rowkey.");
270    } catch (BadTsvLineException ignored) {
271    }
272
273    parser = new TsvParser("HBASE_ATTRIBUTES_KEY,col_a,HBASE_ROW_KEY,HBASE_TS_KEY", "\t");
274    assertEquals(2, parser.getRowKeyColumnIndex());
275    line = Bytes.toBytes("key=>value\tval_a\trowkey\t1234");
276    parse = parser.parse(line, line.length);
277    assertEquals(0, parse.getAttributeKeyOffset());
278    assertEquals(0, parser.getAttributesKeyColumnIndex());
279    attributes = parse.getIndividualAttributes();
280    assertEquals("key=>value", attributes[0]);
281    try {
282      line = Bytes.toBytes("val_a");
283      ParsedLine parse2 = parser.parse(line, line.length);
284      fail("Should get BadTsvLineException when number of columns less than rowkey position.");
285    } catch (BadTsvLineException ignored) {
286    }
287
288    parser = new TsvParser("col_a,HBASE_ATTRIBUTES_KEY,HBASE_TS_KEY,HBASE_ROW_KEY", "\t");
289    assertEquals(3, parser.getRowKeyColumnIndex());
290    line = Bytes.toBytes("val_a\tkey0=>value0,key1=>value1,key2=>value2\t1234\trowkey");
291    parse = parser.parse(line, line.length);
292    assertEquals(1, parser.getAttributesKeyColumnIndex());
293    assertEquals(6, parse.getAttributeKeyOffset());
294    String[] attr = parse.getIndividualAttributes();
295    int i = 0;
296    for (String str :  attr) {
297      assertEquals(("key" + i + "=>" + "value" + i), str);
298      i++;
299    }
300  }
301
302  @Test
303  public void testTsvParserWithCellVisibilityCol() throws BadTsvLineException {
304    TsvParser parser = new TsvParser(
305        "HBASE_ROW_KEY,col_a,HBASE_TS_KEY,HBASE_ATTRIBUTES_KEY,HBASE_CELL_VISIBILITY", "\t");
306    assertEquals(0, parser.getRowKeyColumnIndex());
307    assertEquals(4, parser.getCellVisibilityColumnIndex());
308    byte[] line = Bytes.toBytes("rowkey\tval_a\t1234\tkey=>value\tPRIVATE&SECRET");
309    ParsedLine parse = parser.parse(line, line.length);
310    assertEquals(18, parse.getAttributeKeyOffset());
311    assertEquals(3, parser.getAttributesKeyColumnIndex());
312    String[] attributes = parse.getIndividualAttributes();
313    assertEquals("key=>value", attributes[0]);
314    assertEquals(29, parse.getCellVisibilityColumnOffset());
315  }
316}