001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.mapreduce;
019
020import static org.junit.jupiter.api.Assertions.assertEquals;
021import static org.junit.jupiter.api.Assertions.assertFalse;
022import static org.junit.jupiter.api.Assertions.assertNull;
023import static org.junit.jupiter.api.Assertions.assertThrows;
024import static org.junit.jupiter.api.Assertions.assertTrue;
025import static org.junit.jupiter.api.Assertions.fail;
026
027import java.util.ArrayList;
028import org.apache.hadoop.hbase.HConstants;
029import org.apache.hadoop.hbase.mapreduce.ImportTsv.TsvParser;
030import org.apache.hadoop.hbase.mapreduce.ImportTsv.TsvParser.BadTsvLineException;
031import org.apache.hadoop.hbase.mapreduce.ImportTsv.TsvParser.ParsedLine;
032import org.apache.hadoop.hbase.testclassification.MapReduceTests;
033import org.apache.hadoop.hbase.testclassification.SmallTests;
034import org.apache.hadoop.hbase.util.Bytes;
035import org.apache.hadoop.hbase.util.Pair;
036import org.junit.jupiter.api.Tag;
037import org.junit.jupiter.api.Test;
038
039import org.apache.hbase.thirdparty.com.google.common.base.Joiner;
040import org.apache.hbase.thirdparty.com.google.common.base.Splitter;
041import org.apache.hbase.thirdparty.com.google.common.collect.Iterables;
042
043/**
044 * Tests for {@link TsvParser}.
045 */
046@Tag(MapReduceTests.TAG)
047@Tag(SmallTests.TAG)
048public class TestImportTsvParser {
049
050  private void assertBytesEquals(byte[] a, byte[] b) {
051    assertEquals(Bytes.toStringBinary(a), Bytes.toStringBinary(b));
052  }
053
054  private void checkParsing(ParsedLine parsed, Iterable<String> expected) {
055    ArrayList<String> parsedCols = new ArrayList<>();
056    for (int i = 0; i < parsed.getColumnCount(); i++) {
057      parsedCols.add(Bytes.toString(parsed.getLineBytes(), parsed.getColumnOffset(i),
058        parsed.getColumnLength(i)));
059    }
060    if (!Iterables.elementsEqual(parsedCols, expected)) {
061      fail("Expected: " + Joiner.on(",").join(expected) + "\n" + "Got:"
062        + Joiner.on(",").join(parsedCols));
063    }
064  }
065
066  @Test
067  public void testTsvParserSpecParsing() {
068    TsvParser parser;
069
070    parser = new TsvParser("HBASE_ROW_KEY", "\t");
071    assertNull(parser.getFamily(0));
072    assertNull(parser.getQualifier(0));
073    assertEquals(0, parser.getRowKeyColumnIndex());
074    assertFalse(parser.hasTimestamp());
075
076    parser = new TsvParser("HBASE_ROW_KEY,col1:scol1", "\t");
077    assertNull(parser.getFamily(0));
078    assertNull(parser.getQualifier(0));
079    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(1));
080    assertBytesEquals(Bytes.toBytes("scol1"), parser.getQualifier(1));
081    assertEquals(0, parser.getRowKeyColumnIndex());
082    assertFalse(parser.hasTimestamp());
083
084    parser = new TsvParser("HBASE_ROW_KEY,col1:scol1,col1:scol2", "\t");
085    assertNull(parser.getFamily(0));
086    assertNull(parser.getQualifier(0));
087    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(1));
088    assertBytesEquals(Bytes.toBytes("scol1"), parser.getQualifier(1));
089    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(2));
090    assertBytesEquals(Bytes.toBytes("scol2"), parser.getQualifier(2));
091    assertEquals(0, parser.getRowKeyColumnIndex());
092    assertFalse(parser.hasTimestamp());
093
094    parser = new TsvParser("HBASE_ROW_KEY,col1:scol1,HBASE_TS_KEY,col1:scol2", "\t");
095    assertNull(parser.getFamily(0));
096    assertNull(parser.getQualifier(0));
097    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(1));
098    assertBytesEquals(Bytes.toBytes("scol1"), parser.getQualifier(1));
099    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(3));
100    assertBytesEquals(Bytes.toBytes("scol2"), parser.getQualifier(3));
101    assertEquals(0, parser.getRowKeyColumnIndex());
102    assertTrue(parser.hasTimestamp());
103    assertEquals(2, parser.getTimestampKeyColumnIndex());
104
105    parser =
106      new TsvParser("HBASE_ROW_KEY,col1:scol1,HBASE_TS_KEY,col1:scol2,HBASE_ATTRIBUTES_KEY", "\t");
107    assertNull(parser.getFamily(0));
108    assertNull(parser.getQualifier(0));
109    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(1));
110    assertBytesEquals(Bytes.toBytes("scol1"), parser.getQualifier(1));
111    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(3));
112    assertBytesEquals(Bytes.toBytes("scol2"), parser.getQualifier(3));
113    assertEquals(0, parser.getRowKeyColumnIndex());
114    assertTrue(parser.hasTimestamp());
115    assertEquals(2, parser.getTimestampKeyColumnIndex());
116    assertEquals(4, parser.getAttributesKeyColumnIndex());
117
118    parser =
119      new TsvParser("HBASE_ATTRIBUTES_KEY,col1:scol1,HBASE_TS_KEY,col1:scol2,HBASE_ROW_KEY", "\t");
120    assertNull(parser.getFamily(0));
121    assertNull(parser.getQualifier(0));
122    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(1));
123    assertBytesEquals(Bytes.toBytes("scol1"), parser.getQualifier(1));
124    assertBytesEquals(Bytes.toBytes("col1"), parser.getFamily(3));
125    assertBytesEquals(Bytes.toBytes("scol2"), parser.getQualifier(3));
126    assertEquals(4, parser.getRowKeyColumnIndex());
127    assertTrue(parser.hasTimestamp());
128    assertEquals(2, parser.getTimestampKeyColumnIndex());
129    assertEquals(0, parser.getAttributesKeyColumnIndex());
130  }
131
132  @Test
133  public void testTsvParser() throws BadTsvLineException {
134    TsvParser parser = new TsvParser("col_a,col_b:qual,HBASE_ROW_KEY,col_d", "\t");
135    assertBytesEquals(Bytes.toBytes("col_a"), parser.getFamily(0));
136    assertBytesEquals(HConstants.EMPTY_BYTE_ARRAY, parser.getQualifier(0));
137    assertBytesEquals(Bytes.toBytes("col_b"), parser.getFamily(1));
138    assertBytesEquals(Bytes.toBytes("qual"), parser.getQualifier(1));
139    assertNull(parser.getFamily(2));
140    assertNull(parser.getQualifier(2));
141    assertEquals(2, parser.getRowKeyColumnIndex());
142
143    assertEquals(TsvParser.DEFAULT_TIMESTAMP_COLUMN_INDEX, parser.getTimestampKeyColumnIndex());
144
145    byte[] line = Bytes.toBytes("val_a\tval_b\tval_c\tval_d");
146    ParsedLine parsed = parser.parse(line, line.length);
147    checkParsing(parsed, Splitter.on("\t").split(Bytes.toString(line)));
148  }
149
150  @Test
151  public void testTsvParserWithTimestamp() throws BadTsvLineException {
152    TsvParser parser = new TsvParser("HBASE_ROW_KEY,HBASE_TS_KEY,col_a,", "\t");
153    assertNull(parser.getFamily(0));
154    assertNull(parser.getQualifier(0));
155    assertNull(parser.getFamily(1));
156    assertNull(parser.getQualifier(1));
157    assertBytesEquals(Bytes.toBytes("col_a"), parser.getFamily(2));
158    assertBytesEquals(HConstants.EMPTY_BYTE_ARRAY, parser.getQualifier(2));
159    assertEquals(0, parser.getRowKeyColumnIndex());
160    assertEquals(1, parser.getTimestampKeyColumnIndex());
161
162    byte[] line = Bytes.toBytes("rowkey\t1234\tval_a");
163    ParsedLine parsed = parser.parse(line, line.length);
164    assertEquals(1234L, parsed.getTimestamp(-1));
165    checkParsing(parsed, Splitter.on("\t").split(Bytes.toString(line)));
166  }
167
168  /**
169   * Test cases that throw BadTsvLineException
170   */
171  @Test
172  public void testTsvParserBadTsvLineExcessiveColumns() throws BadTsvLineException {
173    TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a", "\t");
174    byte[] line = Bytes.toBytes("val_a\tval_b\tval_c");
175    assertThrows(BadTsvLineException.class, () -> parser.parse(line, line.length));
176  }
177
178  @Test
179  public void testTsvParserBadTsvLineZeroColumn() throws BadTsvLineException {
180    TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a", "\t");
181    byte[] line = Bytes.toBytes("");
182    assertThrows(BadTsvLineException.class, () -> parser.parse(line, line.length));
183  }
184
185  @Test
186  public void testTsvParserBadTsvLineOnlyKey() throws BadTsvLineException {
187    TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a", "\t");
188    byte[] line = Bytes.toBytes("key_only");
189    assertThrows(BadTsvLineException.class, () -> parser.parse(line, line.length));
190  }
191
192  @Test
193  public void testTsvParserBadTsvLineNoRowKey() throws BadTsvLineException {
194    TsvParser parser = new TsvParser("col_a,HBASE_ROW_KEY", "\t");
195    byte[] line = Bytes.toBytes("only_cola_data_and_no_row_key");
196    assertThrows(BadTsvLineException.class, () -> parser.parse(line, line.length));
197  }
198
199  @Test
200  public void testTsvParserInvalidTimestamp() throws BadTsvLineException {
201    TsvParser parser = new TsvParser("HBASE_ROW_KEY,HBASE_TS_KEY,col_a,", "\t");
202    assertEquals(1, parser.getTimestampKeyColumnIndex());
203    byte[] line = Bytes.toBytes("rowkey\ttimestamp\tval_a");
204    ParsedLine parsed = parser.parse(line, line.length);
205    assertThrows(BadTsvLineException.class, () -> parsed.getTimestamp(-1));
206    checkParsing(parsed, Splitter.on("\t").split(Bytes.toString(line)));
207  }
208
209  @Test
210  public void testTsvParserNoTimestampValue() throws BadTsvLineException {
211    TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a,HBASE_TS_KEY", "\t");
212    assertEquals(2, parser.getTimestampKeyColumnIndex());
213    byte[] line = Bytes.toBytes("rowkey\tval_a");
214    assertThrows(BadTsvLineException.class, () -> parser.parse(line, line.length));
215  }
216
217  @Test
218  public void testTsvParserParseRowKey() throws BadTsvLineException {
219    TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a,HBASE_TS_KEY", "\t");
220    assertEquals(0, parser.getRowKeyColumnIndex());
221    byte[] line = Bytes.toBytes("rowkey\tval_a\t1234");
222    Pair<Integer, Integer> rowKeyOffsets = parser.parseRowKey(line, line.length);
223    assertEquals(0, rowKeyOffsets.getFirst().intValue());
224    assertEquals(6, rowKeyOffsets.getSecond().intValue());
225
226    byte[] line2 = Bytes.toBytes("\t\tval_a\t1234");
227    assertThrows(BadTsvLineException.class, () -> parser.parseRowKey(line2, line2.length));
228
229    TsvParser parser2 = new TsvParser("col_a,HBASE_ROW_KEY,HBASE_TS_KEY", "\t");
230    assertEquals(1, parser2.getRowKeyColumnIndex());
231    line = Bytes.toBytes("val_a\trowkey\t1234");
232    rowKeyOffsets = parser2.parseRowKey(line, line.length);
233    assertEquals(6, rowKeyOffsets.getFirst().intValue());
234    assertEquals(6, rowKeyOffsets.getSecond().intValue());
235
236    byte[] line3 = Bytes.toBytes("val_a");
237    assertThrows(BadTsvLineException.class, () -> parser2.parseRowKey(line3, line3.length));
238
239    TsvParser parser3 = new TsvParser("col_a,HBASE_TS_KEY,HBASE_ROW_KEY", "\t");
240    assertEquals(2, parser3.getRowKeyColumnIndex());
241    line = Bytes.toBytes("val_a\t1234\trowkey");
242    rowKeyOffsets = parser3.parseRowKey(line, line.length);
243    assertEquals(11, rowKeyOffsets.getFirst().intValue());
244    assertEquals(6, rowKeyOffsets.getSecond().intValue());
245  }
246
247  @Test
248  public void testTsvParseAttributesKey() throws BadTsvLineException {
249    TsvParser parser = new TsvParser("HBASE_ROW_KEY,col_a,HBASE_TS_KEY,HBASE_ATTRIBUTES_KEY", "\t");
250    assertEquals(0, parser.getRowKeyColumnIndex());
251    byte[] line = Bytes.toBytes("rowkey\tval_a\t1234\tkey=>value");
252    ParsedLine parse = parser.parse(line, line.length);
253    assertEquals(18, parse.getAttributeKeyOffset());
254    assertEquals(3, parser.getAttributesKeyColumnIndex());
255    String[] attributes = parse.getIndividualAttributes();
256    assertEquals("key=>value", attributes[0]);
257    byte[] line2 = Bytes.toBytes("rowkey\tval_a\t1234");
258    TsvParser finalParser = parser;
259    assertThrows(BadTsvLineException.class, () -> finalParser.parse(line2, line2.length));
260
261    TsvParser parser2 =
262      new TsvParser("HBASE_ATTRIBUTES_KEY,col_a,HBASE_ROW_KEY,HBASE_TS_KEY", "\t");
263    assertEquals(2, parser2.getRowKeyColumnIndex());
264    line = Bytes.toBytes("key=>value\tval_a\trowkey\t1234");
265    parse = parser2.parse(line, line.length);
266    assertEquals(0, parse.getAttributeKeyOffset());
267    assertEquals(0, parser2.getAttributesKeyColumnIndex());
268    attributes = parse.getIndividualAttributes();
269    assertEquals("key=>value", attributes[0]);
270    byte[] line3 = Bytes.toBytes("val_a");
271    assertThrows(BadTsvLineException.class, () -> parser2.parse(line3, line3.length));
272
273    parser = new TsvParser("col_a,HBASE_ATTRIBUTES_KEY,HBASE_TS_KEY,HBASE_ROW_KEY", "\t");
274    assertEquals(3, parser.getRowKeyColumnIndex());
275    line = Bytes.toBytes("val_a\tkey0=>value0,key1=>value1,key2=>value2\t1234\trowkey");
276    parse = parser.parse(line, line.length);
277    assertEquals(1, parser.getAttributesKeyColumnIndex());
278    assertEquals(6, parse.getAttributeKeyOffset());
279    String[] attr = parse.getIndividualAttributes();
280    int i = 0;
281    for (String str : attr) {
282      assertEquals(("key" + i + "=>" + "value" + i), str);
283      i++;
284    }
285  }
286
287  @Test
288  public void testTsvParserWithCellVisibilityCol() throws BadTsvLineException {
289    TsvParser parser = new TsvParser(
290      "HBASE_ROW_KEY,col_a,HBASE_TS_KEY,HBASE_ATTRIBUTES_KEY,HBASE_CELL_VISIBILITY", "\t");
291    assertEquals(0, parser.getRowKeyColumnIndex());
292    assertEquals(4, parser.getCellVisibilityColumnIndex());
293    byte[] line = Bytes.toBytes("rowkey\tval_a\t1234\tkey=>value\tPRIVATE&SECRET");
294    ParsedLine parse = parser.parse(line, line.length);
295    assertEquals(18, parse.getAttributeKeyOffset());
296    assertEquals(3, parser.getAttributesKeyColumnIndex());
297    String[] attributes = parse.getIndividualAttributes();
298    assertEquals("key=>value", attributes[0]);
299    assertEquals(29, parse.getCellVisibilityColumnOffset());
300  }
301}