1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.mapreduce;
20
21 import java.io.Closeable;
22 import java.io.IOException;
23 import java.net.InetAddress;
24 import java.net.InetSocketAddress;
25 import java.net.UnknownHostException;
26 import java.util.ArrayList;
27 import java.util.HashMap;
28 import java.util.List;
29
30 import javax.naming.NamingException;
31
32 import org.apache.commons.logging.Log;
33 import org.apache.commons.logging.LogFactory;
34 import org.apache.hadoop.conf.Configuration;
35 import org.apache.hadoop.hbase.classification.InterfaceAudience;
36 import org.apache.hadoop.hbase.classification.InterfaceStability;
37 import org.apache.hadoop.hbase.HConstants;
38 import org.apache.hadoop.hbase.HRegionLocation;
39 import org.apache.hadoop.hbase.TableName;
40 import org.apache.hadoop.hbase.client.Admin;
41 import org.apache.hadoop.hbase.client.Connection;
42 import org.apache.hadoop.hbase.client.ConnectionFactory;
43 import org.apache.hadoop.hbase.client.HTable;
44 import org.apache.hadoop.hbase.client.NeedUnmanagedConnectionException;
45 import org.apache.hadoop.hbase.client.RegionLocator;
46 import org.apache.hadoop.hbase.client.Result;
47 import org.apache.hadoop.hbase.client.Scan;
48 import org.apache.hadoop.hbase.client.Table;
49 import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
50 import org.apache.hadoop.hbase.util.Addressing;
51 import org.apache.hadoop.hbase.util.Bytes;
52 import org.apache.hadoop.hbase.util.Pair;
53 import org.apache.hadoop.hbase.util.RegionSizeCalculator;
54 import org.apache.hadoop.hbase.util.Strings;
55 import org.apache.hadoop.mapreduce.InputFormat;
56 import org.apache.hadoop.mapreduce.InputSplit;
57 import org.apache.hadoop.mapreduce.JobContext;
58 import org.apache.hadoop.mapreduce.RecordReader;
59 import org.apache.hadoop.mapreduce.TaskAttemptContext;
60 import org.apache.hadoop.net.DNS;
61 import org.apache.hadoop.util.StringUtils;
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103 @InterfaceAudience.Public
104 @InterfaceStability.Stable
105 public abstract class TableInputFormatBase
106 extends InputFormat<ImmutableBytesWritable, Result> {
107
108
/**
 * Configuration key read by {@code getSplits}: when {@code true} (default {@code false}) the
 * per-region splits are post-processed by {@code calculateRebalancedSplits}.
 */
public static final String MAPREDUCE_INPUT_AUTOBALANCE = "hbase.mapreduce.input.autobalance";

/**
 * Configuration key read by {@code calculateRebalancedSplits} (as a long, default 3). The value
 * is multiplied by the average split size to obtain the skew threshold.
 */
public static final String INPUT_AUTOBALANCE_MAXSKEWRATIO = "hbase.mapreduce.input.autobalance" +
    ".maxskewratio";

/**
 * Configuration key read by {@code calculateRebalancedSplits} (as a boolean, default
 * {@code true}) and forwarded to {@code getSplitKey} as its {@code isText} argument.
 */
public static final String TABLE_ROW_TEXTKEY = "hbase.table.row.textkey";

/** Logger for this input format (instance field, package-private). */
final Log LOG = LogFactory.getLog(TableInputFormatBase.class);

/** Message of the IllegalStateException thrown by the getters while a field is still null. */
private static final String NOT_INITIALIZED = "The input format instance has not been properly " +
    "initialized. Ensure you call initializeTable either in your constructor or initialize " +
    "method";
/** Message of the IOException thrown when no table is available after initialization. */
private static final String INITIALIZATION_ERROR = "Cannot create a record reader because of a" +
    " previous error. Please look at the previous logs lines from" +
    " the task's full log for more details.";

/** The scan whose start/stop rows bound the splits; copied for every record reader. */
private Scan scan = null;
/** Admin handle; passed to RegionSizeCalculator and closed by closeTable. */
private Admin admin;
/** The table to read from; null until initialized and again after closeTable. */
private Table table;
/** Locator used to fetch region boundaries and locations. */
private RegionLocator regionLocator;
/** Optional reader supplied through setTableRecordReader; a new one is created when null. */
private TableRecordReader tableRecordReader = null;
/** Connection backing table, regionLocator and admin; closed by closeTable. */
private Connection connection;

/** Cache of reverse DNS results (address to host name) filled by reverseDNS. */
private HashMap<InetAddress, String> reverseDNSCacheMap =
    new HashMap<InetAddress, String>();
146
147
148
149
150
151
152
153
154
155
156
157
158
159 @Override
160 public RecordReader<ImmutableBytesWritable, Result> createRecordReader(
161 InputSplit split, TaskAttemptContext context)
162 throws IOException {
163
164 if (table == null) {
165 initialize(context);
166 }
167
168 try {
169 if (getTable() == null) {
170
171 throw new IOException(INITIALIZATION_ERROR);
172 }
173 } catch (IllegalStateException exception) {
174 throw new IOException(INITIALIZATION_ERROR, exception);
175 }
176 TableSplit tSplit = (TableSplit) split;
177 LOG.info("Input split length: " + StringUtils.humanReadableInt(tSplit.getLength()) + " bytes.");
178 final TableRecordReader trr =
179 this.tableRecordReader != null ? this.tableRecordReader : new TableRecordReader();
180 Scan sc = new Scan(this.scan);
181 sc.setStartRow(tSplit.getStartRow());
182 sc.setStopRow(tSplit.getEndRow());
183 trr.setScan(sc);
184 trr.setTable(getTable());
185 return new RecordReader<ImmutableBytesWritable, Result>() {
186
187 @Override
188 public void close() throws IOException {
189 trr.close();
190 closeTable();
191 }
192
193 @Override
194 public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException {
195 return trr.getCurrentKey();
196 }
197
198 @Override
199 public Result getCurrentValue() throws IOException, InterruptedException {
200 return trr.getCurrentValue();
201 }
202
203 @Override
204 public float getProgress() throws IOException, InterruptedException {
205 return trr.getProgress();
206 }
207
208 @Override
209 public void initialize(InputSplit inputsplit, TaskAttemptContext context) throws IOException,
210 InterruptedException {
211 trr.initialize(inputsplit, context);
212 }
213
214 @Override
215 public boolean nextKeyValue() throws IOException, InterruptedException {
216 return trr.nextKeyValue();
217 }
218 };
219 }
220
221 protected Pair<byte[][],byte[][]> getStartEndKeys() throws IOException {
222 return getRegionLocator().getStartEndKeys();
223 }
224
225
226
227
228
229
230
231
232
233
234
235 @Override
236 public List<InputSplit> getSplits(JobContext context) throws IOException {
237 boolean closeOnFinish = false;
238
239
240 if (table == null) {
241 initialize(context);
242 closeOnFinish = true;
243 }
244
245
246 try {
247 if (getTable() == null) {
248
249 throw new IOException(INITIALIZATION_ERROR);
250 }
251 } catch (IllegalStateException exception) {
252 throw new IOException(INITIALIZATION_ERROR, exception);
253 }
254
255 try {
256 RegionSizeCalculator sizeCalculator = new RegionSizeCalculator(regionLocator, admin);
257
258 Pair<byte[][], byte[][]> keys = getStartEndKeys();
259 if (keys == null || keys.getFirst() == null ||
260 keys.getFirst().length == 0) {
261 HRegionLocation regLoc = regionLocator.getRegionLocation(HConstants.EMPTY_BYTE_ARRAY, false);
262 if (null == regLoc) {
263 throw new IOException("Expecting at least one region.");
264 }
265 List<InputSplit> splits = new ArrayList<InputSplit>(1);
266 long regionSize = sizeCalculator.getRegionSize(regLoc.getRegionInfo().getRegionName());
267 TableSplit split = new TableSplit(table.getName(),
268 HConstants.EMPTY_BYTE_ARRAY, HConstants.EMPTY_BYTE_ARRAY, regLoc
269 .getHostnamePort().split(Addressing.HOSTNAME_PORT_SEPARATOR)[0], regionSize);
270 splits.add(split);
271 return splits;
272 }
273 List<InputSplit> splits = new ArrayList<InputSplit>(keys.getFirst().length);
274 for (int i = 0; i < keys.getFirst().length; i++) {
275 if ( !includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
276 continue;
277 }
278 HRegionLocation location = regionLocator.getRegionLocation(keys.getFirst()[i], false);
279
280 InetSocketAddress isa = new InetSocketAddress(location.getHostname(), location.getPort());
281 if (isa.isUnresolved()) {
282 LOG.warn("Failed resolve " + isa);
283 }
284 InetAddress regionAddress = isa.getAddress();
285 String regionLocation;
286 try {
287 regionLocation = reverseDNS(regionAddress);
288 } catch (NamingException e) {
289 LOG.warn("Cannot resolve the host name for " + regionAddress + " because of " + e);
290 regionLocation = location.getHostname();
291 }
292
293 byte[] startRow = scan.getStartRow();
294 byte[] stopRow = scan.getStopRow();
295
296 if ((startRow.length == 0 || keys.getSecond()[i].length == 0 ||
297 Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) &&
298 (stopRow.length == 0 ||
299 Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
300 byte[] splitStart = startRow.length == 0 ||
301 Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ?
302 keys.getFirst()[i] : startRow;
303 byte[] splitStop = (stopRow.length == 0 ||
304 Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) &&
305 keys.getSecond()[i].length > 0 ?
306 keys.getSecond()[i] : stopRow;
307
308 byte[] regionName = location.getRegionInfo().getRegionName();
309 long regionSize = sizeCalculator.getRegionSize(regionName);
310 TableSplit split = new TableSplit(table.getName(),
311 splitStart, splitStop, regionLocation, regionSize);
312 splits.add(split);
313 if (LOG.isDebugEnabled()) {
314 LOG.debug("getSplits: split -> " + i + " -> " + split);
315 }
316 }
317 }
318
319 boolean enableAutoBalance = context.getConfiguration().getBoolean(
320 MAPREDUCE_INPUT_AUTOBALANCE, false);
321 if (enableAutoBalance) {
322 long totalRegionSize=0;
323 for (int i = 0; i < splits.size(); i++){
324 TableSplit ts = (TableSplit)splits.get(i);
325 totalRegionSize += ts.getLength();
326 }
327 long averageRegionSize = totalRegionSize / splits.size();
328
329 if (averageRegionSize <= 0) {
330 LOG.warn("The averageRegionSize is not positive: "+ averageRegionSize + ", " +
331 "set it to 1.");
332 averageRegionSize = 1;
333 }
334 return calculateRebalancedSplits(splits, context, averageRegionSize);
335 } else {
336 return splits;
337 }
338 } finally {
339 if (closeOnFinish) {
340 closeTable();
341 }
342 }
343 }
344
345
346
347
348 @Deprecated
349 public String reverseDNS(InetAddress ipAddress) throws NamingException, UnknownHostException {
350 String hostName = this.reverseDNSCacheMap.get(ipAddress);
351 if (hostName == null) {
352 String ipAddressString = null;
353 try {
354 ipAddressString = DNS.reverseDns(ipAddress, null);
355 } catch (Exception e) {
356
357
358
359 ipAddressString = InetAddress.getByName(ipAddress.getHostAddress()).getHostName();
360 }
361 if (ipAddressString == null) throw new UnknownHostException("No host found for " + ipAddress);
362 hostName = Strings.domainNamePointerToHostName(ipAddressString);
363 this.reverseDNSCacheMap.put(ipAddress, hostName);
364 }
365 return hostName;
366 }
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381 public List<InputSplit> calculateRebalancedSplits(List<InputSplit> list, JobContext context,
382 long average) throws IOException {
383 List<InputSplit> resultList = new ArrayList<InputSplit>();
384 Configuration conf = context.getConfiguration();
385
386 long dataSkewRatio = conf.getLong(INPUT_AUTOBALANCE_MAXSKEWRATIO, 3);
387
388 boolean isTextKey = context.getConfiguration().getBoolean(TABLE_ROW_TEXTKEY, true);
389 long dataSkewThreshold = dataSkewRatio * average;
390 int count = 0;
391 while (count < list.size()) {
392 TableSplit ts = (TableSplit)list.get(count);
393 String regionLocation = ts.getRegionLocation();
394 long regionSize = ts.getLength();
395 if (regionSize >= dataSkewThreshold) {
396
397
398 byte[] splitKey = getSplitKey(ts.getStartRow(), ts.getEndRow(), isTextKey);
399
400
401 TableSplit t1 = new TableSplit(table.getName(), ts.getStartRow(), splitKey, regionLocation,
402 regionSize / 2);
403 TableSplit t2 = new TableSplit(table.getName(), splitKey, ts.getEndRow(), regionLocation,
404 regionSize - regionSize / 2);
405 resultList.add(t1);
406 resultList.add(t2);
407 count++;
408 } else if (regionSize >= average) {
409
410
411 resultList.add(ts);
412 count++;
413 } else {
414
415
416 long totalSize = regionSize;
417 byte[] splitStartKey = ts.getStartRow();
418 byte[] splitEndKey = ts.getEndRow();
419 count++;
420 for (; count < list.size(); count++) {
421 TableSplit nextRegion = (TableSplit)list.get(count);
422 long nextRegionSize = nextRegion.getLength();
423 if (totalSize + nextRegionSize <= dataSkewThreshold) {
424 totalSize = totalSize + nextRegionSize;
425 splitEndKey = nextRegion.getEndRow();
426 } else {
427 break;
428 }
429 }
430 TableSplit t = new TableSplit(table.getName(), splitStartKey, splitEndKey,
431 regionLocation, totalSize);
432 resultList.add(t);
433 }
434 }
435 return resultList;
436 }
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454 public static byte[] getSplitKey(byte[] start, byte[] end, boolean isText) {
455 byte upperLimitByte;
456 byte lowerLimitByte;
457
458 if (isText) {
459
460
461 upperLimitByte = '~';
462 lowerLimitByte = ' ';
463 } else {
464 upperLimitByte = Byte.MAX_VALUE;
465 lowerLimitByte = Byte.MIN_VALUE;
466 }
467
468
469
470 if (start.length == 0 && end.length == 0){
471 return new byte[]{(byte) ((lowerLimitByte + upperLimitByte) / 2)};
472 }
473 if (start.length == 0 && end.length != 0){
474 return new byte[]{ end[0] };
475 }
476 if (start.length != 0 && end.length == 0){
477 byte[] result =new byte[start.length];
478 result[0]=start[0];
479 for (int k = 1; k < start.length; k++){
480 result[k] = upperLimitByte;
481 }
482 return result;
483 }
484
485 List resultBytesList = new ArrayList();
486 int maxLength = start.length > end.length ? start.length : end.length;
487 for (int i = 0; i < maxLength; i++) {
488
489
490
491
492 if (start[i] == end[i]) {
493 resultBytesList.add(start[i]);
494
495 if (i + 1 == start.length) {
496 resultBytesList.add((byte) ((lowerLimitByte + end[i + 1]) / 2));
497 break;
498 }
499 } else {
500
501
502 if ((int)end[i] - (int)start[i] == 1) {
503
504 byte startNextByte = (i + 1 < start.length) ? start[i + 1] : lowerLimitByte;
505 byte endNextByte = (i + 1 < end.length) ? end[i + 1] : lowerLimitByte;
506 int byteRange = (upperLimitByte - startNextByte) + (endNextByte - lowerLimitByte) + 1;
507 int halfRange = byteRange / 2;
508 if ((int)startNextByte + halfRange > (int)upperLimitByte) {
509 resultBytesList.add(end[i]);
510 resultBytesList.add((byte) (startNextByte + halfRange - upperLimitByte +
511 lowerLimitByte));
512 } else {
513 resultBytesList.add(start[i]);
514 resultBytesList.add((byte) (startNextByte + halfRange));
515 }
516 } else {
517
518
519 resultBytesList.add((byte) ((start[i] + end[i]) / 2));
520 }
521 break;
522 }
523 }
524
525 byte result[] = new byte[resultBytesList.size()];
526 for (int k = 0; k < resultBytesList.size(); k++) {
527 result[k] = (byte) resultBytesList.get(k);
528 }
529 return result;
530 }
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556 protected boolean includeRegionInSplit(final byte[] startKey, final byte [] endKey) {
557 return true;
558 }
559
560
561
562
563
564
565 @Deprecated
566 protected HTable getHTable() {
567 return (HTable) this.getTable();
568 }
569
570
571
572
573 protected RegionLocator getRegionLocator() {
574 if (regionLocator == null) {
575 throw new IllegalStateException(NOT_INITIALIZED);
576 }
577 return regionLocator;
578 }
579
580
581
582
583 protected Table getTable() {
584 if (table == null) {
585 throw new IllegalStateException(NOT_INITIALIZED);
586 }
587 return table;
588 }
589
590
591
592
593 protected Admin getAdmin() {
594 if (admin == null) {
595 throw new IllegalStateException(NOT_INITIALIZED);
596 }
597 return admin;
598 }
599
600
601
602
603
604
605
606
607
608
609
610 @Deprecated
611 protected void setHTable(HTable table) throws IOException {
612 this.table = table;
613 this.connection = table.getConnection();
614 try {
615 this.regionLocator = table.getRegionLocator();
616 this.admin = this.connection.getAdmin();
617 } catch (NeedUnmanagedConnectionException exception) {
618 LOG.warn("You are using an HTable instance that relies on an HBase-managed Connection. " +
619 "This is usually due to directly creating an HTable, which is deprecated. Instead, you " +
620 "should create a Connection object and then request a Table instance from it. If you " +
621 "don't need the Table instance for your own use, you should instead use the " +
622 "TableInputFormatBase.initalizeTable method directly.");
623 LOG.info("Creating an additional unmanaged connection because user provided one can't be " +
624 "used for administrative actions. We'll close it when we close out the table.");
625 LOG.debug("Details about our failure to request an administrative interface.", exception);
626
627
628 this.connection = ConnectionFactory.createConnection(this.connection.getConfiguration());
629 this.regionLocator = this.connection.getRegionLocator(table.getName());
630 this.admin = this.connection.getAdmin();
631 }
632 }
633
634
635
636
637
638
639
640
641 protected void initializeTable(Connection connection, TableName tableName) throws IOException {
642 if (this.table != null || this.connection != null) {
643 LOG.warn("initializeTable called multiple times. Overwriting connection and table " +
644 "reference; TableInputFormatBase will not close these old references when done.");
645 }
646 this.table = connection.getTable(tableName);
647 this.regionLocator = connection.getRegionLocator(tableName);
648 this.admin = connection.getAdmin();
649 this.connection = connection;
650 }
651
652
653
654
655
656
657 public Scan getScan() {
658 if (this.scan == null) this.scan = new Scan();
659 return scan;
660 }
661
662
663
664
665
666
667 public void setScan(Scan scan) {
668 this.scan = scan;
669 }
670
671
672
673
674
675
676
677 protected void setTableRecordReader(TableRecordReader tableRecordReader) {
678 this.tableRecordReader = tableRecordReader;
679 }
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696 protected void initialize(JobContext context) throws IOException {
697 }
698
699
700
701
702
703
704
705 protected void closeTable() throws IOException {
706 close(admin, table, regionLocator, connection);
707 admin = null;
708 table = null;
709 regionLocator = null;
710 connection = null;
711 }
712
713 private void close(Closeable... closables) throws IOException {
714 for (Closeable c : closables) {
715 if(c != null) { c.close(); }
716 }
717 }
718
719 }