/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapred.TableOutputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * This is used to partition the output keys into groups of keys.
 * Keys are grouped according to the regions that currently exist
 * so that each reducer fills a single region so load is distributed.
 *
 * <p>This class is not suitable as partitioner creating hfiles
 * for incremental bulk loads as region spread will likely change between time of
 * hfile creation and load time. See {@link org.apache.hadoop.hbase.tool.LoadIncrementalHFiles}
 * and <a href="http://hbase.apache.org/book.html#arch.bulk.load">Bulk Load</a>.</p>
 *
 * <p>Initialisation is best effort: if the output table is not configured, or its
 * region start keys cannot be read in {@link #setConf(Configuration)}, the failure is
 * logged and {@link #getPartition(ImmutableBytesWritable, Object, int)} sends every
 * key to partition 0.</p>
 *
 * @param <KEY>  The type of the key.
 * @param <VALUE>  The type of the value.
 */
@InterfaceAudience.Public
public class HRegionPartitioner<KEY, VALUE>
extends Partitioner<ImmutableBytesWritable, VALUE>
implements Configurable {

  private static final Logger LOG = LoggerFactory.getLogger(HRegionPartitioner.class);
  private Configuration conf = null;
  // Connection and locator are not cleaned up; they just die when partitioner is done.
  private Connection connection;
  private RegionLocator locator;
  // Region start keys read once in setConf; stays null if that initialisation failed.
  private byte[][] startKeys;

  /**
   * Gets the partition number for a given key (hence record) given the total
   * number of partitions i.e. number of reduce-tasks for the job.
   *
   * <p>The key is mapped to the index of the region that currently holds it, where
   * the index is the position of that region's start key in the start keys read by
   * {@link #setConf(Configuration)}. When there are fewer partitions than regions,
   * indexes at or beyond <code>numPartitions</code> are folded back into range by
   * hashing the index.</p>
   *
   * <p>Partition 0 is returned when the table has at most one region, when the start
   * keys were never loaded, when the region lookup fails, or when the located region's
   * start key matches none of the cached start keys.</p>
   *
   * @param key  The key to be partitioned.
   * @param value  The entry value.
   * @param numPartitions  The total number of partitions.
   * @return The partition number for the <code>key</code>.
   * @see org.apache.hadoop.mapreduce.Partitioner#getPartition(
   *   java.lang.Object, java.lang.Object, int)
   */
  @Override
  public int getPartition(ImmutableBytesWritable key,
      VALUE value, int numPartitions) {
    // Nothing to look up if setConf could not initialise the locator or start keys,
    // or if there is at most one region: everything goes to partition 0.
    if (this.locator == null || this.startKeys == null || this.startKeys.length <= 1) {
      return 0;
    }
    final byte[] region;
    try {
      // Not sure if this is cached after a split so we could have problems
      // here if a region splits while mapping
      region = this.locator.getRegionLocation(key.get()).getRegionInfo().getStartKey();
    } catch (IOException e) {
      LOG.error(e.toString(), e);
      // Without a region start key there is nothing to compare against; use the
      // same fallback as the "no matching start key" case below.
      return 0;
    }
    for (int i = 0; i < this.startKeys.length; i++) {
      if (Bytes.compareTo(region, this.startKeys[i]) == 0) {
        if (i >= numPartitions) {
          // Cover the case of fewer reducers than regions.
          return (Integer.toString(i).hashCode()
              & Integer.MAX_VALUE) % numPartitions;
        }
        return i;
      }
    }
    // if above fails to find start key that match we need to return something
    return 0;
  }

  /**
   * Returns the current configuration.
   *
   * @return The current configuration.
   * @see org.apache.hadoop.conf.Configurable#getConf()
   */
  @Override
  public Configuration getConf() {
    return conf;
  }

  /**
   * Sets the configuration. This is used to determine the start keys for the
   * given table.
   *
   * <p>The table name is read from {@link TableOutputFormat#OUTPUT_TABLE}. A
   * connection and region locator are opened for it and the table's region start
   * keys are cached. If the table name is missing or any of these steps throws an
   * {@link IOException}, the problem is logged and the start keys are left unset.</p>
   *
   * @param configuration  The configuration to set.
   * @see org.apache.hadoop.conf.Configurable#setConf(
   *   org.apache.hadoop.conf.Configuration)
   */
  @Override
  public void setConf(Configuration configuration) {
    this.conf = HBaseConfiguration.create(configuration);
    String outputTable = conf.get(TableOutputFormat.OUTPUT_TABLE);
    if (outputTable == null) {
      // No table to locate regions for; do not open a connection at all.
      LOG.error("No output table configured under '{}'; all keys will be sent to partition 0",
          TableOutputFormat.OUTPUT_TABLE);
      return;
    }
    try {
      this.connection = ConnectionFactory.createConnection(HBaseConfiguration.create(conf));
      this.locator = this.connection.getRegionLocator(TableName.valueOf(outputTable));
      // Only reached when the locator was created, so it is never dereferenced while null.
      this.startKeys = this.locator.getStartKeys();
    } catch (IOException e) {
      LOG.error(e.toString(), e);
    }
  }
}