001/**
002 *
003 * Licensed to the Apache Software Foundation (ASF) under one
004 * or more contributor license agreements.  See the NOTICE file
005 * distributed with this work for additional information
006 * regarding copyright ownership.  The ASF licenses this file
007 * to you under the Apache License, Version 2.0 (the
008 * "License"); you may not use this file except in compliance
009 * with the License.  You may obtain a copy of the License at
010 *
011 *     http://www.apache.org/licenses/LICENSE-2.0
012 *
013 * Unless required by applicable law or agreed to in writing, software
014 * distributed under the License is distributed on an "AS IS" BASIS,
015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016 * See the License for the specific language governing permissions and
017 * limitations under the License.
018 */
019package org.apache.hadoop.hbase.mapreduce;
020
021import java.io.IOException;
022import org.apache.hadoop.conf.Configuration;
023import org.apache.hadoop.conf.Configured;
024import org.apache.hadoop.fs.Path;
025import org.apache.hadoop.hbase.HBaseConfiguration;
026import org.apache.hadoop.hbase.client.Put;
027import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
028import org.apache.hadoop.hbase.util.Bytes;
029import org.apache.hadoop.io.LongWritable;
030import org.apache.hadoop.io.Text;
031import org.apache.hadoop.mapreduce.Job;
032import org.apache.hadoop.mapreduce.Mapper;
033import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
034import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
035import org.apache.hadoop.util.Tool;
036import org.apache.hadoop.util.ToolRunner;
037import org.apache.yetus.audience.InterfaceAudience;
038import org.slf4j.Logger;
039import org.slf4j.LoggerFactory;
040
041/**
042 * Sample Uploader MapReduce
043 * <p>
044 * This is EXAMPLE code.  You will need to change it to work for your context.
045 * <p>
046 * Uses {@link TableReducer} to put the data into HBase. Change the InputFormat
047 * to suit your data.  In this example, we are importing a CSV file.
048 * <p>
049 * <pre>row,family,qualifier,value</pre>
050 * <p>
051 * The table and columnfamily we're to insert into must preexist.
052 * <p>
053 * There is no reducer in this example as it is not necessary and adds
054 * significant overhead.  If you need to do any massaging of data before
055 * inserting into HBase, you can do this in the map as well.
056 * <p>Do the following to start the MR job:
057 * <pre>
058 * ./bin/hadoop org.apache.hadoop.hbase.mapreduce.SampleUploader /tmp/input.csv TABLE_NAME
059 * </pre>
060 * <p>
061 * This code was written against HBase 0.21 trunk.
062 */
063@InterfaceAudience.Private
064public class SampleUploader extends Configured implements Tool {
065  private static final Logger LOG = LoggerFactory.getLogger(SampleUploader.class);
066
067  private static final String NAME = "SampleUploader";
068
069  static class Uploader extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
070    private long checkpoint = 100;
071    private long count = 0;
072
073    @Override
074    public void map(LongWritable key, Text line, Context context) throws IOException {
075      // Input is a CSV file
076      // Each map() is a single line, where the key is the line number
077      // Each line is comma-delimited; row,family,qualifier,value
078
079      // Split CSV line
080      String [] values = line.toString().split(",");
081      if(values.length != 4) {
082        return;
083      }
084
085      // Extract each value
086      byte [] row = Bytes.toBytes(values[0]);
087      byte [] family = Bytes.toBytes(values[1]);
088      byte [] qualifier = Bytes.toBytes(values[2]);
089      byte [] value = Bytes.toBytes(values[3]);
090
091      // Create Put
092      Put put = new Put(row);
093      put.addColumn(family, qualifier, value);
094
095      // Uncomment below to disable WAL. This will improve performance but means
096      // you will experience data loss in the case of a RegionServer crash.
097      // put.setWriteToWAL(false);
098
099      try {
100        context.write(new ImmutableBytesWritable(row), put);
101      } catch (InterruptedException e) {
102        LOG.error("Interrupted emitting put", e);
103        Thread.currentThread().interrupt();
104      }
105
106      // Set status every checkpoint lines
107      if(++count % checkpoint == 0) {
108        context.setStatus("Emitting Put " + count);
109      }
110    }
111  }
112
113  /**
114   * Job configuration.
115   */
116  public static Job configureJob(Configuration conf, String [] args) throws IOException {
117    Path inputPath = new Path(args[0]);
118    String tableName = args[1];
119    Job job = new Job(conf, NAME + "_" + tableName);
120    job.setJarByClass(Uploader.class);
121    FileInputFormat.setInputPaths(job, inputPath);
122    job.setInputFormatClass(SequenceFileInputFormat.class);
123    job.setMapperClass(Uploader.class);
124    // No reducers.  Just write straight to table.  Call initTableReducerJob
125    // because it sets up the TableOutputFormat.
126    TableMapReduceUtil.initTableReducerJob(tableName, null, job);
127    job.setNumReduceTasks(0);
128    return job;
129  }
130
131  /**
132   * Main entry point.
133   *
134   * @param otherArgs  The command line parameters after ToolRunner handles standard.
135   * @throws Exception When running the job fails.
136   */
137  public int run(String[] otherArgs) throws Exception {
138    if(otherArgs.length != 2) {
139      System.err.println("Wrong number of arguments: " + otherArgs.length);
140      System.err.println("Usage: " + NAME + " <input> <tablename>");
141      return -1;
142    }
143    Job job = configureJob(getConf(), otherArgs);
144    return (job.waitForCompletion(true) ? 0 : 1);
145  }
146
147  public static void main(String[] args) throws Exception {
148    int status = ToolRunner.run(HBaseConfiguration.create(), new SampleUploader(), args);
149    System.exit(status);
150  }
151}