Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.mapreduce;
019
020import java.io.IOException;
021import java.util.Collection;
022import java.util.List;
023import java.util.Map;
024import org.apache.hadoop.conf.Configuration;
025import org.apache.hadoop.fs.Path;
026import org.apache.hadoop.hbase.client.Scan;
027import org.apache.hadoop.mapreduce.InputSplit;
028import org.apache.hadoop.mapreduce.JobContext;
029import org.apache.yetus.audience.InterfaceAudience;
030
031import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
032
033/**
034 * MultiTableSnapshotInputFormat generalizes {@link TableSnapshotInputFormat} allowing a MapReduce
035 * job to run over one or more table snapshots, with one or more scans configured for each.
036 * Internally, the input format delegates to {@link TableSnapshotInputFormat} and thus has the same
037 * performance advantages; see {@link TableSnapshotInputFormat} for more details. Usage is similar
038 * to TableSnapshotInputFormat, with the following exception: initMultiTableSnapshotMapperJob takes
039 * in a map from snapshot name to a collection of scans. For each snapshot in the map, each
040 * corresponding scan will be applied; the overall dataset for the job is defined by the
041 * concatenation of the regions and tables included in each snapshot/scan pair.
042 * {@link TableMapReduceUtil#initMultiTableSnapshotMapperJob (Map, Class, Class, Class, org.apache.hadoop.mapreduce.Job, boolean, Path)}
043 * can be used to configure the job.
044 *
045 * <pre>
046 * {@code
047 * Job job = new Job(conf);
048 * Map<String, Collection<Scan>> snapshotScans = ImmutableMap.of(
049 *    "snapshot1", ImmutableList.of(new Scan(Bytes.toBytes("a"), Bytes.toBytes("b"))),
050 *    "snapshot2", ImmutableList.of(new Scan(Bytes.toBytes("1"), Bytes.toBytes("2")))
051 * );
052 * Path restoreDir = new Path("/tmp/snapshot_restore_dir");
053 * TableMapReduceUtil.initMultiTableSnapshotMapperJob(
054 *     snapshotScans, MyTableMapper.class, MyMapKeyOutput.class,
055 *      MyMapOutputValueWritable.class, job, true, restoreDir);
056 * }
057 * </pre>
058 *
059 * Internally, this input format restores each snapshot into a subdirectory of the given tmp
060 * directory. Input splits and record readers are created as described in
061 * {@link org.apache.hadoop.hbase.mapreduce.TableSnapshotInputFormat} (one per region). See
062 * {@link TableSnapshotInputFormat} for more notes on permissioning; the same caveats apply here.
063 * @see TableSnapshotInputFormat
064 * @see org.apache.hadoop.hbase.client.TableSnapshotScanner
065 */
066@InterfaceAudience.Public
067public class MultiTableSnapshotInputFormat extends TableSnapshotInputFormat {
068
069  private final MultiTableSnapshotInputFormatImpl delegate;
070
071  public MultiTableSnapshotInputFormat() {
072    this.delegate = new MultiTableSnapshotInputFormatImpl();
073  }
074
075  @Override
076  public List<InputSplit> getSplits(JobContext jobContext)
077    throws IOException, InterruptedException {
078    List<TableSnapshotInputFormatImpl.InputSplit> splits =
079      delegate.getSplits(jobContext.getConfiguration());
080    List<InputSplit> rtn = Lists.newArrayListWithCapacity(splits.size());
081
082    for (TableSnapshotInputFormatImpl.InputSplit split : splits) {
083      rtn.add(new TableSnapshotInputFormat.TableSnapshotRegionSplit(split));
084    }
085
086    return rtn;
087  }
088
089  public static void setInput(Configuration configuration,
090    Map<String, Collection<Scan>> snapshotScans, Path tmpRestoreDir) throws IOException {
091    new MultiTableSnapshotInputFormatImpl().setInput(configuration, snapshotScans, tmpRestoreDir);
092  }
093}