View Javadoc

1   /**
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  package org.apache.hadoop.hbase.mapreduce;
21  
22  import java.util.Iterator;
23  import java.util.List;
24  import java.util.TreeSet;
25  
26  import org.apache.hadoop.hbase.KeyValue;
27  import org.apache.hadoop.hbase.client.Put;
28  import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
29  import org.apache.hadoop.mapreduce.Reducer;
30  import org.apache.hadoop.util.StringUtils;
31  
32  /**
33   * Emits sorted Puts.
34   * Reads in all Puts from passed Iterator, sorts them, then emits
35   * Puts in sorted order.  If lots of columns per row, it will use lots of
36   * memory sorting.
37   * @see HFileOutputFormat
38   * @see KeyValueSortReducer
39   */
40  public class PutSortReducer extends
41      Reducer<ImmutableBytesWritable, Put, ImmutableBytesWritable, KeyValue> {
42    
43    @Override
44    protected void reduce(
45        ImmutableBytesWritable row,
46        java.lang.Iterable<Put> puts,
47        Reducer<ImmutableBytesWritable, Put,
48                ImmutableBytesWritable, KeyValue>.Context context)
49        throws java.io.IOException, InterruptedException
50    {
51      // although reduce() is called per-row, handle pathological case
52      long threshold = context.getConfiguration().getLong(
53          "putsortreducer.row.threshold", 2L * (1<<30));
54      Iterator<Put> iter = puts.iterator();
55      while (iter.hasNext()) {
56        TreeSet<KeyValue> map = new TreeSet<KeyValue>(KeyValue.COMPARATOR);
57        long curSize = 0;
58        // stop at the end or the RAM threshold
59        while (iter.hasNext() && curSize < threshold) {
60          Put p = iter.next();
61          for (List<KeyValue> kvs : p.getFamilyMap().values()) {
62            for (KeyValue kv : kvs) {
63              map.add(kv);
64              curSize += kv.getLength();
65            }
66          }
67        }
68        context.setStatus("Read " + map.size() + " entries of " + map.getClass()
69            + "(" + StringUtils.humanReadableInt(curSize) + ")");
70        int index = 0;
71        for (KeyValue kv : map) {
72          context.write(row, kv);
73          if (++index % 100 == 0)
74            context.setStatus("Wrote " + index);
75        }
76  
77        // if we have more entries to process
78        if (iter.hasNext()) {
79          // force flush because we cannot guarantee intra-row sorted order
80          context.write(null, null);
81        }
82      }
83    }
84  }