View Javadoc

1   /**
2    * Copyright 2011 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  package org.apache.hadoop.hbase.client;
21  
22  import java.io.IOException;
23  import java.lang.InterruptedException;
24  import java.util.ArrayList;
25  import java.util.HashMap;
26  import java.util.List;
27  import java.util.Map;
28  import org.apache.hadoop.hbase.HRegionLocation;
29  import org.apache.hadoop.hbase.client.HTable;
30  import org.apache.hadoop.hbase.client.Put;
31  import org.apache.hadoop.hbase.client.Row;
32  
33  /**
34   * Utility class for HTable.
35   * 
36   *
37   */
38  public class HTableUtil {
39  
40    private static final int INITIAL_LIST_SIZE = 250;
41  	
42    /**
43     * Processes a List of Puts and writes them to an HTable instance in RegionServer buckets via the htable.put method. 
44     * This will utilize the writeBuffer, thus the writeBuffer flush frequency may be tuned accordingly via htable.setWriteBufferSize. 
45     * <br><br>
46     * The benefit of submitting Puts in this manner is to minimize the number of RegionServer RPCs in each flush.
47     * <br><br>
48     * Assumption #1:  Regions have been pre-created for the table.  If they haven't, then all of the Puts will go to the same region, 
49     * defeating the purpose of this utility method. See the Apache HBase book for an explanation of how to do this.
50     * <br>
51     * Assumption #2:  Row-keys are not monotonically increasing.  See the Apache HBase book for an explanation of this problem.  
52     * <br>
53     * Assumption #3:  That the input list of Puts is big enough to be useful (in the thousands or more).  The intent of this
54     * method is to process larger chunks of data.
55     * <br>
56     * Assumption #4:  htable.setAutoFlush(false) has been set.  This is a requirement to use the writeBuffer.
57     * <br><br>
58     * @param htable HTable instance for target HBase table
59     * @param puts List of Put instances
60     * @throws IOException if a remote or network exception occurs
61     * 
62     */
63    public static void bucketRsPut(HTable htable, List<Put> puts) throws IOException {
64  
65      Map<String, List<Put>> putMap = createRsPutMap(htable, puts);
66      for (List<Put> rsPuts: putMap.values()) {
67        htable.put( rsPuts );
68      }
69      htable.flushCommits();
70    }
71  	
72    /**
73     * Processes a List of Rows (Put, Delete) and writes them to an HTable instance in RegionServer buckets via the htable.batch method. 
74     * <br><br>
75     * The benefit of submitting Puts in this manner is to minimize the number of RegionServer RPCs, thus this will
76     * produce one RPC of Puts per RegionServer.
77     * <br><br>
78     * Assumption #1:  Regions have been pre-created for the table.  If they haven't, then all of the Puts will go to the same region, 
79     * defeating the purpose of this utility method. See the Apache HBase book for an explanation of how to do this.
80     * <br>
81     * Assumption #2:  Row-keys are not monotonically increasing.  See the Apache HBase book for an explanation of this problem.  
82     * <br>
83     * Assumption #3:  That the input list of Rows is big enough to be useful (in the thousands or more).  The intent of this
84     * method is to process larger chunks of data.
85     * <br><br>
86     * This method accepts a list of Row objects because the underlying .batch method accepts a list of Row objects.
87     * <br><br>
88     * @param htable HTable instance for target HBase table
89     * @param rows List of Row instances
90     * @throws IOException if a remote or network exception occurs
91     */
92    public static void bucketRsBatch(HTable htable, List<Row> rows) throws IOException {
93  
94      try {
95        Map<String, List<Row>> rowMap = createRsRowMap(htable, rows);
96        for (List<Row> rsRows: rowMap.values()) {
97          htable.batch( rsRows );
98        }
99      } catch (InterruptedException e) {
100       throw new IOException(e); 
101     }
102 		
103   }
104 
105   private static Map<String,List<Put>> createRsPutMap(HTable htable, List<Put> puts) throws IOException {
106 
107     Map<String, List<Put>> putMap = new HashMap<String, List<Put>>();
108     for (Put put: puts) {
109       HRegionLocation rl = htable.getRegionLocation( put.getRow() );
110       String hostname = rl.getHostname();
111       List<Put> recs = putMap.get( hostname);
112       if (recs == null) {
113         recs = new ArrayList<Put>(INITIAL_LIST_SIZE);
114     		putMap.put( hostname, recs);
115       }
116       recs.add(put);
117     }
118     return putMap;
119   }
120 
121   private static Map<String,List<Row>> createRsRowMap(HTable htable, List<Row> rows) throws IOException {
122 
123     Map<String, List<Row>> rowMap = new HashMap<String, List<Row>>();
124     for (Row row: rows) {
125       HRegionLocation rl = htable.getRegionLocation( row.getRow() );
126       String hostname = rl.getHostname();
127       List<Row> recs = rowMap.get( hostname);
128       if (recs == null) {
129         recs = new ArrayList<Row>(INITIAL_LIST_SIZE);
130         rowMap.put( hostname, recs);
131       }
132       recs.add(row);
133     }
134     return rowMap;
135   }
136 		
137 }