View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.client;
20  
21  import org.apache.hadoop.classification.InterfaceAudience;
22  import org.apache.hadoop.classification.InterfaceStability;
23  import org.apache.hadoop.hbase.HRegionLocation;
24  
25  import java.io.IOException;
26  import java.io.InterruptedIOException;
27  import java.util.ArrayList;
28  import java.util.HashMap;
29  import java.util.List;
30  import java.util.Map;
31  
32  /**
33   * Utility class for HTable.
34   * 
35   *
36   */
37  @InterfaceAudience.Private
38  public class HTableUtil {
39  
40    private static final int INITIAL_LIST_SIZE = 250;
41  	
42    /**
43     * Processes a List of Puts and writes them to an HTable instance in RegionServer buckets via the htable.put method. 
44     * This will utilize the writeBuffer, thus the writeBuffer flush frequency may be tuned accordingly via htable.setWriteBufferSize. 
45     * <br><br>
46     * The benefit of submitting Puts in this manner is to minimize the number of RegionServer RPCs in each flush.
47     * <br><br>
48     * Assumption #1:  Regions have been pre-created for the table.  If they haven't, then all of the Puts will go to the same region, 
49     * defeating the purpose of this utility method. See the Apache HBase book for an explanation of how to do this.
50     * <br>
51     * Assumption #2:  Row-keys are not monotonically increasing.  See the Apache HBase book for an explanation of this problem.  
52     * <br>
53     * Assumption #3:  That the input list of Puts is big enough to be useful (in the thousands or more).  The intent of this
54     * method is to process larger chunks of data.
55     * <br>
56     * Assumption #4:  htable.setAutoFlush(false) has been set.  This is a requirement to use the writeBuffer.
57     * <br><br>
58     * @param htable HTable instance for target HBase table
59     * @param puts List of Put instances
60     * @throws IOException if a remote or network exception occurs
61     * 
62     */
63    public static void bucketRsPut(HTable htable, List<Put> puts) throws IOException {
64  
65      Map<String, List<Put>> putMap = createRsPutMap(htable, puts);
66      for (List<Put> rsPuts: putMap.values()) {
67        htable.put( rsPuts );
68      }
69      htable.flushCommits();
70    }
71  	
72    /**
73     * Processes a List of Rows (Put, Delete) and writes them to an HTable instance in RegionServer buckets via the htable.batch method. 
74     * <br><br>
75     * The benefit of submitting Puts in this manner is to minimize the number of RegionServer RPCs, thus this will
76     * produce one RPC of Puts per RegionServer.
77     * <br><br>
78     * Assumption #1:  Regions have been pre-created for the table.  If they haven't, then all of the Puts will go to the same region, 
79     * defeating the purpose of this utility method. See the Apache HBase book for an explanation of how to do this.
80     * <br>
81     * Assumption #2:  Row-keys are not monotonically increasing.  See the Apache HBase book for an explanation of this problem.  
82     * <br>
83     * Assumption #3:  That the input list of Rows is big enough to be useful (in the thousands or more).  The intent of this
84     * method is to process larger chunks of data.
85     * <br><br>
86     * This method accepts a list of Row objects because the underlying .batch method accepts a list of Row objects.
87     * <br><br>
88     * @param htable HTable instance for target HBase table
89     * @param rows List of Row instances
90     * @throws IOException if a remote or network exception occurs
91     */
92    public static void bucketRsBatch(HTable htable, List<Row> rows) throws IOException {
93  
94      try {
95        Map<String, List<Row>> rowMap = createRsRowMap(htable, rows);
96        for (List<Row> rsRows: rowMap.values()) {
97          htable.batch( rsRows );
98        }
99      } catch (InterruptedException e) {
100       throw (InterruptedIOException)new InterruptedIOException().initCause(e);
101     }
102 		
103   }
104 
105   private static Map<String,List<Put>> createRsPutMap(HTable htable, List<Put> puts) throws IOException {
106 
107     Map<String, List<Put>> putMap = new HashMap<String, List<Put>>();
108     for (Put put: puts) {
109       HRegionLocation rl = htable.getRegionLocation( put.getRow() );
110       String hostname = rl.getHostname();
111       List<Put> recs = putMap.get( hostname);
112       if (recs == null) {
113         recs = new ArrayList<Put>(INITIAL_LIST_SIZE);
114     		putMap.put( hostname, recs);
115       }
116       recs.add(put);
117     }
118     return putMap;
119   }
120 
121   private static Map<String,List<Row>> createRsRowMap(HTable htable, List<Row> rows) throws IOException {
122 
123     Map<String, List<Row>> rowMap = new HashMap<String, List<Row>>();
124     for (Row row: rows) {
125       HRegionLocation rl = htable.getRegionLocation( row.getRow() );
126       String hostname = rl.getHostname();
127       List<Row> recs = rowMap.get( hostname);
128       if (recs == null) {
129         recs = new ArrayList<Row>(INITIAL_LIST_SIZE);
130         rowMap.put( hostname, recs);
131       }
132       recs.add(row);
133     }
134     return rowMap;
135   }
136 		
137 }