/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.client;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hbase.HRegionLocation;

import java.io.IOException;
import java.io.InterruptedIOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Utility class for HTable that buckets Puts and Deletes by target RegionServer,
 * so that each flush sends fewer, larger RPCs.
 */
@InterfaceAudience.Private
public class HTableUtil {

  private static final int INITIAL_LIST_SIZE = 250;

  /**
   * Processes a List of Puts and writes them to an HTable instance in RegionServer buckets via the htable.put method.
   * This utilizes the writeBuffer, so the flush frequency can be tuned via htable.setWriteBufferSize.
   * <br><br>
   * The benefit of submitting Puts in this manner is to minimize the number of RegionServer RPCs in each flush.
   * <br><br>
   * Assumption #1:  Regions have been pre-created for the table.  If they haven't, then all of the Puts will go to the same region,
   * defeating the purpose of this utility method.  See the Apache HBase book for an explanation of how to do this.
   * <br>
   * Assumption #2:  Row-keys are not monotonically increasing.  See the Apache HBase book for an explanation of this problem.
   * <br>
   * Assumption #3:  The input list of Puts is big enough to be useful (in the thousands or more).  The intent of this
   * method is to process larger chunks of data.
   * <br>
   * Assumption #4:  htable.setAutoFlush(false) has been set.  This is a requirement for using the writeBuffer.
   * <br><br>
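   * Example usage (a minimal sketch; {@code conf}, the table name, and the row
   * contents below are illustrative assumptions, not prescribed by this method):
   * <pre>{@code
   * HTable htable = new HTable(conf, "myTable");
   * htable.setAutoFlush(false);                  // required to use the writeBuffer
   * htable.setWriteBufferSize(12 * 1024 * 1024); // tune flush frequency
   * List<Put> puts = new ArrayList<Put>(10000);
   * for (int i = 0; i < 10000; i++) {
   *   Put put = new Put(Bytes.toBytes("row-" + i));
   *   put.add(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("v" + i));
   *   puts.add(put);
   * }
   * HTableUtil.bucketRsPut(htable, puts);
   * }</pre>
   * <br>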
   * @param htable HTable instance for target HBase table
   * @param puts List of Put instances
   * @throws IOException if a remote or network exception occurs
   */
  public static void bucketRsPut(HTable htable, List<Put> puts) throws IOException {
    Map<String, List<Put>> putMap = createRsPutMap(htable, puts);
    for (List<Put> rsPuts : putMap.values()) {
      htable.put(rsPuts);
    }
    htable.flushCommits();
  }

  /**
   * Processes a List of Rows (Put, Delete) and writes them to an HTable instance in RegionServer buckets via the htable.batch method.
   * <br><br>
   * The benefit of submitting the operations in this manner is to minimize the number of RegionServer RPCs: this
   * produces one batch RPC per RegionServer.
   * <br><br>
   * Assumption #1:  Regions have been pre-created for the table.  If they haven't, then all of the operations will go to the same region,
   * defeating the purpose of this utility method.  See the Apache HBase book for an explanation of how to do this.
   * <br>
   * Assumption #2:  Row-keys are not monotonically increasing.  See the Apache HBase book for an explanation of this problem.
   * <br>
   * Assumption #3:  The input list of Rows is big enough to be useful (in the thousands or more).  The intent of this
   * method is to process larger chunks of data.
   * <br><br>
   * This method accepts a list of Row objects because the underlying .batch method accepts a list of Row objects.
   * <br><br>
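   * Example usage (a minimal sketch; {@code htable} and the row contents below
   * are illustrative assumptions, not prescribed by this method):
   * <pre>{@code
   * List<Row> rows = new ArrayList<Row>();
   * Put put = new Put(Bytes.toBytes("row-1"));
   * put.add(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("v1"));
   * rows.add(put);
   * rows.add(new Delete(Bytes.toBytes("row-2")));
   * HTableUtil.bucketRsBatch(htable, rows);
   * }</pre>
   * <br>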
   * @param htable HTable instance for target HBase table
   * @param rows List of Row instances
   * @throws IOException if a remote or network exception occurs
   */
  public static void bucketRsBatch(HTable htable, List<Row> rows) throws IOException {
    try {
      Map<String, List<Row>> rowMap = createRsRowMap(htable, rows);
      for (List<Row> rsRows : rowMap.values()) {
        htable.batch(rsRows);
      }
    } catch (InterruptedException e) {
      throw (InterruptedIOException) new InterruptedIOException().initCause(e);
    }
  }

  private static Map<String, List<Put>> createRsPutMap(RegionLocator htable, List<Put> puts) throws IOException {
    Map<String, List<Put>> putMap = new HashMap<String, List<Put>>();
    for (Put put : puts) {
      // Bucket each Put by the hostname of the RegionServer hosting its target region.
      HRegionLocation rl = htable.getRegionLocation(put.getRow());
      String hostname = rl.getHostname();
      List<Put> recs = putMap.get(hostname);
      if (recs == null) {
        recs = new ArrayList<Put>(INITIAL_LIST_SIZE);
        putMap.put(hostname, recs);
      }
      recs.add(put);
    }
    return putMap;
  }

  private static Map<String, List<Row>> createRsRowMap(RegionLocator htable, List<Row> rows) throws IOException {
    Map<String, List<Row>> rowMap = new HashMap<String, List<Row>>();
    for (Row row : rows) {
      // Bucket each Row (Put or Delete) by the hostname of the RegionServer hosting its target region.
      HRegionLocation rl = htable.getRegionLocation(row.getRow());
      String hostname = rl.getHostname();
      List<Row> recs = rowMap.get(hostname);
      if (recs == null) {
        recs = new ArrayList<Row>(INITIAL_LIST_SIZE);
        rowMap.put(hostname, recs);
      }
      recs.add(row);
    }
    return rowMap;
  }
}