/**
 * Copyright 2010 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.util;

import java.io.IOException;
import java.math.BigInteger;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.NoServerForRegionException;
import org.apache.hadoop.hbase.regionserver.Store;
import org.apache.hadoop.hbase.regionserver.StoreFile;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

/**
 * The {@link RegionSplitter} class provides several utilities to help in the
 * administration lifecycle for developers who choose to manually split regions
 * instead of having HBase handle that automatically. The most useful utilities
 * are:
 * <p>
 * <ul>
 * <li>Create a table with a specified number of pre-split regions
 * <li>Execute a rolling split of all regions on an existing table
 * </ul>
 * <p>
 * Both operations can be safely done on a live server.
 * <p>
 * <b>Question:</b> How do I turn off automatic splitting? <br>
 * <b>Answer:</b> Automatic splitting is determined by the configuration value
 * <i>HConstants.HREGION_MAX_FILESIZE</i>. It is not recommended that you set this
 * to Long.MAX_VALUE in case you forget about manual splits. A suggested setting
 * is 100GB, which would result in > 1hr major compactions if reached.
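 * <p>
 * For example (a sketch; <i>HConstants.HREGION_MAX_FILESIZE</i> corresponds to
 * the "hbase.hregion.max.filesize" key in hbase-site.xml):
 * <pre>
 * &lt;property&gt;
 *   &lt;name&gt;hbase.hregion.max.filesize&lt;/name&gt;
 *   &lt;value&gt;107374182400&lt;/value&gt;  &lt;!-- 100GB --&gt;
 * &lt;/property&gt;
 * </pre>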
 * <p>
 * <b>Question:</b> Why did the original authors decide to manually split? <br>
 * <b>Answer:</b> Specific workload characteristics of our use case allowed us
 * to benefit from a manual split system.
 * <p>
 * <ul>
 * <li>Data (~1k) that would grow instead of being replaced
 * <li>Data growth was roughly uniform across all regions
 * <li>OLTP workload. Data loss is a big deal.
 * </ul>
 * <p>
 * <b>Question:</b> Why is manual splitting good for this workload? <br>
 * <b>Answer:</b> Although automated splitting is not a bad option, there are
 * benefits to manual splitting.
 * <p>
 * <ul>
 * <li>With growing amounts of data, splits will continually be needed. Since
 * you always know exactly what regions you have, long-term debugging and
 * profiling is much easier with manual splits. It is hard to trace the logs to
 * understand region level problems if it keeps splitting and getting renamed.
 * <li>Data offlining bugs + unknown number of split regions == oh crap! If an
 * HLog or StoreFile was mistakenly unprocessed by HBase due to a weird bug and
 * you notice it a day or so later, you can be assured that the regions
 * specified in these files are the same as the current regions and you have
 * fewer headaches trying to restore/replay your data.
 * <li>You can finely tune your compaction algorithm. With roughly uniform data
 * growth, it's easy to cause split / compaction storms as the regions all
 * roughly hit the same data size at the same time. With manual splits, you can
 * let staggered, time-based major compactions spread out your network IO load.
 * </ul>
 * <p>
 * <b>Question:</b> What's the optimal number of pre-split regions to create? <br>
 * <b>Answer:</b> Mileage will vary depending upon your application.
 * <p>
 * The short answer for our application is that we started with 10 pre-split
 * regions / server and watched our data growth over time. It's better to err on
 * the side of too few regions and rolling split later.
 * <p>
 * The more complicated answer is that this depends upon the largest storefile
 * in your region. With a growing data size, this will get larger over time. You
 * want the largest region to be just big enough that the {@link Store} compact
 * selection algorithm only compacts it due to a timed major. If you don't, your
 * cluster can be prone to compaction storms as the algorithm decides to run
 * major compactions on a large series of regions all at once. Note that
 * compaction storms are due to the uniform data growth, not the manual split
 * decision.
 * <p>
 * If you pre-split your regions too thin, you can increase the major compaction
 * interval by configuring HConstants.MAJOR_COMPACTION_PERIOD. If your data size
 * grows too large, use this script to perform a network IO safe rolling split
 * of all regions.
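 * <p>
 * As a sketch, the major compaction interval (in milliseconds) can be raised
 * in hbase-site.xml; HConstants.MAJOR_COMPACTION_PERIOD corresponds to the
 * "hbase.hregion.majorcompaction" key:
 * <pre>
 * &lt;property&gt;
 *   &lt;name&gt;hbase.hregion.majorcompaction&lt;/name&gt;
 *   &lt;value&gt;604800000&lt;/value&gt;  &lt;!-- 7 days --&gt;
 * &lt;/property&gt;
 * </pre>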
 */
public class RegionSplitter {
  static final Log LOG = LogFactory.getLog(RegionSplitter.class);

  /**
   * A generic interface for the RegionSplitter code to use for all of its
   * functionality. Note that the original authors of this code use
   * {@link HexStringSplit} to partition their table and set it as the default,
   * but this interface is provided so you can supply a custom algorithm. To
   * use, create a new derived class from this interface, instantiate it via
   * {@link RegionSplitter#newSplitAlgoInstance} (or pass your class name as
   * the SPLITALGORITHM command-line argument), and call
   * {@link RegionSplitter#createPresplitTable} or
   * {@link RegionSplitter#rollingSplit(String, SplitAlgorithm, Configuration)}.
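   * <p>
   * A minimal sketch of a custom algorithm (the class name and its fixed-width
   * decimal-key assumption are hypothetical):
   * <pre>
   * public class DecimalStringSplit implements RegionSplitter.SplitAlgorithm {
   *   // keys are 8-digit decimal strings, e.g. "00042713"
   *   public byte[] split(byte[] start, byte[] end) {
   *     long s = Long.parseLong(Bytes.toString(start));
   *     long e = Long.parseLong(Bytes.toString(end));
   *     return Bytes.toBytes(String.format("%08d", (s + e) / 2));
   *   }
   *   // split(int), firstRow(), lastRow(), setFirstRow(String),
   *   // setLastRow(String), strToRow(String), rowToStr(byte[]), and
   *   // separator() are implemented analogously
   * }
   * </pre>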
   */
  public static interface SplitAlgorithm {
    /**
     * Split a pre-existing region into 2 regions.
     *
     * @param start
     *          first row (inclusive)
     * @param end
     *          last row (exclusive)
     * @return the split row to use
     */
    byte[] split(byte[] start, byte[] end);

    /**
     * Split an entire table.
     *
     * @param numRegions
     *          number of regions to split the table into
     *
     * @throws RuntimeException
     *           user input is validated at this time. may throw a runtime
     *           exception in response to a parse failure
     * @return array of split keys for the initial regions of the table. The
     *         length of the returned array should be numRegions-1.
     */
    byte[][] split(int numRegions);

    /**
     * In HBase, the first row is represented by an empty byte array. This might
     * cause problems with your split algorithm or row printing. All your APIs
     * will be passed firstRow() instead of an empty array.
     *
     * @return your representation of your first row
     */
    byte[] firstRow();

    /**
     * In HBase, the last row is represented by an empty byte array. This might
     * cause problems with your split algorithm or row printing. All your APIs
     * will be passed lastRow() instead of an empty array.
     *
     * @return your representation of your last row
     */
    byte[] lastRow();

    /**
     * In HBase, the first row is represented by an empty byte array. Set this
     * value to help the split code understand how to evenly divide the first
     * region.
     *
     * @param userInput
     *          raw user input (may throw RuntimeException on parse failure)
     */
    void setFirstRow(String userInput);

    /**
     * In HBase, the last row is represented by an empty byte array. Set this
     * value to help the split code understand how to evenly divide the last
     * region. Note that this last row is inclusive for all rows sharing the
     * same prefix.
     *
     * @param userInput
     *          raw user input (may throw RuntimeException on parse failure)
     */
    void setLastRow(String userInput);

    /**
     * @param input
     *          user or file input for row
     * @return byte array representation of this row for HBase
     */
    byte[] strToRow(String input);

    /**
     * @param row
     *          byte array representing a row in HBase
     * @return String to use for debug & file printing
     */
    String rowToStr(byte[] row);

    /**
     * @return the separator character to use when storing / printing the row
     */
    String separator();
  }

  /**
   * The main function for the RegionSplitter application. Common uses:
   * <p>
   * <ul>
   * <li>create a table named 'myTable' with 60 pre-split regions containing 2
   * column families 'test' & 'rs', assuming the keys are hex-encoded ASCII:
   * <ul>
   * <li>bin/hbase org.apache.hadoop.hbase.util.RegionSplitter -c 60 -f test:rs
   * myTable HexStringSplit
   * </ul>
   * <li>perform a rolling split of 'myTable' (i.e. 60 => 120 regions), with 2
   * outstanding splits at a time, assuming keys are uniformly distributed
   * bytes:
   * <ul>
   * <li>bin/hbase org.apache.hadoop.hbase.util.RegionSplitter -r -o 2 myTable
   * UniformSplit
   * </ul>
   * </ul>
   *
   * There are two SplitAlgorithms built into RegionSplitter, HexStringSplit
   * and UniformSplit. These are different strategies for choosing region
   * boundaries. See their source code for details.
   *
   * @param args
   *          Usage: RegionSplitter &lt;TABLE&gt; &lt;SPLITALGORITHM&gt;
   *          &lt;-c &lt;# regions&gt; -f &lt;family:family:...&gt; | -r
   *          [-o &lt;# outstanding splits&gt;]&gt;
   *          [-D &lt;conf.param=value&gt;]
   * @throws IOException
   *           HBase IO problem
   * @throws InterruptedException
   *           user requested exit
   * @throws ParseException
   *           problem parsing user input
   */
  @SuppressWarnings("static-access")
  public static void main(String[] args) throws IOException,
      InterruptedException, ParseException {
    Configuration conf = HBaseConfiguration.create();

    // parse user input
    Options opt = new Options();
    opt.addOption(OptionBuilder.withArgName("property=value").hasArg()
        .withDescription("Override HBase Configuration Settings").create("D"));
    opt.addOption(OptionBuilder.withArgName("region count").hasArg()
        .withDescription(
            "Create a new table with a pre-split number of regions")
        .create("c"));
    opt.addOption(OptionBuilder.withArgName("family:family:...").hasArg()
        .withDescription(
            "Column Families to create with new table.  Required with -c")
        .create("f"));
    opt.addOption("h", false, "Print this usage help");
    opt.addOption("r", false, "Perform a rolling split of an existing region");
    opt.addOption(OptionBuilder.withArgName("count").hasArg().withDescription(
        "Max outstanding splits that have unfinished major compactions")
        .create("o"));
    opt.addOption(null, "firstrow", true,
        "First Row in Table for Split Algorithm");
    opt.addOption(null, "lastrow", true,
        "Last Row in Table for Split Algorithm");
    opt.addOption(null, "risky", false,
        "Skip verification steps to complete quickly. "
            + "STRONGLY DISCOURAGED for production systems.");
    CommandLine cmd = new GnuParser().parse(opt, args);

    if (cmd.hasOption("D")) {
      for (String confOpt : cmd.getOptionValues("D")) {
        String[] kv = confOpt.split("=", 2);
        if (kv.length == 2) {
          conf.set(kv[0], kv[1]);
          LOG.debug("-D configuration override: " + kv[0] + "=" + kv[1]);
        } else {
          throw new ParseException("-D option format invalid: " + confOpt);
        }
      }
    }

    if (cmd.hasOption("risky")) {
      conf.setBoolean("split.verify", false);
    }

    boolean createTable = cmd.hasOption("c") && cmd.hasOption("f");
    boolean rollingSplit = cmd.hasOption("r");
    boolean oneOperOnly = createTable ^ rollingSplit;

    if (2 != cmd.getArgList().size() || !oneOperOnly || cmd.hasOption("h")) {
      new HelpFormatter().printHelp("RegionSplitter <TABLE> <SPLITALGORITHM>\n"
          + "SPLITALGORITHM is a java class name of a class implementing "
          + "SplitAlgorithm, or one of the special strings HexStringSplit "
          + "or UniformSplit, which are built-in split algorithms. "
          + "HexStringSplit treats keys as hexadecimal ASCII, and "
          + "UniformSplit treats keys as arbitrary bytes.", opt);
      return;
    }
    String tableName = cmd.getArgs()[0];
    String splitClass = cmd.getArgs()[1];
    SplitAlgorithm splitAlgo = newSplitAlgoInstance(conf, splitClass);

    if (cmd.hasOption("firstrow")) {
      splitAlgo.setFirstRow(cmd.getOptionValue("firstrow"));
    }
    if (cmd.hasOption("lastrow")) {
      splitAlgo.setLastRow(cmd.getOptionValue("lastrow"));
    }

    if (createTable) {
      conf.set("split.count", cmd.getOptionValue("c"));
      createPresplitTable(tableName, splitAlgo,
          cmd.getOptionValue("f").split(":"), conf);
    }

    if (rollingSplit) {
      if (cmd.hasOption("o")) {
        conf.set("split.outstanding", cmd.getOptionValue("o"));
      }
      rollingSplit(tableName, splitAlgo, conf);
    }
  }

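  /**
   * Creates tableName with the given column families, pre-split into the
   * number of regions given by the "split.count" configuration value. Unless
   * "split.verify" is false, blocks until all regions are online in META.
   */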
  static void createPresplitTable(String tableName, SplitAlgorithm splitAlgo,
      String[] columnFamilies, Configuration conf) throws IOException,
      InterruptedException {
    final int splitCount = conf.getInt("split.count", 0);
    Preconditions.checkArgument(splitCount > 1, "Split count must be > 1");

    Preconditions.checkArgument(columnFamilies.length > 0,
        "Must specify at least one column family.");
    LOG.debug("Creating table " + tableName + " with " + columnFamilies.length
        + " column families.  Presplitting to " + splitCount + " regions");

    HTableDescriptor desc = new HTableDescriptor(tableName);
    for (String cf : columnFamilies) {
      desc.addFamily(new HColumnDescriptor(Bytes.toBytes(cf)));
    }
    HBaseAdmin admin = new HBaseAdmin(conf);
    Preconditions.checkArgument(!admin.tableExists(tableName),
        "Table already exists: " + tableName);
    admin.createTable(desc, splitAlgo.split(splitCount));
    LOG.debug("Table created!  Waiting for regions to show online in META...");

    if (conf.getBoolean("split.verify", true)) {
      // NOTE: createTable is synchronous on the table, but not on the regions.
      // Wait here unless verification was explicitly disabled via -risky.
      HTable table = new HTable(conf, tableName);
      int onlineRegions = 0;
      while (onlineRegions < splitCount) {
        onlineRegions = table.getRegionsInfo().size();
        LOG.debug(onlineRegions + " of " + splitCount + " regions online...");
        if (onlineRegions < splitCount) {
          Thread.sleep(10 * 1000); // sleep
        }
      }
      table.close();
    }

    LOG.debug("Finished creating table with " + splitCount + " regions");
  }

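  /**
   * Performs a rolling split of every region in tableName, journaling
   * progress to a "_balancedSplit" file in the table directory so that an
   * interrupted run can be resumed. At most max("split.outstanding", half the
   * regionserver count) unverified splits are kept in flight at a time.
   */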
  static void rollingSplit(String tableName, SplitAlgorithm splitAlgo,
      Configuration conf) throws IOException, InterruptedException {
    final int minOS = conf.getInt("split.outstanding", 2);

    HTable table = new HTable(conf, tableName);

    // max outstanding splits. default == 50% of servers
    final int MAX_OUTSTANDING =
        Math.max(table.getConnection().getCurrentNrHRS() / 2, minOS);

    Path hbDir = new Path(conf.get(HConstants.HBASE_DIR));
    Path tableDir = HTableDescriptor.getTableDir(hbDir, table.getTableName());
    Path splitFile = new Path(tableDir, "_balancedSplit");
    FileSystem fs = FileSystem.get(conf);

    // get a list of daughter regions to create
    LinkedList<Pair<byte[], byte[]>> tmpRegionSet = getSplits(table, splitAlgo);
    LinkedList<Pair<byte[], byte[]>> outstanding = Lists.newLinkedList();
    int splitCount = 0;
    final int origCount = tmpRegionSet.size();

    // all splits must compact & we have 1 compact thread, so 2 split
    // requests to the same RS can stall the outstanding split queue.
    // To fix, group the regions into an RS pool and round-robin through it
    LOG.debug("Bucketing regions by regionserver...");
    TreeMap<String, LinkedList<Pair<byte[], byte[]>>> daughterRegions =
        Maps.newTreeMap();
    for (Pair<byte[], byte[]> dr : tmpRegionSet) {
      String rsLocation = table.getRegionLocation(dr.getSecond())
          .getHostnamePort();
      if (!daughterRegions.containsKey(rsLocation)) {
        LinkedList<Pair<byte[], byte[]>> entry = Lists.newLinkedList();
        daughterRegions.put(rsLocation, entry);
      }
      daughterRegions.get(rsLocation).add(dr);
    }
    LOG.debug("Done with bucketing.  Split time!");
    long startTime = System.currentTimeMillis();

    // open the split file and modify it as splits finish
    FSDataInputStream tmpIn = fs.open(splitFile);
    byte[] rawData = new byte[tmpIn.available()];
    tmpIn.readFully(rawData);
    tmpIn.close();
    FSDataOutputStream splitOut = fs.create(splitFile);
    splitOut.write(rawData);

    try {
      // *** split code ***
      while (!daughterRegions.isEmpty()) {
        LOG.debug(daughterRegions.size() + " RS have regions to split.");

        // Get regionserver (host:port) : region count mapping.
        // Keyed by host:port String so lookups match daughterRegions keys.
        final TreeMap<String, Integer> rsSizes = Maps.newTreeMap();
        Map<HRegionInfo, ServerName> regionsInfo = table.getRegionLocations();
        for (ServerName rs : regionsInfo.values()) {
          String rsLocation = rs.getHostAndPort();
          if (rsSizes.containsKey(rsLocation)) {
            rsSizes.put(rsLocation, rsSizes.get(rsLocation) + 1);
          } else {
            rsSizes.put(rsLocation, 1);
          }
        }

        // sort the RS by the number of regions they have
        List<String> serversLeft = Lists.newArrayList(daughterRegions.keySet());
        Collections.sort(serversLeft, new Comparator<String>() {
          public int compare(String o1, String o2) {
            return rsSizes.get(o1).compareTo(rsSizes.get(o2));
          }
        });

        // round-robin through the RS list. Choose the lightest-loaded servers
        // first to keep the master from load-balancing regions as we split.
        for (String rsLoc : serversLeft) {
          Pair<byte[], byte[]> dr = null;

          // find a region in the RS list that hasn't been moved
          LOG.debug("Finding a region on " + rsLoc);
          LinkedList<Pair<byte[], byte[]>> regionList = daughterRegions
              .get(rsLoc);
          while (!regionList.isEmpty()) {
            dr = regionList.pop();

            // get current region info
            byte[] split = dr.getSecond();
            HRegionLocation regionLoc = table.getRegionLocation(split);

            // if this region moved locations
            String newRs = regionLoc.getHostnamePort();
            if (newRs.compareTo(rsLoc) != 0) {
              LOG.debug("Region with " + splitAlgo.rowToStr(split)
                  + " moved to " + newRs + ". Relocating...");
              // relocate it, don't use it right now
              if (!daughterRegions.containsKey(newRs)) {
                LinkedList<Pair<byte[], byte[]>> entry = Lists.newLinkedList();
                daughterRegions.put(newRs, entry);
              }
              daughterRegions.get(newRs).add(dr);
              dr = null;
              continue;
            }

            // make sure this region wasn't already split
            byte[] sk = regionLoc.getRegionInfo().getStartKey();
            if (sk.length != 0) {
              if (Bytes.equals(split, sk)) {
                LOG.debug("Region already split on "
                    + splitAlgo.rowToStr(split) + ".  Skipping this region...");
                ++splitCount;
                dr = null;
                continue;
              }
              byte[] start = dr.getFirst();
              Preconditions.checkArgument(Bytes.equals(start, sk), splitAlgo
                  .rowToStr(start) + " != " + splitAlgo.rowToStr(sk));
            }

            // passed all checks! found a good region
            break;
          }
          if (regionList.isEmpty()) {
            daughterRegions.remove(rsLoc);
          }
          if (dr == null) {
            continue;
          }

          // we have a good region, time to split!
          byte[] split = dr.getSecond();
          LOG.debug("Splitting at " + splitAlgo.rowToStr(split));
          HBaseAdmin admin = new HBaseAdmin(table.getConfiguration());
          admin.split(table.getTableName(), split);

          LinkedList<Pair<byte[], byte[]>> finished = Lists.newLinkedList();
          if (conf.getBoolean("split.verify", true)) {
            // we need to verify and rate-limit our splits
            outstanding.addLast(dr);
            // with too many outstanding splits, wait for some to finish
            while (outstanding.size() >= MAX_OUTSTANDING) {
              finished = splitScan(outstanding, table, splitAlgo);
              if (finished.isEmpty()) {
                Thread.sleep(30 * 1000);
              } else {
                outstanding.removeAll(finished);
              }
            }
          } else {
            finished.add(dr);
          }

          // mark each finished region as successfully split.
          for (Pair<byte[], byte[]> region : finished) {
            // write with the algorithm's separator so getSplits() can re-parse
            splitOut.writeChars("-" + splitAlgo.separator()
                + splitAlgo.rowToStr(region.getFirst()) + splitAlgo.separator()
                + splitAlgo.rowToStr(region.getSecond()) + "\n");
            splitCount++;
            if (splitCount % 10 == 0) {
              long tDiff = (System.currentTimeMillis() - startTime)
                  / splitCount;
              LOG.debug("STATUS UPDATE: " + splitCount + " / " + origCount
                  + ". Avg Time / Split = "
                  + org.apache.hadoop.util.StringUtils.formatTime(tDiff));
            }
          }
        }
      }
      if (conf.getBoolean("split.verify", true)) {
        while (!outstanding.isEmpty()) {
          LinkedList<Pair<byte[], byte[]>> finished = splitScan(outstanding,
              table, splitAlgo);
          if (finished.isEmpty()) {
            Thread.sleep(30 * 1000);
          } else {
            outstanding.removeAll(finished);
            for (Pair<byte[], byte[]> region : finished) {
              splitOut.writeChars("-" + splitAlgo.separator()
                  + splitAlgo.rowToStr(region.getFirst())
                  + splitAlgo.separator()
                  + splitAlgo.rowToStr(region.getSecond()) + "\n");
            }
          }
        }
      }
      LOG.debug("All regions have been successfully split!");
    } finally {
      long tDiff = System.currentTimeMillis() - startTime;
      LOG.debug("TOTAL TIME = "
          + org.apache.hadoop.util.StringUtils.formatTime(tDiff));
      LOG.debug("Splits = " + splitCount);
      if (splitCount > 0) {
        // guard against division by zero when no splits were performed
        LOG.debug("Avg Time / Split = "
            + org.apache.hadoop.util.StringUtils.formatTime(tDiff / splitCount));
      }

      splitOut.close();
      if (table != null) {
        table.close();
      }
    }
    fs.delete(splitFile, false);
  }

  /**
   * @throws IOException if the specified SplitAlgorithm class couldn't be
   * instantiated
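   * <p>
   * Typical use (a sketch):
   * <pre>
   * SplitAlgorithm algo =
   *     RegionSplitter.newSplitAlgoInstance(conf, "HexStringSplit");
   * byte[][] boundaries = algo.split(16); // 15 boundaries for 16 regions
   * </pre>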
   */
  public static SplitAlgorithm newSplitAlgoInstance(Configuration conf,
      String splitClassName) throws IOException {
    Class<?> splitClass;

    // For split algorithms builtin to RegionSplitter, the user can specify
    // their simple class name instead of a fully qualified class name.
    if (splitClassName.equals(HexStringSplit.class.getSimpleName())) {
      splitClass = HexStringSplit.class;
    } else if (splitClassName.equals(UniformSplit.class.getSimpleName())) {
      splitClass = UniformSplit.class;
    } else {
      try {
        splitClass = conf.getClassByName(splitClassName);
      } catch (ClassNotFoundException e) {
        throw new IOException("Couldn't load split class " + splitClassName, e);
      }
      if (splitClass == null) {
        throw new IOException("Failed loading split class " + splitClassName);
      }
      if (!SplitAlgorithm.class.isAssignableFrom(splitClass)) {
        throw new IOException(
            "Specified split class doesn't implement SplitAlgorithm");
      }
    }
    try {
      return splitClass.asSubclass(SplitAlgorithm.class).newInstance();
    } catch (Exception e) {
      throw new IOException("Problem loading split algorithm: ", e);
    }
  }

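  /**
   * Checks each pending split in regionList and returns the subset that has
   * fully completed: both daughter regions are online and no reference files
   * remain in any of their column family directories (i.e. the post-split
   * compactions have finished).
   */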
  static LinkedList<Pair<byte[], byte[]>> splitScan(
      LinkedList<Pair<byte[], byte[]>> regionList, HTable table,
      SplitAlgorithm splitAlgo)
      throws IOException, InterruptedException {
    LinkedList<Pair<byte[], byte[]>> finished = Lists.newLinkedList();
    LinkedList<Pair<byte[], byte[]>> logicalSplitting = Lists.newLinkedList();
    LinkedList<Pair<byte[], byte[]>> physicalSplitting = Lists.newLinkedList();

    // get table info
    Path hbDir = new Path(table.getConfiguration().get(HConstants.HBASE_DIR));
    Path tableDir = HTableDescriptor.getTableDir(hbDir, table.getTableName());
    Path splitFile = new Path(tableDir, "_balancedSplit");
    FileSystem fs = tableDir.getFileSystem(table.getConfiguration());

    // clear the cache to forcibly refresh region information
    table.clearRegionCache();

    // for every region that hasn't been verified as a finished split
    for (Pair<byte[], byte[]> region : regionList) {
      byte[] start = region.getFirst();
      byte[] split = region.getSecond();

      // see if the new split daughter region has come online
      try {
        HRegionInfo dri = table.getRegionLocation(split).getRegionInfo();
        if (dri.isOffline() || !Bytes.equals(dri.getStartKey(), split)) {
          logicalSplitting.add(region);
          continue;
        }
      } catch (NoServerForRegionException nsfre) {
        // NSFRE will occur if the old META entry has no server assigned
        LOG.info(nsfre);
        logicalSplitting.add(region);
        continue;
      }

      try {
        // when a daughter region is opened, a compaction is triggered
        // wait until compaction completes for both daughter regions
        LinkedList<HRegionInfo> check = Lists.newLinkedList();
        check.add(table.getRegionLocation(start).getRegionInfo());
        check.add(table.getRegionLocation(split).getRegionInfo());
        for (HRegionInfo hri : check.toArray(new HRegionInfo[] {})) {
          boolean refFound = false;
          byte[] sk = hri.getStartKey();
          if (sk.length == 0) {
            sk = splitAlgo.firstRow();
          }
          String startKey = splitAlgo.rowToStr(sk);
          HTableDescriptor htd = table.getTableDescriptor();
          // check every Column Family for that region
          for (HColumnDescriptor c : htd.getFamilies()) {
            Path cfDir = Store.getStoreHomedir(tableDir, hri.getEncodedName(),
                c.getName());
            if (fs.exists(cfDir)) {
              for (FileStatus file : fs.listStatus(cfDir)) {
                refFound |= StoreFile.isReference(file.getPath());
                if (refFound) {
                  break;
                }
              }
            }
            if (refFound) {
              break;
            }
          }
          // compaction is completed when all reference files are gone
          if (!refFound) {
            check.remove(hri);
          }
        }
        if (check.isEmpty()) {
          finished.add(region);
        } else {
          physicalSplitting.add(region);
        }
      } catch (NoServerForRegionException nsfre) {
        LOG.debug("No Server Exception thrown for: "
            + splitAlgo.rowToStr(start));
        physicalSplitting.add(region);
        table.clearRegionCache();
      }
    }

    LOG.debug("Split Scan: " + finished.size() + " finished / "
        + logicalSplitting.size() + " split wait / "
        + physicalSplitting.size() + " reference wait");

    return finished;
  }

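  /**
   * Computes the (start key, split point) pair for every region in the table,
   * or, if a "_balancedSplit" file already exists, replays it to restore the
   * set of splits still left to perform.
   */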
  static LinkedList<Pair<byte[], byte[]>> getSplits(HTable table,
      SplitAlgorithm splitAlgo) throws IOException {
    Path hbDir = new Path(table.getConfiguration().get(HConstants.HBASE_DIR));
    Path tableDir = HTableDescriptor.getTableDir(hbDir, table.getTableName());
    Path splitFile = new Path(tableDir, "_balancedSplit");
    FileSystem fs = tableDir.getFileSystem(table.getConfiguration());

    // using strings because (new byte[]{0}).equals(new byte[]{0}) == false
    Set<Pair<String, String>> daughterRegions = Sets.newHashSet();

    // does a split file exist?
    if (!fs.exists(splitFile)) {
      // NO = fresh start. calculate splits to make
      LOG.debug("No _balancedSplit file.  Calculating splits...");

      // query meta for all regions in the table
      Set<Pair<byte[], byte[]>> rows = Sets.newHashSet();
      Pair<byte[][], byte[][]> tmp = table.getStartEndKeys();
      Preconditions.checkArgument(
          tmp.getFirst().length == tmp.getSecond().length,
          "Start and end key arrays must be the same length");
      for (int i = 0; i < tmp.getFirst().length; ++i) {
        byte[] start = tmp.getFirst()[i], end = tmp.getSecond()[i];
        if (start.length == 0) {
          start = splitAlgo.firstRow();
        }
        if (end.length == 0) {
          end = splitAlgo.lastRow();
        }
        rows.add(Pair.newPair(start, end));
      }
      LOG.debug("Table " + Bytes.toString(table.getTableName()) + " has "
          + rows.size() + " regions that will be split.");

      // prepare the split file
      Path tmpFile = new Path(tableDir, "_balancedSplit_prepare");
      FSDataOutputStream tmpOut = fs.create(tmpFile);

      // calculate all the splits == [daughterRegions] = [(start, splitPoint)]
      for (Pair<byte[], byte[]> r : rows) {
        byte[] splitPoint = splitAlgo.split(r.getFirst(), r.getSecond());
        String startStr = splitAlgo.rowToStr(r.getFirst());
        String splitStr = splitAlgo.rowToStr(splitPoint);
        daughterRegions.add(Pair.newPair(startStr, splitStr));
        LOG.debug("Will Split [" + startStr + " , "
            + splitAlgo.rowToStr(r.getSecond()) + ") at " + splitStr);
        // use the separator throughout so the replay parser sees 3 tokens
        tmpOut.writeChars("+" + splitAlgo.separator() + startStr
            + splitAlgo.separator() + splitStr + "\n");
      }
      tmpOut.close();
      fs.rename(tmpFile, splitFile);
    } else {
      LOG.debug("_balancedSplit file found. Replay log to restore state...");
      FSUtils.getInstance(fs, table.getConfiguration())
          .recoverFileLease(fs, splitFile, table.getConfiguration());

      // parse split file and process remaining splits
      FSDataInputStream tmpIn = fs.open(splitFile);
      StringBuilder sb = new StringBuilder(tmpIn.available());
      while (tmpIn.available() > 0) {
        sb.append(tmpIn.readChar());
      }
      tmpIn.close();
      for (String line : sb.toString().split("\n")) {
        String[] cmd = line.split(splitAlgo.separator());
        Preconditions.checkArgument(3 == cmd.length);
        byte[] start = splitAlgo.strToRow(cmd[1]);
        String startStr = splitAlgo.rowToStr(start);
        byte[] splitPoint = splitAlgo.strToRow(cmd[2]);
        String splitStr = splitAlgo.rowToStr(splitPoint);
        Pair<String, String> r = Pair.newPair(startStr, splitStr);
        if (cmd[0].equals("+")) {
          LOG.debug("Adding: " + r);
          daughterRegions.add(r);
        } else {
          LOG.debug("Removing: " + r);
          Preconditions.checkArgument(cmd[0].equals("-"),
              "Unknown option: " + cmd[0]);
          Preconditions.checkState(daughterRegions.contains(r),
              "Missing row: " + r);
          daughterRegions.remove(r);
        }
      }
      LOG.debug("Done reading. " + daughterRegions.size() + " regions left.");
    }
    LinkedList<Pair<byte[], byte[]>> ret = Lists.newLinkedList();
    for (Pair<String, String> r : daughterRegions) {
      ret.add(Pair.newPair(splitAlgo.strToRow(r.getFirst()), splitAlgo
          .strToRow(r.getSecond())));
    }
    return ret;
  }

  /**
   * HexStringSplit is a well-known {@link SplitAlgorithm} for choosing region
   * boundaries. The format of a HexStringSplit region boundary is the ASCII
   * representation of an MD5 checksum, or any other uniformly distributed
   * hexadecimal value. Rows are hex-encoded long values in the range
   * <b>"00000000" => "FFFFFFFF"</b> and are left-padded with zeros to keep the
   * same order lexicographically as if they were binary.
   *
   * Since this split algorithm uses hex strings as keys, it is easy to read &
   * write in the shell but takes up more space and may be non-intuitive.
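   * <p>
   * For example, pre-splitting into two regions yields the single boundary
   * "80000000", the midpoint of the keyspace:
   * <pre>
   * SplitAlgorithm algo = new RegionSplitter.HexStringSplit();
   * byte[][] boundaries = algo.split(2); // one boundary: bytes of "80000000"
   * </pre>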
   */
  public static class HexStringSplit implements SplitAlgorithm {
    final static String DEFAULT_MIN_HEX = "00000000";
    final static String DEFAULT_MAX_HEX = "FFFFFFFF";

    String firstRow = DEFAULT_MIN_HEX;
    BigInteger firstRowInt = BigInteger.ZERO;
    String lastRow = DEFAULT_MAX_HEX;
    BigInteger lastRowInt = new BigInteger(lastRow, 16);
    int rowComparisonLength = lastRow.length();

    public byte[] split(byte[] start, byte[] end) {
      BigInteger s = convertToBigInteger(start);
      BigInteger e = convertToBigInteger(end);
      Preconditions.checkArgument(!e.equals(BigInteger.ZERO));
      return convertToByte(split2(s, e));
    }

    public byte[][] split(int n) {
      Preconditions.checkArgument(lastRowInt.compareTo(firstRowInt) > 0,
          "last row (%s) is configured less than first row (%s)", lastRow,
          firstRow);
      // +1 to range because the last row is inclusive
      BigInteger range = lastRowInt.subtract(firstRowInt).add(BigInteger.ONE);
      Preconditions.checkState(range.compareTo(BigInteger.valueOf(n)) >= 0,
          "split granularity (%s) is greater than the range (%s)", n, range);

      BigInteger[] splits = new BigInteger[n - 1];
      BigInteger sizeOfEachSplit = range.divide(BigInteger.valueOf(n));
      for (int i = 1; i < n; i++) {
        // NOTE: this means the last region gets all the slop.
        // This is not a big deal if we're assuming n << MAXHEX
        splits[i - 1] = firstRowInt.add(sizeOfEachSplit.multiply(BigInteger
            .valueOf(i)));
      }
      return convertToBytes(splits);
    }

    public byte[] firstRow() {
      return convertToByte(firstRowInt);
    }

    public byte[] lastRow() {
      return convertToByte(lastRowInt);
    }

    public void setFirstRow(String userInput) {
      firstRow = userInput;
      firstRowInt = new BigInteger(firstRow, 16);
    }

    public void setLastRow(String userInput) {
      lastRow = userInput;
      lastRowInt = new BigInteger(lastRow, 16);
      // Precondition: lastRow > firstRow, so last's length is the greater
      rowComparisonLength = lastRow.length();
    }

    public byte[] strToRow(String in) {
      return convertToByte(new BigInteger(in, 16));
    }

    public String rowToStr(byte[] row) {
      return Bytes.toStringBinary(row);
    }

    public String separator() {
      return " ";
    }

    /**
     * Returns the midpoint of two numbers (the row at which to split).
     *
     * @param a number #1
     * @param b number #2
     * @return the midpoint of the 2 numbers
     */
    public BigInteger split2(BigInteger a, BigInteger b) {
      return a.add(b).divide(BigInteger.valueOf(2)).abs();
    }

    /**
     * Returns an array of byte arrays corresponding to an array of BigIntegers
     *
     * @param bigIntegers numbers to convert
     * @return byte arrays corresponding to the bigIntegers
     */
    public byte[][] convertToBytes(BigInteger[] bigIntegers) {
      byte[][] returnBytes = new byte[bigIntegers.length][];
      for (int i = 0; i < bigIntegers.length; i++) {
        returnBytes[i] = convertToByte(bigIntegers[i]);
      }
      return returnBytes;
    }

    /**
     * Returns the bytes corresponding to the BigInteger
     *
     * @param bigInteger number to convert
     * @param pad padding length
     * @return byte array corresponding to the input BigInteger
     */
    public static byte[] convertToByte(BigInteger bigInteger, int pad) {
      String bigIntegerString = bigInteger.toString(16);
      bigIntegerString = StringUtils.leftPad(bigIntegerString, pad, '0');
      return Bytes.toBytes(bigIntegerString);
    }

    /**
     * Returns the bytes corresponding to the BigInteger
     *
     * @param bigInteger number to convert
     * @return corresponding bytes
     */
    public byte[] convertToByte(BigInteger bigInteger) {
      return convertToByte(bigInteger, rowComparisonLength);
    }

    /**
     * Returns the BigInteger represented by the byte array
     *
     * @param row byte array representing row
     * @return the corresponding BigInteger
     */
    public BigInteger convertToBigInteger(byte[] row) {
      return (row.length > 0) ? new BigInteger(Bytes.toString(row), 16)
          : BigInteger.ZERO;
    }

    @Override
    public String toString() {
      return this.getClass().getSimpleName() + " [" + rowToStr(firstRow())
          + "," + rowToStr(lastRow()) + "]";
    }
  }

  /**
   * A SplitAlgorithm that divides the space of possible keys evenly. Useful
   * when the keys are approximately uniform random bytes (e.g. hashes). Rows
   * are raw byte values in the range <b>00 => FF</b> and are right-padded with
   * zeros to keep the same memcmp() order. This is the natural algorithm to use
   * for a byte[] environment and saves space, but is not necessarily the
   * easiest for readability.
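   * <p>
   * Usage mirrors HexStringSplit (a sketch):
   * <pre>
   * SplitAlgorithm algo = new RegionSplitter.UniformSplit();
   * byte[][] boundaries = algo.split(4); // three evenly spaced byte[] keys
   * </pre>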
   */
  public static class UniformSplit implements SplitAlgorithm {
    static final byte xFF = (byte) 0xFF;
    byte[] firstRowBytes = ArrayUtils.EMPTY_BYTE_ARRAY;
    byte[] lastRowBytes =
        new byte[] {xFF, xFF, xFF, xFF, xFF, xFF, xFF, xFF};

    public byte[] split(byte[] start, byte[] end) {
      return Bytes.split(start, end, 1)[1];
    }

    @Override
    public byte[][] split(int numRegions) {
      Preconditions.checkArgument(
          Bytes.compareTo(lastRowBytes, firstRowBytes) > 0,
          "last row (%s) is configured less than first row (%s)",
          Bytes.toStringBinary(lastRowBytes),
          Bytes.toStringBinary(firstRowBytes));

      byte[][] splits = Bytes.split(firstRowBytes, lastRowBytes, true,
          numRegions - 1);
      Preconditions.checkState(splits != null,
          "Could not split region with given user input: " + this);

      // remove endpoints, which are included in the splits list
      return Arrays.copyOfRange(splits, 1, splits.length - 1);
    }

    @Override
    public byte[] firstRow() {
      return firstRowBytes;
    }

    @Override
    public byte[] lastRow() {
      return lastRowBytes;
    }

    @Override
    public void setFirstRow(String userInput) {
      firstRowBytes = Bytes.toBytesBinary(userInput);
    }

    @Override
    public void setLastRow(String userInput) {
      lastRowBytes = Bytes.toBytesBinary(userInput);
    }

    @Override
    public byte[] strToRow(String input) {
      return Bytes.toBytesBinary(input);
    }

    @Override
    public String rowToStr(byte[] row) {
      return Bytes.toStringBinary(row);
    }

    @Override
    public String separator() {
      return ",";
    }

    @Override
    public String toString() {
      return this.getClass().getSimpleName() + " [" + rowToStr(firstRow())
          + "," + rowToStr(lastRow()) + "]";
    }
  }
}