View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.util;
20  
21  import java.io.IOException;
22  import java.math.BigInteger;
23  import java.util.Arrays;
24  import java.util.Collections;
25  import java.util.Comparator;
26  import java.util.LinkedList;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.Set;
30  import java.util.TreeMap;
31  
32  import org.apache.commons.cli.CommandLine;
33  import org.apache.commons.cli.GnuParser;
34  import org.apache.commons.cli.HelpFormatter;
35  import org.apache.commons.cli.OptionBuilder;
36  import org.apache.commons.cli.Options;
37  import org.apache.commons.cli.ParseException;
38  import org.apache.commons.lang.ArrayUtils;
39  import org.apache.commons.lang.StringUtils;
40  import org.apache.commons.logging.Log;
41  import org.apache.commons.logging.LogFactory;
42  import org.apache.hadoop.hbase.classification.InterfaceAudience;
43  import org.apache.hadoop.conf.Configuration;
44  import org.apache.hadoop.fs.FSDataInputStream;
45  import org.apache.hadoop.fs.FSDataOutputStream;
46  import org.apache.hadoop.fs.FileSystem;
47  import org.apache.hadoop.fs.Path;
48  import org.apache.hadoop.hbase.HBaseConfiguration;
49  import org.apache.hadoop.hbase.HColumnDescriptor;
50  import org.apache.hadoop.hbase.HRegionInfo;
51  import org.apache.hadoop.hbase.HRegionLocation;
52  import org.apache.hadoop.hbase.HTableDescriptor;
53  import org.apache.hadoop.hbase.ServerName;
54  import org.apache.hadoop.hbase.TableName;
55  import org.apache.hadoop.hbase.MetaTableAccessor;
56  import org.apache.hadoop.hbase.client.HBaseAdmin;
57  import org.apache.hadoop.hbase.client.HTable;
58  import org.apache.hadoop.hbase.client.NoServerForRegionException;
59  import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
60  
61  import com.google.common.base.Preconditions;
62  import com.google.common.collect.Lists;
63  import com.google.common.collect.Maps;
64  import com.google.common.collect.Sets;
65  
66  /**
67   * The {@link RegionSplitter} class provides several utilities to help in the
68   * administration lifecycle for developers who choose to manually split regions
69   * instead of having HBase handle that automatically. The most useful utilities
70   * are:
71   * <p>
72   * <ul>
73   * <li>Create a table with a specified number of pre-split regions
74   * <li>Execute a rolling split of all regions on an existing table
75   * </ul>
76   * <p>
77   * Both operations can be safely done on a live server.
78   * <p>
79   * <b>Question:</b> How do I turn off automatic splitting? <br>
80   * <b>Answer:</b> Automatic splitting is determined by the configuration value
81   * <i>HConstants.HREGION_MAX_FILESIZE</i>. It is not recommended that you set this
82   * to Long.MAX_VALUE in case you forget about manual splits. A suggested setting
83   * is 100GB, which would result in > 1hr major compactions if reached.
84   * <p>
85   * <b>Question:</b> Why did the original authors decide to manually split? <br>
86   * <b>Answer:</b> Specific workload characteristics of our use case allowed us
87   * to benefit from a manual split system.
88   * <p>
89   * <ul>
90   * <li>Data (~1k) that would grow instead of being replaced
91   * <li>Data growth was roughly uniform across all regions
92   * <li>OLTP workload. Data loss is a big deal.
93   * </ul>
94   * <p>
95   * <b>Question:</b> Why is manual splitting good for this workload? <br>
96   * <b>Answer:</b> Although automated splitting is not a bad option, there are
97   * benefits to manual splitting.
98   * <p>
99   * <ul>
100  * <li>With growing amounts of data, splits will continually be needed. Since
101  * you always know exactly what regions you have, long-term debugging and
102  * profiling is much easier with manual splits. It is hard to trace the logs to
103  * understand region level problems if it keeps splitting and getting renamed.
104  * <li>Data offlining bugs + unknown number of split regions == oh crap! If an
105  * HLog or StoreFile was mistakenly unprocessed by HBase due to a weird bug and
106  * you notice it a day or so later, you can be assured that the regions
107  * specified in these files are the same as the current regions and you have
108  * less headaches trying to restore/replay your data.
109  * <li>You can finely tune your compaction algorithm. With roughly uniform data
110  * growth, it's easy to cause split / compaction storms as the regions all
111  * roughly hit the same data size at the same time. With manual splits, you can
112  * let staggered, time-based major compactions spread out your network IO load.
113  * </ul>
114  * <p>
115  * <b>Question:</b> What's the optimal number of pre-split regions to create? <br>
116  * <b>Answer:</b> Mileage will vary depending upon your application.
117  * <p>
118  * The short answer for our application is that we started with 10 pre-split
119  * regions / server and watched our data growth over time. It's better to err on
120  * the side of too little regions and rolling split later.
121  * <p>
122  * The more complicated answer is that this depends upon the largest storefile
123  * in your region. With a growing data size, this will get larger over time. You
124  * want the largest region to be just big enough that the
125  * {@link org.apache.hadoop.hbase.regionserver.HStore} compact
126  * selection algorithm only compacts it due to a timed major. If you don't, your
127  * cluster can be prone to compaction storms as the algorithm decides to run
128  * major compactions on a large series of regions all at once. Note that
129  * compaction storms are due to the uniform data growth, not the manual split
130  * decision.
131  * <p>
132  * If you pre-split your regions too thin, you can increase the major compaction
133  * interval by configuring HConstants.MAJOR_COMPACTION_PERIOD. If your data size
134  * grows too large, use this script to perform a network IO safe rolling split
135  * of all regions.
136  */
137 @InterfaceAudience.Private
138 public class RegionSplitter {
139   static final Log LOG = LogFactory.getLog(RegionSplitter.class);
140 
141   /**
142    * A generic interface for the RegionSplitter code to use for all it's
143    * functionality. Note that the original authors of this code use
144    * {@link HexStringSplit} to partition their table and set it as default, but
145    * provided this for your custom algorithm. To use, create a new derived class
146    * from this interface and call {@link RegionSplitter#createPresplitTable} or
147    * RegionSplitter#rollingSplit(TableName, SplitAlgorithm, Configuration) with the
148    * argument splitClassName giving the name of your class.
149    */
150   public interface SplitAlgorithm {
151     /**
152      * Split a pre-existing region into 2 regions.
153      *
154      * @param start
155      *          first row (inclusive)
156      * @param end
157      *          last row (exclusive)
158      * @return the split row to use
159      */
160     byte[] split(byte[] start, byte[] end);
161 
162     /**
163      * Split an entire table.
164      *
165      * @param numRegions
166      *          number of regions to split the table into
167      *
168      * @throws RuntimeException
169      *           user input is validated at this time. may throw a runtime
170      *           exception in response to a parse failure
171      * @return array of split keys for the initial regions of the table. The
172      *         length of the returned array should be numRegions-1.
173      */
174     byte[][] split(int numRegions);
175 
176     /**
177      * In HBase, the first row is represented by an empty byte array. This might
178      * cause problems with your split algorithm or row printing. All your APIs
179      * will be passed firstRow() instead of empty array.
180      *
181      * @return your representation of your first row
182      */
183     byte[] firstRow();
184 
185     /**
186      * In HBase, the last row is represented by an empty byte array. This might
187      * cause problems with your split algorithm or row printing. All your APIs
188      * will be passed firstRow() instead of empty array.
189      *
190      * @return your representation of your last row
191      */
192     byte[] lastRow();
193 
194     /**
195      * In HBase, the last row is represented by an empty byte array. Set this
196      * value to help the split code understand how to evenly divide the first
197      * region.
198      *
199      * @param userInput
200      *          raw user input (may throw RuntimeException on parse failure)
201      */
202     void setFirstRow(String userInput);
203 
204     /**
205      * In HBase, the last row is represented by an empty byte array. Set this
206      * value to help the split code understand how to evenly divide the last
207      * region. Note that this last row is inclusive for all rows sharing the
208      * same prefix.
209      *
210      * @param userInput
211      *          raw user input (may throw RuntimeException on parse failure)
212      */
213     void setLastRow(String userInput);
214 
215     /**
216      * @param input
217      *          user or file input for row
218      * @return byte array representation of this row for HBase
219      */
220     byte[] strToRow(String input);
221 
222     /**
223      * @param row
224      *          byte array representing a row in HBase
225      * @return String to use for debug & file printing
226      */
227     String rowToStr(byte[] row);
228 
229     /**
230      * @return the separator character to use when storing / printing the row
231      */
232     String separator();
233 
234     /**
235      * Set the first row
236      * @param userInput byte array of the row key.
237      */
238     void setFirstRow(byte[] userInput);
239 
240     /**
241      * Set the last row
242      * @param userInput byte array of the row key.
243      */
244     void setLastRow(byte[] userInput);
245   }
246 
247   /**
248    * The main function for the RegionSplitter application. Common uses:
249    * <p>
250    * <ul>
251    * <li>create a table named 'myTable' with 60 pre-split regions containing 2
252    * column families 'test' & 'rs', assuming the keys are hex-encoded ASCII:
253    * <ul>
254    * <li>bin/hbase org.apache.hadoop.hbase.util.RegionSplitter -c 60 -f test:rs
255    * myTable HexStringSplit
256    * </ul>
257    * <li>perform a rolling split of 'myTable' (i.e. 60 => 120 regions), # 2
258    * outstanding splits at a time, assuming keys are uniformly distributed
259    * bytes:
260    * <ul>
261    * <li>bin/hbase org.apache.hadoop.hbase.util.RegionSplitter -r -o 2 myTable
262    * UniformSplit
263    * </ul>
264    * </ul>
265    *
266    * There are two SplitAlgorithms built into RegionSplitter, HexStringSplit
267    * and UniformSplit. These are different strategies for choosing region
268    * boundaries. See their source code for details.
269    *
270    * @param args
271    *          Usage: RegionSplitter &lt;TABLE&gt; &lt;SPLITALGORITHM&gt;
272    *          &lt;-c &lt;# regions&gt; -f &lt;family:family:...&gt; | -r
273    *          [-o &lt;# outstanding splits&gt;]&gt;
274    *          [-D &lt;conf.param=value&gt;]
275    * @throws IOException
276    *           HBase IO problem
277    * @throws InterruptedException
278    *           user requested exit
279    * @throws ParseException
280    *           problem parsing user input
281    */
282   @SuppressWarnings("static-access")
283   public static void main(String[] args) throws IOException,
284       InterruptedException, ParseException {
285     Configuration conf = HBaseConfiguration.create();
286 
287     // parse user input
288     Options opt = new Options();
289     opt.addOption(OptionBuilder.withArgName("property=value").hasArg()
290         .withDescription("Override HBase Configuration Settings").create("D"));
291     opt.addOption(OptionBuilder.withArgName("region count").hasArg()
292         .withDescription(
293             "Create a new table with a pre-split number of regions")
294         .create("c"));
295     opt.addOption(OptionBuilder.withArgName("family:family:...").hasArg()
296         .withDescription(
297             "Column Families to create with new table.  Required with -c")
298         .create("f"));
299     opt.addOption("h", false, "Print this usage help");
300     opt.addOption("r", false, "Perform a rolling split of an existing region");
301     opt.addOption(OptionBuilder.withArgName("count").hasArg().withDescription(
302         "Max outstanding splits that have unfinished major compactions")
303         .create("o"));
304     opt.addOption(null, "firstrow", true,
305         "First Row in Table for Split Algorithm");
306     opt.addOption(null, "lastrow", true,
307         "Last Row in Table for Split Algorithm");
308     opt.addOption(null, "risky", false,
309         "Skip verification steps to complete quickly."
310             + "STRONGLY DISCOURAGED for production systems.  ");
311     CommandLine cmd = new GnuParser().parse(opt, args);
312 
313     if (cmd.hasOption("D")) {
314       for (String confOpt : cmd.getOptionValues("D")) {
315         String[] kv = confOpt.split("=", 2);
316         if (kv.length == 2) {
317           conf.set(kv[0], kv[1]);
318           LOG.debug("-D configuration override: " + kv[0] + "=" + kv[1]);
319         } else {
320           throw new ParseException("-D option format invalid: " + confOpt);
321         }
322       }
323     }
324 
325     if (cmd.hasOption("risky")) {
326       conf.setBoolean("split.verify", false);
327     }
328 
329     boolean createTable = cmd.hasOption("c") && cmd.hasOption("f");
330     boolean rollingSplit = cmd.hasOption("r");
331     boolean oneOperOnly = createTable ^ rollingSplit;
332 
333     if (2 != cmd.getArgList().size() || !oneOperOnly || cmd.hasOption("h")) {
334       new HelpFormatter().printHelp("RegionSplitter <TABLE> <SPLITALGORITHM>\n"+
335 		  "SPLITALGORITHM is a java class name of a class implementing " +
336 		  "SplitAlgorithm, or one of the special strings HexStringSplit " +
337 		  "or UniformSplit, which are built-in split algorithms. " +
338 		  "HexStringSplit treats keys as hexadecimal ASCII, and " +
339 		  "UniformSplit treats keys as arbitrary bytes.", opt);
340       return;
341     }
342     TableName tableName = TableName.valueOf(cmd.getArgs()[0]);
343     String splitClass = cmd.getArgs()[1];
344     SplitAlgorithm splitAlgo = newSplitAlgoInstance(conf, splitClass);
345 
346     if (cmd.hasOption("firstrow")) {
347       splitAlgo.setFirstRow(cmd.getOptionValue("firstrow"));
348     }
349     if (cmd.hasOption("lastrow")) {
350       splitAlgo.setLastRow(cmd.getOptionValue("lastrow"));
351     }
352 
353     if (createTable) {
354       conf.set("split.count", cmd.getOptionValue("c"));
355       createPresplitTable(tableName, splitAlgo, cmd.getOptionValue("f").split(":"), conf);
356     }
357 
358     if (rollingSplit) {
359       if (cmd.hasOption("o")) {
360         conf.set("split.outstanding", cmd.getOptionValue("o"));
361       }
362       rollingSplit(tableName, splitAlgo, conf);
363     }
364   }
365 
366   static void createPresplitTable(TableName tableName, SplitAlgorithm splitAlgo,
367           String[] columnFamilies, Configuration conf) throws IOException,
368           InterruptedException {
369     final int splitCount = conf.getInt("split.count", 0);
370     Preconditions.checkArgument(splitCount > 1, "Split count must be > 1");
371 
372     Preconditions.checkArgument(columnFamilies.length > 0,
373         "Must specify at least one column family. ");
374     LOG.debug("Creating table " + tableName + " with " + columnFamilies.length
375         + " column families.  Presplitting to " + splitCount + " regions");
376 
377     HTableDescriptor desc = new HTableDescriptor(tableName);
378     for (String cf : columnFamilies) {
379       desc.addFamily(new HColumnDescriptor(Bytes.toBytes(cf)));
380     }
381     HBaseAdmin admin = new HBaseAdmin(conf);
382     try {
383       Preconditions.checkArgument(!admin.tableExists(tableName),
384         "Table already exists: " + tableName);
385       admin.createTable(desc, splitAlgo.split(splitCount));
386     } finally {
387       admin.close();
388     }
389     LOG.debug("Table created!  Waiting for regions to show online in META...");
390     if (!conf.getBoolean("split.verify", true)) {
391       // NOTE: createTable is synchronous on the table, but not on the regions
392       int onlineRegions = 0;
393       while (onlineRegions < splitCount) {
394         onlineRegions = MetaTableAccessor.getRegionCount(conf, tableName);
395         LOG.debug(onlineRegions + " of " + splitCount + " regions online...");
396         if (onlineRegions < splitCount) {
397           Thread.sleep(10 * 1000); // sleep
398         }
399       }
400     }
401 
402     LOG.debug("Finished creating table with " + splitCount + " regions");
403   }
404 
405   static void rollingSplit(TableName tableName, SplitAlgorithm splitAlgo,
406           Configuration conf) throws IOException, InterruptedException {
407     final int minOS = conf.getInt("split.outstanding", 2);
408 
409     HTable table = new HTable(conf, tableName);
410 
411     // max outstanding splits. default == 50% of servers
412     final int MAX_OUTSTANDING =
413         Math.max(table.getConnection().getCurrentNrHRS() / 2, minOS);
414 
415     Path hbDir = FSUtils.getRootDir(conf);
416     Path tableDir = FSUtils.getTableDir(hbDir, table.getName());
417     Path splitFile = new Path(tableDir, "_balancedSplit");
418     FileSystem fs = FileSystem.get(conf);
419 
420     // get a list of daughter regions to create
421     LinkedList<Pair<byte[], byte[]>> tmpRegionSet = getSplits(table, splitAlgo);
422     LinkedList<Pair<byte[], byte[]>> outstanding = Lists.newLinkedList();
423     int splitCount = 0;
424     final int origCount = tmpRegionSet.size();
425 
426     // all splits must compact & we have 1 compact thread, so 2 split
427     // requests to the same RS can stall the outstanding split queue.
428     // To fix, group the regions into an RS pool and round-robin through it
429     LOG.debug("Bucketing regions by regionserver...");
430     TreeMap<String, LinkedList<Pair<byte[], byte[]>>> daughterRegions =
431       Maps.newTreeMap();
432     for (Pair<byte[], byte[]> dr : tmpRegionSet) {
433       String rsLocation = table.getRegionLocation(dr.getSecond()).
434         getHostnamePort();
435       if (!daughterRegions.containsKey(rsLocation)) {
436         LinkedList<Pair<byte[], byte[]>> entry = Lists.newLinkedList();
437         daughterRegions.put(rsLocation, entry);
438       }
439       daughterRegions.get(rsLocation).add(dr);
440     }
441     LOG.debug("Done with bucketing.  Split time!");
442     long startTime = System.currentTimeMillis();
443 
444     // open the split file and modify it as splits finish
445     FSDataInputStream tmpIn = fs.open(splitFile);
446     byte[] rawData = new byte[tmpIn.available()];
447     tmpIn.readFully(rawData);
448     tmpIn.close();
449     FSDataOutputStream splitOut = fs.create(splitFile);
450     splitOut.write(rawData);
451 
452     try {
453       // *** split code ***
454       while (!daughterRegions.isEmpty()) {
455         LOG.debug(daughterRegions.size() + " RS have regions to splt.");
456 
457         // Get RegionServer : region count mapping
458         final TreeMap<ServerName, Integer> rsSizes = Maps.newTreeMap();
459         Map<HRegionInfo, ServerName> regionsInfo = table.getRegionLocations();
460         for (ServerName rs : regionsInfo.values()) {
461           if (rsSizes.containsKey(rs)) {
462             rsSizes.put(rs, rsSizes.get(rs) + 1);
463           } else {
464             rsSizes.put(rs, 1);
465           }
466         }
467 
468         // sort the RS by the number of regions they have
469         List<String> serversLeft = Lists.newArrayList(daughterRegions .keySet());
470         Collections.sort(serversLeft, new Comparator<String>() {
471           public int compare(String o1, String o2) {
472             return rsSizes.get(o1).compareTo(rsSizes.get(o2));
473           }
474         });
475 
476         // round-robin through the RS list. Choose the lightest-loaded servers
477         // first to keep the master from load-balancing regions as we split.
478         for (String rsLoc : serversLeft) {
479           Pair<byte[], byte[]> dr = null;
480 
481           // find a region in the RS list that hasn't been moved
482           LOG.debug("Finding a region on " + rsLoc);
483           LinkedList<Pair<byte[], byte[]>> regionList = daughterRegions
484               .get(rsLoc);
485           while (!regionList.isEmpty()) {
486             dr = regionList.pop();
487 
488             // get current region info
489             byte[] split = dr.getSecond();
490             HRegionLocation regionLoc = table.getRegionLocation(split);
491 
492             // if this region moved locations
493             String newRs = regionLoc.getHostnamePort();
494             if (newRs.compareTo(rsLoc) != 0) {
495               LOG.debug("Region with " + splitAlgo.rowToStr(split)
496                   + " moved to " + newRs + ". Relocating...");
497               // relocate it, don't use it right now
498               if (!daughterRegions.containsKey(newRs)) {
499                 LinkedList<Pair<byte[], byte[]>> entry = Lists.newLinkedList();
500                 daughterRegions.put(newRs, entry);
501               }
502               daughterRegions.get(newRs).add(dr);
503               dr = null;
504               continue;
505             }
506 
507             // make sure this region wasn't already split
508             byte[] sk = regionLoc.getRegionInfo().getStartKey();
509             if (sk.length != 0) {
510               if (Bytes.equals(split, sk)) {
511                 LOG.debug("Region already split on "
512                     + splitAlgo.rowToStr(split) + ".  Skipping this region...");
513                 ++splitCount;
514                 dr = null;
515                 continue;
516               }
517               byte[] start = dr.getFirst();
518               Preconditions.checkArgument(Bytes.equals(start, sk), splitAlgo
519                   .rowToStr(start) + " != " + splitAlgo.rowToStr(sk));
520             }
521 
522             // passed all checks! found a good region
523             break;
524           }
525           if (regionList.isEmpty()) {
526             daughterRegions.remove(rsLoc);
527           }
528           if (dr == null)
529             continue;
530 
531           // we have a good region, time to split!
532           byte[] split = dr.getSecond();
533           LOG.debug("Splitting at " + splitAlgo.rowToStr(split));
534           HBaseAdmin admin = new HBaseAdmin(table.getConfiguration());
535           try {
536             admin.split(table.getTableName(), split);
537           } finally {
538             admin.close();
539           }
540 
541           LinkedList<Pair<byte[], byte[]>> finished = Lists.newLinkedList();
542           LinkedList<Pair<byte[], byte[]>> local_finished = Lists.newLinkedList();
543           if (conf.getBoolean("split.verify", true)) {
544             // we need to verify and rate-limit our splits
545             outstanding.addLast(dr);
546             // with too many outstanding splits, wait for some to finish
547             while (outstanding.size() >= MAX_OUTSTANDING) {
548               LOG.debug("Wait for outstanding splits " + outstanding.size());
549               local_finished = splitScan(outstanding, table, splitAlgo);
550               if (local_finished.isEmpty()) {
551                 Thread.sleep(30 * 1000);
552               } else {
553                 finished.addAll(local_finished);
554                 outstanding.removeAll(local_finished);
555                 LOG.debug(local_finished.size() + " outstanding splits finished");
556               }
557             }
558           } else {
559             finished.add(dr);
560           }
561 
562           // mark each finished region as successfully split.
563           for (Pair<byte[], byte[]> region : finished) {
564             splitOut.writeChars("- " + splitAlgo.rowToStr(region.getFirst())
565                 + " " + splitAlgo.rowToStr(region.getSecond()) + "\n");
566             splitCount++;
567             if (splitCount % 10 == 0) {
568               long tDiff = (System.currentTimeMillis() - startTime)
569                   / splitCount;
570               LOG.debug("STATUS UPDATE: " + splitCount + " / " + origCount
571                   + ". Avg Time / Split = "
572                   + org.apache.hadoop.util.StringUtils.formatTime(tDiff));
573             }
574           }
575         }
576       }
577       if (conf.getBoolean("split.verify", true)) {
578         while (!outstanding.isEmpty()) {
579           LOG.debug("Finally Wait for outstanding splits " + outstanding.size());
580           LinkedList<Pair<byte[], byte[]>> finished = splitScan(outstanding,
581               table, splitAlgo);
582           if (finished.isEmpty()) {
583             Thread.sleep(30 * 1000);
584           } else {
585             outstanding.removeAll(finished);
586             for (Pair<byte[], byte[]> region : finished) {
587               splitOut.writeChars("- " + splitAlgo.rowToStr(region.getFirst())
588                   + " " + splitAlgo.rowToStr(region.getSecond()) + "\n");
589               splitCount++;
590             }
591             LOG.debug("Finally " + finished.size() + " outstanding splits finished");
592           }
593         }
594       }
595       LOG.debug("All regions have been successfully split!");
596     } finally {
597       long tDiff = System.currentTimeMillis() - startTime;
598       LOG.debug("TOTAL TIME = "
599           + org.apache.hadoop.util.StringUtils.formatTime(tDiff));
600       LOG.debug("Splits = " + splitCount);
601       if (0 < splitCount) {
602         LOG.debug("Avg Time / Split = "
603             + org.apache.hadoop.util.StringUtils.formatTime(tDiff / splitCount));
604       }
605 
606       splitOut.close();
607       if (table != null){
608         table.close();
609       }
610     }
611     fs.delete(splitFile, false);
612   }
613 
614   /**
615    * @throws IOException if the specified SplitAlgorithm class couldn't be
616    * instantiated
617    */
618   public static SplitAlgorithm newSplitAlgoInstance(Configuration conf,
619           String splitClassName) throws IOException {
620     Class<?> splitClass;
621 
622     // For split algorithms builtin to RegionSplitter, the user can specify
623     // their simple class name instead of a fully qualified class name.
624     if(splitClassName.equals(HexStringSplit.class.getSimpleName())) {
625       splitClass = HexStringSplit.class;
626     } else if (splitClassName.equals(UniformSplit.class.getSimpleName())) {
627       splitClass = UniformSplit.class;
628     } else {
629       try {
630         splitClass = conf.getClassByName(splitClassName);
631       } catch (ClassNotFoundException e) {
632         throw new IOException("Couldn't load split class " + splitClassName, e);
633       }
634       if(splitClass == null) {
635         throw new IOException("Failed loading split class " + splitClassName);
636       }
637       if(!SplitAlgorithm.class.isAssignableFrom(splitClass)) {
638         throw new IOException(
639                 "Specified split class doesn't implement SplitAlgorithm");
640       }
641     }
642     try {
643       return splitClass.asSubclass(SplitAlgorithm.class).newInstance();
644     } catch (Exception e) {
645       throw new IOException("Problem loading split algorithm: ", e);
646     }
647   }
648 
649   static LinkedList<Pair<byte[], byte[]>> splitScan(
650       LinkedList<Pair<byte[], byte[]>> regionList, HTable table,
651       SplitAlgorithm splitAlgo)
652       throws IOException, InterruptedException {
653     LinkedList<Pair<byte[], byte[]>> finished = Lists.newLinkedList();
654     LinkedList<Pair<byte[], byte[]>> logicalSplitting = Lists.newLinkedList();
655     LinkedList<Pair<byte[], byte[]>> physicalSplitting = Lists.newLinkedList();
656 
657     // get table info
658     Path rootDir = FSUtils.getRootDir(table.getConfiguration());
659     Path tableDir = FSUtils.getTableDir(rootDir, table.getName());
660     FileSystem fs = tableDir.getFileSystem(table.getConfiguration());
661     HTableDescriptor htd = table.getTableDescriptor();
662 
663     // clear the cache to forcibly refresh region information
664     table.clearRegionCache();
665 
666     // for every region that hasn't been verified as a finished split
667     for (Pair<byte[], byte[]> region : regionList) {
668       byte[] start = region.getFirst();
669       byte[] split = region.getSecond();
670 
671       // see if the new split daughter region has come online
672       try {
673         HRegionInfo dri = table.getRegionLocation(split).getRegionInfo();
674         if (dri.isOffline() || !Bytes.equals(dri.getStartKey(), split)) {
675           logicalSplitting.add(region);
676           continue;
677         }
678       } catch (NoServerForRegionException nsfre) {
679         // NSFRE will occur if the old hbase:meta entry has no server assigned
680         LOG.info(nsfre);
681         logicalSplitting.add(region);
682         continue;
683       }
684 
685       try {
686         // when a daughter region is opened, a compaction is triggered
687         // wait until compaction completes for both daughter regions
688         LinkedList<HRegionInfo> check = Lists.newLinkedList();
689         check.add(table.getRegionLocation(start).getRegionInfo());
690         check.add(table.getRegionLocation(split).getRegionInfo());
691         for (HRegionInfo hri : check.toArray(new HRegionInfo[check.size()])) {
692           byte[] sk = hri.getStartKey();
693           if (sk.length == 0)
694             sk = splitAlgo.firstRow();
695           String startKey = splitAlgo.rowToStr(sk);
696 
697           HRegionFileSystem regionFs = HRegionFileSystem.openRegionFromFileSystem(
698               table.getConfiguration(), fs, tableDir, hri, true);
699 
700           // check every Column Family for that region
701           boolean refFound = false;
702           for (HColumnDescriptor c : htd.getFamilies()) {
703             if ((refFound = regionFs.hasReferences(htd.getTableName().getNameAsString()))) {
704               break;
705             }
706           }
707 
708           // compaction is completed when all reference files are gone
709           if (!refFound) {
710             check.remove(hri);
711           }
712         }
713         if (check.isEmpty()) {
714           finished.add(region);
715         } else {
716           physicalSplitting.add(region);
717         }
718       } catch (NoServerForRegionException nsfre) {
719         LOG.debug("No Server Exception thrown for: " + splitAlgo.rowToStr(start));
720         physicalSplitting.add(region);
721         table.clearRegionCache();
722       }
723     }
724 
725     LOG.debug("Split Scan: " + finished.size() + " finished / "
726         + logicalSplitting.size() + " split wait / "
727         + physicalSplitting.size() + " reference wait");
728 
729     return finished;
730   }
731 
732   static LinkedList<Pair<byte[], byte[]>> getSplits(HTable table,
733       SplitAlgorithm splitAlgo) throws IOException {
734     Path hbDir = FSUtils.getRootDir(table.getConfiguration());
735     Path tableDir = FSUtils.getTableDir(hbDir, table.getName());
736     Path splitFile = new Path(tableDir, "_balancedSplit");
737     FileSystem fs = tableDir.getFileSystem(table.getConfiguration());
738 
739     // using strings because (new byte[]{0}).equals(new byte[]{0}) == false
740     Set<Pair<String, String>> daughterRegions = Sets.newHashSet();
741 
742     // does a split file exist?
743     if (!fs.exists(splitFile)) {
744       // NO = fresh start. calculate splits to make
745       LOG.debug("No _balancedSplit file.  Calculating splits...");
746 
747       // query meta for all regions in the table
748       Set<Pair<byte[], byte[]>> rows = Sets.newHashSet();
749       Pair<byte[][], byte[][]> tmp = table.getStartEndKeys();
750       Preconditions.checkArgument(
751           tmp.getFirst().length == tmp.getSecond().length,
752           "Start and End rows should be equivalent");
753       for (int i = 0; i < tmp.getFirst().length; ++i) {
754         byte[] start = tmp.getFirst()[i], end = tmp.getSecond()[i];
755         if (start.length == 0)
756           start = splitAlgo.firstRow();
757         if (end.length == 0)
758           end = splitAlgo.lastRow();
759         rows.add(Pair.newPair(start, end));
760       }
761       LOG.debug("Table " + Bytes.toString(table.getTableName()) + " has "
762           + rows.size() + " regions that will be split.");
763 
764       // prepare the split file
765       Path tmpFile = new Path(tableDir, "_balancedSplit_prepare");
766       FSDataOutputStream tmpOut = fs.create(tmpFile);
767 
768       // calculate all the splits == [daughterRegions] = [(start, splitPoint)]
769       for (Pair<byte[], byte[]> r : rows) {
770         byte[] splitPoint = splitAlgo.split(r.getFirst(), r.getSecond());
771         String startStr = splitAlgo.rowToStr(r.getFirst());
772         String splitStr = splitAlgo.rowToStr(splitPoint);
773         daughterRegions.add(Pair.newPair(startStr, splitStr));
774         LOG.debug("Will Split [" + startStr + " , "
775             + splitAlgo.rowToStr(r.getSecond()) + ") at " + splitStr);
776         tmpOut.writeChars("+ " + startStr + splitAlgo.separator() + splitStr
777             + "\n");
778       }
779       tmpOut.close();
780       fs.rename(tmpFile, splitFile);
781     } else {
782       LOG.debug("_balancedSplit file found. Replay log to restore state...");
783       FSUtils.getInstance(fs, table.getConfiguration())
784         .recoverFileLease(fs, splitFile, table.getConfiguration(), null);
785 
786       // parse split file and process remaining splits
787       FSDataInputStream tmpIn = fs.open(splitFile);
788       StringBuilder sb = new StringBuilder(tmpIn.available());
789       while (tmpIn.available() > 0) {
790         sb.append(tmpIn.readChar());
791       }
792       tmpIn.close();
793       for (String line : sb.toString().split("\n")) {
794         String[] cmd = line.split(splitAlgo.separator());
795         Preconditions.checkArgument(3 == cmd.length);
796         byte[] start = splitAlgo.strToRow(cmd[1]);
797         String startStr = splitAlgo.rowToStr(start);
798         byte[] splitPoint = splitAlgo.strToRow(cmd[2]);
799         String splitStr = splitAlgo.rowToStr(splitPoint);
800         Pair<String, String> r = Pair.newPair(startStr, splitStr);
801         if (cmd[0].equals("+")) {
802           LOG.debug("Adding: " + r);
803           daughterRegions.add(r);
804         } else {
805           LOG.debug("Removing: " + r);
806           Preconditions.checkArgument(cmd[0].equals("-"),
807               "Unknown option: " + cmd[0]);
808           Preconditions.checkState(daughterRegions.contains(r),
809               "Missing row: " + r);
810           daughterRegions.remove(r);
811         }
812       }
813       LOG.debug("Done reading. " + daughterRegions.size() + " regions left.");
814     }
815     LinkedList<Pair<byte[], byte[]>> ret = Lists.newLinkedList();
816     for (Pair<String, String> r : daughterRegions) {
817       ret.add(Pair.newPair(splitAlgo.strToRow(r.getFirst()), splitAlgo
818           .strToRow(r.getSecond())));
819     }
820     return ret;
821   }
822 
823   /**
824    * HexStringSplit is a well-known {@link SplitAlgorithm} for choosing region
825    * boundaries. The format of a HexStringSplit region boundary is the ASCII
826    * representation of an MD5 checksum, or any other uniformly distributed
827    * hexadecimal value. Row are hex-encoded long values in the range
828    * <b>"00000000" => "FFFFFFFF"</b> and are left-padded with zeros to keep the
829    * same order lexicographically as if they were binary.
830    *
831    * Since this split algorithm uses hex strings as keys, it is easy to read &
832    * write in the shell but takes up more space and may be non-intuitive.
833    */
834   public static class HexStringSplit implements SplitAlgorithm {
835     final static String DEFAULT_MIN_HEX = "00000000";
836     final static String DEFAULT_MAX_HEX = "FFFFFFFF";
837 
838     String firstRow = DEFAULT_MIN_HEX;
839     BigInteger firstRowInt = BigInteger.ZERO;
840     String lastRow = DEFAULT_MAX_HEX;
841     BigInteger lastRowInt = new BigInteger(lastRow, 16);
842     int rowComparisonLength = lastRow.length();
843 
844     public byte[] split(byte[] start, byte[] end) {
845       BigInteger s = convertToBigInteger(start);
846       BigInteger e = convertToBigInteger(end);
847       Preconditions.checkArgument(!e.equals(BigInteger.ZERO));
848       return convertToByte(split2(s, e));
849     }
850 
851     public byte[][] split(int n) {
852       Preconditions.checkArgument(lastRowInt.compareTo(firstRowInt) > 0,
853           "last row (%s) is configured less than first row (%s)", lastRow,
854           firstRow);
855       // +1 to range because the last row is inclusive
856       BigInteger range = lastRowInt.subtract(firstRowInt).add(BigInteger.ONE);
857       Preconditions.checkState(range.compareTo(BigInteger.valueOf(n)) >= 0,
858           "split granularity (%s) is greater than the range (%s)", n, range);
859 
860       BigInteger[] splits = new BigInteger[n - 1];
861       BigInteger sizeOfEachSplit = range.divide(BigInteger.valueOf(n));
862       for (int i = 1; i < n; i++) {
863         // NOTE: this means the last region gets all the slop.
864         // This is not a big deal if we're assuming n << MAXHEX
865         splits[i - 1] = firstRowInt.add(sizeOfEachSplit.multiply(BigInteger
866             .valueOf(i)));
867       }
868       return convertToBytes(splits);
869     }
870 
871     public byte[] firstRow() {
872       return convertToByte(firstRowInt);
873     }
874 
875     public byte[] lastRow() {
876       return convertToByte(lastRowInt);
877     }
878 
879     public void setFirstRow(String userInput) {
880       firstRow = userInput;
881       firstRowInt = new BigInteger(firstRow, 16);
882     }
883 
884     public void setLastRow(String userInput) {
885       lastRow = userInput;
886       lastRowInt = new BigInteger(lastRow, 16);
887       // Precondition: lastRow > firstRow, so last's length is the greater
888       rowComparisonLength = lastRow.length();
889     }
890 
891     public byte[] strToRow(String in) {
892       return convertToByte(new BigInteger(in, 16));
893     }
894 
895     public String rowToStr(byte[] row) {
896       return Bytes.toStringBinary(row);
897     }
898 
899     public String separator() {
900       return " ";
901     }
902 
903     @Override
904     public void setFirstRow(byte[] userInput) {
905       firstRow = Bytes.toString(userInput);
906     }
907 
908     @Override
909     public void setLastRow(byte[] userInput) {
910       lastRow = Bytes.toString(userInput);
911     }
912 
913     /**
914      * Divide 2 numbers in half (for split algorithm)
915      *
916      * @param a number #1
917      * @param b number #2
918      * @return the midpoint of the 2 numbers
919      */
920     public BigInteger split2(BigInteger a, BigInteger b) {
921       return a.add(b).divide(BigInteger.valueOf(2)).abs();
922     }
923 
924     /**
925      * Returns an array of bytes corresponding to an array of BigIntegers
926      *
927      * @param bigIntegers numbers to convert
928      * @return bytes corresponding to the bigIntegers
929      */
930     public byte[][] convertToBytes(BigInteger[] bigIntegers) {
931       byte[][] returnBytes = new byte[bigIntegers.length][];
932       for (int i = 0; i < bigIntegers.length; i++) {
933         returnBytes[i] = convertToByte(bigIntegers[i]);
934       }
935       return returnBytes;
936     }
937 
938     /**
939      * Returns the bytes corresponding to the BigInteger
940      *
941      * @param bigInteger number to convert
942      * @param pad padding length
943      * @return byte corresponding to input BigInteger
944      */
945     public static byte[] convertToByte(BigInteger bigInteger, int pad) {
946       String bigIntegerString = bigInteger.toString(16);
947       bigIntegerString = StringUtils.leftPad(bigIntegerString, pad, '0');
948       return Bytes.toBytes(bigIntegerString);
949     }
950 
951     /**
952      * Returns the bytes corresponding to the BigInteger
953      *
954      * @param bigInteger number to convert
955      * @return corresponding bytes
956      */
957     public byte[] convertToByte(BigInteger bigInteger) {
958       return convertToByte(bigInteger, rowComparisonLength);
959     }
960 
961     /**
962      * Returns the BigInteger represented by the byte array
963      *
964      * @param row byte array representing row
965      * @return the corresponding BigInteger
966      */
967     public BigInteger convertToBigInteger(byte[] row) {
968       return (row.length > 0) ? new BigInteger(Bytes.toString(row), 16)
969           : BigInteger.ZERO;
970     }
971 
972     @Override
973     public String toString() {
974       return this.getClass().getSimpleName() + " [" + rowToStr(firstRow())
975           + "," + rowToStr(lastRow()) + "]";
976     }
977   }
978 
979   /**
980    * A SplitAlgorithm that divides the space of possible keys evenly. Useful
981    * when the keys are approximately uniform random bytes (e.g. hashes). Rows
982    * are raw byte values in the range <b>00 => FF</b> and are right-padded with
983    * zeros to keep the same memcmp() order. This is the natural algorithm to use
984    * for a byte[] environment and saves space, but is not necessarily the
985    * easiest for readability.
986    */
987   public static class UniformSplit implements SplitAlgorithm {
988     static final byte xFF = (byte) 0xFF;
989     byte[] firstRowBytes = ArrayUtils.EMPTY_BYTE_ARRAY;
990     byte[] lastRowBytes =
991             new byte[] {xFF, xFF, xFF, xFF, xFF, xFF, xFF, xFF};
992     public byte[] split(byte[] start, byte[] end) {
993       return Bytes.split(start, end, 1)[1];
994     }
995 
996     @Override
997     public byte[][] split(int numRegions) {
998       Preconditions.checkArgument(
999           Bytes.compareTo(lastRowBytes, firstRowBytes) > 0,
1000           "last row (%s) is configured less than first row (%s)",
1001           Bytes.toStringBinary(lastRowBytes),
1002           Bytes.toStringBinary(firstRowBytes));
1003 
1004       byte[][] splits = Bytes.split(firstRowBytes, lastRowBytes, true,
1005           numRegions - 1);
1006       Preconditions.checkState(splits != null,
1007           "Could not split region with given user input: " + this);
1008 
1009       // remove endpoints, which are included in the splits list
1010       return Arrays.copyOfRange(splits, 1, splits.length - 1);
1011     }
1012 
1013     @Override
1014     public byte[] firstRow() {
1015       return firstRowBytes;
1016     }
1017 
1018     @Override
1019     public byte[] lastRow() {
1020       return lastRowBytes;
1021     }
1022 
1023     @Override
1024     public void setFirstRow(String userInput) {
1025       firstRowBytes = Bytes.toBytesBinary(userInput);
1026     }
1027 
1028     @Override
1029     public void setLastRow(String userInput) {
1030       lastRowBytes = Bytes.toBytesBinary(userInput);
1031     }
1032 
1033 
1034     @Override
1035     public void setFirstRow(byte[] userInput) {
1036       firstRowBytes = userInput;
1037     }
1038 
1039     @Override
1040     public void setLastRow(byte[] userInput) {
1041       lastRowBytes = userInput;
1042     }
1043 
1044     @Override
1045     public byte[] strToRow(String input) {
1046       return Bytes.toBytesBinary(input);
1047     }
1048 
1049     @Override
1050     public String rowToStr(byte[] row) {
1051       return Bytes.toStringBinary(row);
1052     }
1053 
1054     @Override
1055     public String separator() {
1056       return ",";
1057     }
1058 
1059     @Override
1060     public String toString() {
1061       return this.getClass().getSimpleName() + " [" + rowToStr(firstRow())
1062           + "," + rowToStr(lastRow()) + "]";
1063     }
1064   }
1065 }