001/**
002 *
003 * Licensed to the Apache Software Foundation (ASF) under one
004 * or more contributor license agreements.  See the NOTICE file
005 * distributed with this work for additional information
006 * regarding copyright ownership.  The ASF licenses this file
007 * to you under the Apache License, Version 2.0 (the
008 * "License"); you may not use this file except in compliance
009 * with the License.  You may obtain a copy of the License at
010 *
011 *     http://www.apache.org/licenses/LICENSE-2.0
012 *
013 * Unless required by applicable law or agreed to in writing, software
014 * distributed under the License is distributed on an "AS IS" BASIS,
015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016 * See the License for the specific language governing permissions and
017 * limitations under the License.
018 */
019package org.apache.hadoop.hbase.util;
020
021import java.util.ArrayList;
022import java.util.Collection;
023import java.util.Comparator;
024import java.util.List;
025import java.util.Map.Entry;
026import java.util.TreeMap;
027import java.util.TreeSet;
028
029import org.apache.yetus.audience.InterfaceAudience;
030import org.slf4j.Logger;
031import org.slf4j.LoggerFactory;
032import org.apache.hadoop.hbase.util.Bytes.ByteArrayComparator;
033
034import org.apache.hbase.thirdparty.com.google.common.collect.ArrayListMultimap;
035import org.apache.hbase.thirdparty.com.google.common.collect.Multimap;
036import org.apache.hbase.thirdparty.com.google.common.collect.TreeMultimap;
037
038/**
039 * This is a generic region split calculator. It requires Ranges that provide
040 * start, end, and a comparator. It works in two phases -- the first adds ranges
041 * and rejects backwards ranges. Then one calls calcRegions to generate the
042 * multimap that has a start split key as a key and possibly multiple Ranges as
043 * members.
044 * 
045 * To traverse, one normally would get the split set, and iterate through the
046 * calcRegions. Normal regions would have only one entry, holes would have zero,
047 * and any overlaps would have multiple entries.
048 * 
049 * The interface is a bit cumbersome currently but is exposed this way so that
050 * clients can choose how to iterate through the region splits.
051 * 
052 * @param <R>
053 */
054@InterfaceAudience.Private
055public class RegionSplitCalculator<R extends KeyRange> {
056  private static final Logger LOG = LoggerFactory.getLogger(RegionSplitCalculator.class);
057
058  private final Comparator<R> rangeCmp;
059  /**
060   * This contains a sorted set of all the possible split points
061   * 
062   * Invariant: once populated this has 0 entries if empty or at most n+1 values
063   * where n == number of added ranges.
064   */
065  private final TreeSet<byte[]> splits = new TreeSet<>(BYTES_COMPARATOR);
066
067  /**
068   * This is a map from start key to regions with the same start key.
069   * 
070   * Invariant: This always have n values in total
071   */
072  private final Multimap<byte[], R> starts = ArrayListMultimap.create();
073
074  /**
075   * SPECIAL CASE
076   */
077  private final static byte[] ENDKEY = null;
078
079  public RegionSplitCalculator(Comparator<R> cmp) {
080    rangeCmp = cmp;
081  }
082
083  public final static Comparator<byte[]> BYTES_COMPARATOR = new ByteArrayComparator() {
084    @Override
085    public int compare(byte[] l, byte[] r) {
086      if (l == null && r == null)
087        return 0;
088      if (l == null)
089        return 1;
090      if (r == null)
091        return -1;
092      return super.compare(l, r);
093    }
094  };
095
096  /**
097   * SPECIAL CASE wrapper for empty end key
098   * 
099   * @return ENDKEY if end key is empty, else normal endkey.
100   */
101  private static <R extends KeyRange> byte[] specialEndKey(R range) {
102    byte[] end = range.getEndKey();
103    if (end.length == 0) {
104      return ENDKEY;
105    }
106    return end;
107  }
108
109  /**
110   * Adds an edge to the split calculator
111   * 
112   * @return true if is included, false if backwards/invalid
113   */
114  public boolean add(R range) {
115    byte[] start = range.getStartKey();
116    byte[] end = specialEndKey(range);
117
118    // No need to use Arrays.equals because ENDKEY is null
119    if (end != ENDKEY && Bytes.compareTo(start, end) > 0) {
120      // don't allow backwards edges
121      LOG.debug("attempted to add backwards edge: " + Bytes.toString(start)
122          + " " + Bytes.toString(end));
123      return false;
124    }
125
126    splits.add(start);
127    splits.add(end);
128    starts.put(start, range);
129    return true;
130  }
131
132  /**
133   * Generates a coverage multimap from split key to Regions that start with the
134   * split key.
135   * 
136   * @return coverage multimap
137   */
138  public Multimap<byte[], R> calcCoverage() {
139    // This needs to be sorted to force the use of the comparator on the values,
140    // otherwise byte array comparison isn't used
141    Multimap<byte[], R> regions = TreeMultimap.create(BYTES_COMPARATOR,
142        rangeCmp);
143
144    // march through all splits from the start points
145    for (Entry<byte[], Collection<R>> start : starts.asMap().entrySet()) {
146      byte[] key = start.getKey();
147      for (R r : start.getValue()) {
148        regions.put(key, r);
149
150        for (byte[] coveredSplit : splits.subSet(r.getStartKey(),
151            specialEndKey(r))) {
152          regions.put(coveredSplit, r);
153        }
154      }
155    }
156    return regions;
157  }
158
159  public TreeSet<byte[]> getSplits() {
160    return splits;
161  }
162
163  public Multimap<byte[], R> getStarts() {
164    return starts;
165  }
166
167  /**
168   * Find specified number of top ranges in a big overlap group.
169   * It could return less if there are not that many top ranges.
170   * Once these top ranges are excluded, the big overlap group will
171   * be broken into ranges with no overlapping, or smaller overlapped
172   * groups, and most likely some holes.
173   *
174   * @param bigOverlap a list of ranges that overlap with each other
175   * @param count the max number of ranges to find
176   * @return a list of ranges that overlap with most others
177   */
178  public static <R extends KeyRange> List<R>
179      findBigRanges(Collection<R> bigOverlap, int count) {
180    List<R> bigRanges = new ArrayList<>();
181
182    // The key is the count of overlaps,
183    // The value is a list of ranges that have that many overlaps
184    TreeMap<Integer, List<R>> overlapRangeMap = new TreeMap<>();
185    for (R r: bigOverlap) {
186      // Calculates the # of overlaps for each region
187      // and populates rangeOverlapMap
188      byte[] startKey = r.getStartKey();
189      byte[] endKey = specialEndKey(r);
190
191      int overlappedRegions = 0;
192      for (R rr: bigOverlap) {
193        byte[] start = rr.getStartKey();
194        byte[] end = specialEndKey(rr);
195
196        if (BYTES_COMPARATOR.compare(startKey, end) < 0
197            && BYTES_COMPARATOR.compare(endKey, start) > 0) {
198          overlappedRegions++;
199        }
200      }
201
202      // One region always overlaps with itself,
203      // so overlappedRegions should be more than 1
204      // for actual overlaps.
205      if (overlappedRegions > 1) {
206        Integer key = Integer.valueOf(overlappedRegions);
207        List<R> ranges = overlapRangeMap.get(key);
208        if (ranges == null) {
209          ranges = new ArrayList<>();
210          overlapRangeMap.put(key, ranges);
211        }
212        ranges.add(r);
213      }
214    }
215    int toBeAdded = count;
216    for (Integer key: overlapRangeMap.descendingKeySet()) {
217      List<R> chunk = overlapRangeMap.get(key);
218      int chunkSize = chunk.size();
219      if (chunkSize <= toBeAdded) {
220        bigRanges.addAll(chunk);
221        toBeAdded -= chunkSize;
222        if (toBeAdded > 0) continue;
223      } else {
224        // Try to use the middle chunk in case the overlapping is
225        // chained, for example: [a, c), [b, e), [d, g), [f h)...
226        // In such a case, sideline the middle chunk will break
227        // the group efficiently.
228        int start = (chunkSize - toBeAdded)/2;
229        int end = start + toBeAdded;
230        for (int i = start; i < end; i++) {
231          bigRanges.add(chunk.get(i));
232        }
233      }
234      break;
235    }
236    return bigRanges;
237  }
238}