Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.util;
019
020import java.util.ArrayList;
021import java.util.Collection;
022import java.util.Comparator;
023import java.util.List;
024import java.util.Map.Entry;
025import java.util.TreeMap;
026import java.util.TreeSet;
027import org.apache.hadoop.hbase.util.Bytes.ByteArrayComparator;
028import org.apache.yetus.audience.InterfaceAudience;
029import org.slf4j.Logger;
030import org.slf4j.LoggerFactory;
031
032import org.apache.hbase.thirdparty.com.google.common.collect.ArrayListMultimap;
033import org.apache.hbase.thirdparty.com.google.common.collect.Multimap;
034import org.apache.hbase.thirdparty.com.google.common.collect.TreeMultimap;
035
036/**
037 * This is a generic region split calculator. It requires Ranges that provide start, end, and a
038 * comparator. It works in two phases -- the first adds ranges and rejects backwards ranges. Then
039 * one calls calcRegions to generate the multimap that has a start split key as a key and possibly
040 * multiple Ranges as members. To traverse, one normally would get the split set, and iterate
041 * through the calcRegions. Normal regions would have only one entry, holes would have zero, and any
042 * overlaps would have multiple entries. The interface is a bit cumbersome currently but is exposed
043 * this way so that clients can choose how to iterate through the region splits.
044 * @param <R>
045 */
046@InterfaceAudience.Private
047public class RegionSplitCalculator<R extends KeyRange> {
048  private static final Logger LOG = LoggerFactory.getLogger(RegionSplitCalculator.class);
049
050  private final Comparator<R> rangeCmp;
051  /**
052   * This contains a sorted set of all the possible split points Invariant: once populated this has
053   * 0 entries if empty or at most n+1 values where n == number of added ranges.
054   */
055  private final TreeSet<byte[]> splits = new TreeSet<>(BYTES_COMPARATOR);
056
057  /**
058   * This is a map from start key to regions with the same start key. Invariant: This always have n
059   * values in total
060   */
061  private final Multimap<byte[], R> starts = ArrayListMultimap.create();
062
063  /**
064   * SPECIAL CASE
065   */
066  private final static byte[] ENDKEY = null;
067
068  public RegionSplitCalculator(Comparator<R> cmp) {
069    rangeCmp = cmp;
070  }
071
072  public final static Comparator<byte[]> BYTES_COMPARATOR = new ByteArrayComparator() {
073    @Override
074    public int compare(byte[] l, byte[] r) {
075      if (l == null && r == null) return 0;
076      if (l == null) return 1;
077      if (r == null) return -1;
078      return super.compare(l, r);
079    }
080  };
081
082  /**
083   * SPECIAL CASE wrapper for empty end key
084   * @return ENDKEY if end key is empty, else normal endkey.
085   */
086  private static <R extends KeyRange> byte[] specialEndKey(R range) {
087    byte[] end = range.getEndKey();
088    if (end.length == 0) {
089      return ENDKEY;
090    }
091    return end;
092  }
093
094  /**
095   * Adds an edge to the split calculator
096   * @return true if is included, false if backwards/invalid
097   */
098  public boolean add(R range) {
099    byte[] start = range.getStartKey();
100    byte[] end = specialEndKey(range);
101
102    // No need to use Arrays.equals because ENDKEY is null
103    if (end != ENDKEY && Bytes.compareTo(start, end) > 0) {
104      // don't allow backwards edges
105      LOG.debug(
106        "attempted to add backwards edge: " + Bytes.toString(start) + " " + Bytes.toString(end));
107      return false;
108    }
109
110    splits.add(start);
111    splits.add(end);
112    starts.put(start, range);
113    return true;
114  }
115
116  /**
117   * Generates a coverage multimap from split key to Regions that start with the split key.
118   * @return coverage multimap
119   */
120  public Multimap<byte[], R> calcCoverage() {
121    // This needs to be sorted to force the use of the comparator on the values,
122    // otherwise byte array comparison isn't used
123    Multimap<byte[], R> regions = TreeMultimap.create(BYTES_COMPARATOR, rangeCmp);
124
125    // march through all splits from the start points
126    for (Entry<byte[], Collection<R>> start : starts.asMap().entrySet()) {
127      byte[] key = start.getKey();
128      for (R r : start.getValue()) {
129        regions.put(key, r);
130
131        for (byte[] coveredSplit : splits.subSet(r.getStartKey(), specialEndKey(r))) {
132          regions.put(coveredSplit, r);
133        }
134      }
135    }
136    return regions;
137  }
138
139  public TreeSet<byte[]> getSplits() {
140    return splits;
141  }
142
143  public Multimap<byte[], R> getStarts() {
144    return starts;
145  }
146
147  /**
148   * Find specified number of top ranges in a big overlap group. It could return less if there are
149   * not that many top ranges. Once these top ranges are excluded, the big overlap group will be
150   * broken into ranges with no overlapping, or smaller overlapped groups, and most likely some
151   * holes.
152   * @param bigOverlap a list of ranges that overlap with each other
153   * @param count      the max number of ranges to find
154   * @return a list of ranges that overlap with most others
155   */
156  public static <R extends KeyRange> List<R> findBigRanges(Collection<R> bigOverlap, int count) {
157    List<R> bigRanges = new ArrayList<>();
158
159    // The key is the count of overlaps,
160    // The value is a list of ranges that have that many overlaps
161    TreeMap<Integer, List<R>> overlapRangeMap = new TreeMap<>();
162    for (R r : bigOverlap) {
163      // Calculates the # of overlaps for each region
164      // and populates rangeOverlapMap
165      byte[] startKey = r.getStartKey();
166      byte[] endKey = specialEndKey(r);
167
168      int overlappedRegions = 0;
169      for (R rr : bigOverlap) {
170        byte[] start = rr.getStartKey();
171        byte[] end = specialEndKey(rr);
172
173        if (
174          BYTES_COMPARATOR.compare(startKey, end) < 0 && BYTES_COMPARATOR.compare(endKey, start) > 0
175        ) {
176          overlappedRegions++;
177        }
178      }
179
180      // One region always overlaps with itself,
181      // so overlappedRegions should be more than 1
182      // for actual overlaps.
183      if (overlappedRegions > 1) {
184        Integer key = Integer.valueOf(overlappedRegions);
185        List<R> ranges = overlapRangeMap.get(key);
186        if (ranges == null) {
187          ranges = new ArrayList<>();
188          overlapRangeMap.put(key, ranges);
189        }
190        ranges.add(r);
191      }
192    }
193    int toBeAdded = count;
194    for (Integer key : overlapRangeMap.descendingKeySet()) {
195      List<R> chunk = overlapRangeMap.get(key);
196      int chunkSize = chunk.size();
197      if (chunkSize <= toBeAdded) {
198        bigRanges.addAll(chunk);
199        toBeAdded -= chunkSize;
200        if (toBeAdded > 0) continue;
201      } else {
202        // Try to use the middle chunk in case the overlapping is
203        // chained, for example: [a, c), [b, e), [d, g), [f h)...
204        // In such a case, sideline the middle chunk will break
205        // the group efficiently.
206        int start = (chunkSize - toBeAdded) / 2;
207        int end = start + toBeAdded;
208        for (int i = start; i < end; i++) {
209          bigRanges.add(chunk.get(i));
210        }
211      }
212      break;
213    }
214    return bigRanges;
215  }
216}