001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.janitor;
019
020import java.io.IOException;
021import java.util.ArrayList;
022import java.util.Collection;
023import java.util.Collections;
024import java.util.HashSet;
025import java.util.List;
026import java.util.Map;
027import java.util.Optional;
028import java.util.Set;
029import java.util.SortedSet;
030import java.util.TreeSet;
031import java.util.stream.Collectors;
032import org.apache.hadoop.fs.Path;
033import org.apache.hadoop.hbase.HConstants;
034import org.apache.hadoop.hbase.MetaTableAccessor;
035import org.apache.hadoop.hbase.TableName;
036import org.apache.hadoop.hbase.client.RegionInfo;
037import org.apache.hadoop.hbase.client.RegionInfoBuilder;
038import org.apache.hadoop.hbase.client.RegionReplicaUtil;
039import org.apache.hadoop.hbase.client.TableDescriptor;
040import org.apache.hadoop.hbase.exceptions.MergeRegionException;
041import org.apache.hadoop.hbase.master.MasterFileSystem;
042import org.apache.hadoop.hbase.master.MasterServices;
043import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure;
044import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
045import org.apache.hadoop.hbase.util.Bytes;
046import org.apache.hadoop.hbase.util.CommonFSUtils;
047import org.apache.hadoop.hbase.util.Pair;
048import org.apache.yetus.audience.InterfaceAudience;
049import org.slf4j.Logger;
050import org.slf4j.LoggerFactory;
051
052import org.apache.hbase.thirdparty.com.google.common.collect.ArrayListMultimap;
053import org.apache.hbase.thirdparty.com.google.common.collect.ListMultimap;
054
055/**
056 * Server-side fixing of bad or inconsistent state in hbase:meta. Distinct from MetaTableAccessor
057 * because {@link MetaTableAccessor} is about low-level manipulations driven by the Master. This
058 * class MetaFixer is employed by the Master and it 'knows' about holes and orphans and encapsulates
059 * their fixing on behalf of the Master.
060 */
061@InterfaceAudience.Private
062public class MetaFixer {
063  private static final Logger LOG = LoggerFactory.getLogger(MetaFixer.class);
064  private static final String MAX_MERGE_COUNT_KEY = "hbase.master.metafixer.max.merge.count";
065  private static final int MAX_MERGE_COUNT_DEFAULT = 64;
066
067  private final MasterServices masterServices;
068  /**
069   * Maximum for many regions to merge at a time.
070   */
071  private final int maxMergeCount;
072
073  public MetaFixer(MasterServices masterServices) {
074    this.masterServices = masterServices;
075    this.maxMergeCount =
076      this.masterServices.getConfiguration().getInt(MAX_MERGE_COUNT_KEY, MAX_MERGE_COUNT_DEFAULT);
077  }
078
079  public void fix() throws IOException {
080    CatalogJanitorReport report = this.masterServices.getCatalogJanitor().getLastReport();
081    if (report == null) {
082      LOG.info("CatalogJanitor has not generated a report yet; run 'catalogjanitor_run' in "
083        + "shell or wait until CatalogJanitor chore runs.");
084      return;
085    }
086    fixHoles(report);
087    fixOverlaps(report);
088    // Run the ReplicationBarrierCleaner here; it may clear out rep_barrier rows which
089    // can help cleaning up damaged hbase:meta.
090    this.masterServices.runReplicationBarrierCleaner();
091  }
092
093  /**
094   * If hole, it papers it over by adding a region in the filesystem and to hbase:meta. Does not
095   * assign.
096   */
097  void fixHoles(CatalogJanitorReport report) {
098    final List<Pair<RegionInfo, RegionInfo>> holes = report.getHoles();
099    if (holes.isEmpty()) {
100      LOG.info("CatalogJanitor Report contains no holes to fix. Skipping.");
101      return;
102    }
103
104    LOG.info("Identified {} region holes to fix. Detailed fixup progress logged at DEBUG.",
105      holes.size());
106
107    final List<RegionInfo> newRegionInfos = createRegionInfosForHoles(holes);
108    final List<RegionInfo> newMetaEntries = createMetaEntries(masterServices, newRegionInfos);
109    createRegionDirectories(masterServices, newMetaEntries);
110    final TransitRegionStateProcedure[] assignProcedures =
111      masterServices.getAssignmentManager().createRoundRobinAssignProcedures(newMetaEntries);
112
113    masterServices.getMasterProcedureExecutor().submitProcedures(assignProcedures);
114    LOG.info("Scheduled {}/{} new regions for assignment.", assignProcedures.length, holes.size());
115  }
116
117  /**
118   * Create a new {@link RegionInfo} corresponding to each provided "hole" pair.
119   */
120  private static List<RegionInfo>
121    createRegionInfosForHoles(final List<Pair<RegionInfo, RegionInfo>> holes) {
122    final List<RegionInfo> newRegionInfos = holes.stream().map(MetaFixer::getHoleCover)
123      .filter(Optional::isPresent).map(Optional::get).collect(Collectors.toList());
124    LOG.debug("Constructed {}/{} RegionInfo descriptors corresponding to identified holes.",
125      newRegionInfos.size(), holes.size());
126    return newRegionInfos;
127  }
128
129  /**
130   * @return Attempts to calculate a new {@link RegionInfo} that covers the region range described
131   *         in {@code hole}.
132   */
133  private static Optional<RegionInfo> getHoleCover(Pair<RegionInfo, RegionInfo> hole) {
134    final RegionInfo left = hole.getFirst();
135    final RegionInfo right = hole.getSecond();
136
137    if (left.getTable().equals(right.getTable())) {
138      // Simple case.
139      if (Bytes.compareTo(left.getEndKey(), right.getStartKey()) >= 0) {
140        LOG.warn("Skipping hole fix; left-side endKey is not less than right-side startKey;"
141          + " left=<{}>, right=<{}>", left, right);
142        return Optional.empty();
143      }
144      return Optional.of(buildRegionInfo(left.getTable(), left.getEndKey(), right.getStartKey()));
145    }
146
147    final boolean leftUndefined = left.equals(RegionInfoBuilder.UNDEFINED);
148    final boolean rightUndefined = right.equals(RegionInfoBuilder.UNDEFINED);
149    final boolean last = left.isLast();
150    final boolean first = right.isFirst();
151    if (leftUndefined && rightUndefined) {
152      LOG.warn("Skipping hole fix; both the hole left-side and right-side RegionInfos are "
153        + "UNDEFINED; left=<{}>, right=<{}>", left, right);
154      return Optional.empty();
155    }
156    if (leftUndefined || last) {
157      return Optional
158        .of(buildRegionInfo(right.getTable(), HConstants.EMPTY_START_ROW, right.getStartKey()));
159    }
160    if (rightUndefined || first) {
161      return Optional
162        .of(buildRegionInfo(left.getTable(), left.getEndKey(), HConstants.EMPTY_END_ROW));
163    }
164    LOG.warn("Skipping hole fix; don't know what to do with left=<{}>, right=<{}>", left, right);
165    return Optional.empty();
166  }
167
168  private static RegionInfo buildRegionInfo(TableName tn, byte[] start, byte[] end) {
169    return RegionInfoBuilder.newBuilder(tn).setStartKey(start).setEndKey(end).build();
170  }
171
172  /**
173   * Create entries in the {@code hbase:meta} for each provided {@link RegionInfo}. Best effort.
174   * @param masterServices used to connect to {@code hbase:meta}
175   * @param newRegionInfos the new {@link RegionInfo} entries to add to the filesystem
176   * @return a list of {@link RegionInfo} entries for which {@code hbase:meta} entries were
177   *         successfully created
178   */
179  private static List<RegionInfo> createMetaEntries(final MasterServices masterServices,
180    final List<RegionInfo> newRegionInfos) {
181
182    final List<Either<List<RegionInfo>, IOException>> addMetaEntriesResults =
183      newRegionInfos.stream().map(regionInfo -> {
184        try {
185          TableDescriptor td = masterServices.getTableDescriptors().get(regionInfo.getTable());
186
187          // Add replicas if needed
188          // we need to create regions with replicaIds starting from 1
189          List<RegionInfo> newRegions = RegionReplicaUtil
190            .addReplicas(Collections.singletonList(regionInfo), 1, td.getRegionReplication());
191
192          // Add regions to META
193          MetaTableAccessor.addRegionsToMeta(masterServices.getConnection(), newRegions,
194            td.getRegionReplication());
195
196          return Either.<List<RegionInfo>, IOException> ofLeft(newRegions);
197        } catch (IOException e) {
198          return Either.<List<RegionInfo>, IOException> ofRight(e);
199        }
200      }).collect(Collectors.toList());
201    final List<RegionInfo> createMetaEntriesSuccesses =
202      addMetaEntriesResults.stream().filter(Either::hasLeft).map(Either::getLeft)
203        .flatMap(List::stream).collect(Collectors.toList());
204    final List<IOException> createMetaEntriesFailures = addMetaEntriesResults.stream()
205      .filter(Either::hasRight).map(Either::getRight).collect(Collectors.toList());
206    LOG.debug("Added {}/{} entries to hbase:meta", createMetaEntriesSuccesses.size(),
207      newRegionInfos.size());
208
209    if (!createMetaEntriesFailures.isEmpty()) {
210      LOG.warn(
211        "Failed to create entries in hbase:meta for {}/{} RegionInfo descriptors. First"
212          + " failure message included; full list of failures with accompanying stack traces is"
213          + " available at log level DEBUG. message={}",
214        createMetaEntriesFailures.size(), addMetaEntriesResults.size(),
215        createMetaEntriesFailures.get(0).getMessage());
216      if (LOG.isDebugEnabled()) {
217        createMetaEntriesFailures
218          .forEach(ioe -> LOG.debug("Attempt to fix region hole in hbase:meta failed.", ioe));
219      }
220    }
221
222    return createMetaEntriesSuccesses;
223  }
224
225  private static void createRegionDirectories(final MasterServices masterServices,
226    final List<RegionInfo> regions) {
227    if (regions.isEmpty()) {
228      return;
229    }
230    final MasterFileSystem mfs = masterServices.getMasterFileSystem();
231    final Path rootDir = mfs.getRootDir();
232    for (RegionInfo regionInfo : regions) {
233      if (regionInfo.getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID) {
234        try {
235          Path tableDir = CommonFSUtils.getTableDir(rootDir, regionInfo.getTable());
236          HRegionFileSystem.createRegionOnFileSystem(masterServices.getConfiguration(),
237            mfs.getFileSystem(), tableDir, regionInfo);
238        } catch (IOException e) {
239          LOG.warn("Failed to create region directory for {}: {}",
240            regionInfo.getRegionNameAsString(), e.getMessage(), e);
241        }
242      }
243    }
244  }
245
246  /**
247   * Fix overlaps noted in CJ consistency report.
248   */
249  List<Long> fixOverlaps(CatalogJanitorReport report) throws IOException {
250    List<Long> pidList = new ArrayList<>();
251    for (Set<RegionInfo> regions : calculateMerges(maxMergeCount, report.getOverlaps())) {
252      RegionInfo[] regionsArray = regions.toArray(new RegionInfo[] {});
253      try {
254        pidList.add(this.masterServices.mergeRegions(regionsArray, true, HConstants.NO_NONCE,
255          HConstants.NO_NONCE));
256      } catch (MergeRegionException mre) {
257        LOG.warn("Failed overlap fix of {}", regionsArray, mre);
258      }
259    }
260    return pidList;
261  }
262
263  /**
264   * Run through <code>overlaps</code> and return a list of merges to run. Presumes overlaps are
265   * ordered (which they are coming out of the CatalogJanitor consistency report).
266   * @param maxMergeCount Maximum regions to merge at a time (avoid merging 100k regions in one go!)
267   */
268  static List<SortedSet<RegionInfo>> calculateMerges(int maxMergeCount,
269    List<Pair<RegionInfo, RegionInfo>> overlaps) {
270    if (overlaps.isEmpty()) {
271      LOG.debug("No overlaps.");
272      return Collections.emptyList();
273    }
274    List<SortedSet<RegionInfo>> merges = new ArrayList<>();
275    // First group overlaps by table then calculate merge table by table.
276    ListMultimap<TableName, Pair<RegionInfo, RegionInfo>> overlapGroups =
277      ArrayListMultimap.create();
278    for (Pair<RegionInfo, RegionInfo> pair : overlaps) {
279      overlapGroups.put(pair.getFirst().getTable(), pair);
280    }
281    for (Map.Entry<TableName, Collection<Pair<RegionInfo, RegionInfo>>> entry : overlapGroups
282      .asMap().entrySet()) {
283      calculateTableMerges(maxMergeCount, merges, entry.getValue());
284    }
285    return merges;
286  }
287
288  private static void calculateTableMerges(int maxMergeCount, List<SortedSet<RegionInfo>> merges,
289    Collection<Pair<RegionInfo, RegionInfo>> overlaps) {
290    SortedSet<RegionInfo> currentMergeSet = new TreeSet<>();
291    HashSet<RegionInfo> regionsInMergeSet = new HashSet<>();
292    RegionInfo regionInfoWithlargestEndKey = null;
293    for (Pair<RegionInfo, RegionInfo> pair : overlaps) {
294      if (regionInfoWithlargestEndKey != null) {
295        if (
296          !isOverlap(regionInfoWithlargestEndKey, pair) || currentMergeSet.size() >= maxMergeCount
297        ) {
298          // Log when we cut-off-merge because we hit the configured maximum merge limit.
299          if (currentMergeSet.size() >= maxMergeCount) {
300            LOG.warn("Ran into maximum-at-a-time merges limit={}", maxMergeCount);
301          }
302
303          // In the case of the merge set contains only 1 region or empty, it does not need to
304          // submit this merge request as no merge is going to happen. currentMergeSet can be
305          // reused in this case.
306          if (currentMergeSet.size() <= 1) {
307            for (RegionInfo ri : currentMergeSet) {
308              regionsInMergeSet.remove(ri);
309            }
310            currentMergeSet.clear();
311          } else {
312            merges.add(currentMergeSet);
313            currentMergeSet = new TreeSet<>();
314          }
315        }
316      }
317
318      // Do not add the same region into multiple merge set, this will fail
319      // the second merge request.
320      if (!regionsInMergeSet.contains(pair.getFirst())) {
321        currentMergeSet.add(pair.getFirst());
322        regionsInMergeSet.add(pair.getFirst());
323      }
324      if (!regionsInMergeSet.contains(pair.getSecond())) {
325        currentMergeSet.add(pair.getSecond());
326        regionsInMergeSet.add(pair.getSecond());
327      }
328
329      regionInfoWithlargestEndKey = getRegionInfoWithLargestEndKey(
330        getRegionInfoWithLargestEndKey(pair.getFirst(), pair.getSecond()),
331        regionInfoWithlargestEndKey);
332    }
333    merges.add(currentMergeSet);
334  }
335
336  /**
337   * @return Either <code>a</code> or <code>b</code>, whichever has the endkey that is furthest
338   *         along in the Table.
339   */
340  static RegionInfo getRegionInfoWithLargestEndKey(RegionInfo a, RegionInfo b) {
341    if (a == null) {
342      // b may be null.
343      return b;
344    }
345    if (b == null) {
346      // Both are null. The return is not-defined.
347      return a;
348    }
349    if (!a.getTable().equals(b.getTable())) {
350      // This is an odd one. This should be the right answer.
351      return b;
352    }
353    if (a.isLast()) {
354      return a;
355    }
356    if (b.isLast()) {
357      return b;
358    }
359    int compare = Bytes.compareTo(a.getEndKey(), b.getEndKey());
360    return compare == 0 || compare > 0 ? a : b;
361  }
362
363  /**
364   * @return True if an overlap found between passed in <code>ri</code> and the <code>pair</code>.
365   *         Does NOT check the pairs themselves overlap.
366   */
367  static boolean isOverlap(RegionInfo ri, Pair<RegionInfo, RegionInfo> pair) {
368    if (ri == null || pair == null) {
369      // Can't be an overlap in either of these cases.
370      return false;
371    }
372    return ri.isOverlap(pair.getFirst()) || ri.isOverlap(pair.getSecond());
373  }
374
375  /**
376   * A union over {@link L} and {@link R}.
377   */
378  private static class Either<L, R> {
379    private final L left;
380    private final R right;
381
382    public static <L, R> Either<L, R> ofLeft(L left) {
383      return new Either<>(left, null);
384    }
385
386    public static <L, R> Either<L, R> ofRight(R right) {
387      return new Either<>(null, right);
388    }
389
390    Either(L left, R right) {
391      this.left = left;
392      this.right = right;
393    }
394
395    public boolean hasLeft() {
396      return left != null;
397    }
398
399    public L getLeft() {
400      if (!hasLeft()) {
401        throw new IllegalStateException("Either contains no left.");
402      }
403      return left;
404    }
405
406    public boolean hasRight() {
407      return right != null;
408    }
409
410    public R getRight() {
411      if (!hasRight()) {
412        throw new IllegalStateException("Either contains no right.");
413      }
414      return right;
415    }
416  }
417}