001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master;
019
020import java.io.IOException;
021import java.util.ArrayList;
022import java.util.HashMap;
023import java.util.List;
024import java.util.Map;
025import org.apache.hadoop.conf.Configuration;
026import org.apache.hadoop.hbase.ClusterMetrics;
027import org.apache.hadoop.hbase.HConstants;
028import org.apache.hadoop.hbase.RegionMetrics;
029import org.apache.hadoop.hbase.ScheduledChore;
030import org.apache.hadoop.hbase.ServerMetrics;
031import org.apache.hadoop.hbase.ServerName;
032import org.apache.hadoop.hbase.Stoppable;
033import org.apache.hadoop.hbase.TableName;
034import org.apache.hadoop.hbase.client.PerClientRandomNonceGenerator;
035import org.apache.hadoop.hbase.client.RegionInfo;
036import org.apache.yetus.audience.InterfaceAudience;
037import org.slf4j.Logger;
038import org.slf4j.LoggerFactory;
039
040import org.apache.hbase.thirdparty.org.apache.commons.collections4.MapUtils;
041
042/**
043 * This chore, every time it runs, will try to recover regions with high store ref count by
044 * reopening them
045 */
046@InterfaceAudience.Private
047public class RegionsRecoveryChore extends ScheduledChore {
048
049  private static final Logger LOG = LoggerFactory.getLogger(RegionsRecoveryChore.class);
050
051  private static final String REGIONS_RECOVERY_CHORE_NAME = "RegionsRecoveryChore";
052
053  private static final String ERROR_REOPEN_REIONS_MSG =
054    "Error reopening regions with high storeRefCount. ";
055
056  private final HMaster hMaster;
057  private final int storeFileRefCountThreshold;
058
059  private static final PerClientRandomNonceGenerator NONCE_GENERATOR =
060    PerClientRandomNonceGenerator.get();
061
062  /**
063   * Construct RegionsRecoveryChore with provided params
064   * @param stopper       When {@link Stoppable#isStopped()} is true, this chore will cancel and
065   *                      cleanup
066   * @param configuration The configuration params to be used
067   * @param hMaster       HMaster instance to initiate RegionTableRegions
068   */
069  RegionsRecoveryChore(final Stoppable stopper, final Configuration configuration,
070    final HMaster hMaster) {
071    super(REGIONS_RECOVERY_CHORE_NAME, stopper, configuration
072      .getInt(HConstants.REGIONS_RECOVERY_INTERVAL, HConstants.DEFAULT_REGIONS_RECOVERY_INTERVAL));
073    this.hMaster = hMaster;
074    this.storeFileRefCountThreshold = configuration.getInt(
075      HConstants.STORE_FILE_REF_COUNT_THRESHOLD, HConstants.DEFAULT_STORE_FILE_REF_COUNT_THRESHOLD);
076
077  }
078
079  @Override
080  protected void chore() {
081    if (LOG.isTraceEnabled()) {
082      LOG.trace(
083        "Starting up Regions Recovery chore for reopening regions based on storeFileRefCount...");
084    }
085    try {
086      // only if storeFileRefCountThreshold > 0, consider the feature turned on
087      if (storeFileRefCountThreshold > 0) {
088        final ClusterMetrics clusterMetrics = hMaster.getClusterMetrics();
089        final Map<ServerName, ServerMetrics> serverMetricsMap =
090          clusterMetrics.getLiveServerMetrics();
091        final Map<TableName, List<byte[]>> tableToReopenRegionsMap =
092          getTableToRegionsByRefCount(serverMetricsMap);
093        if (MapUtils.isNotEmpty(tableToReopenRegionsMap)) {
094          tableToReopenRegionsMap.forEach((tableName, regionNames) -> {
095            try {
096              LOG.warn("Reopening regions due to high storeFileRefCount. "
097                + "TableName: {} , noOfRegions: {}", tableName, regionNames.size());
098              hMaster.reopenRegions(tableName, regionNames, NONCE_GENERATOR.getNonceGroup(),
099                NONCE_GENERATOR.newNonce());
100            } catch (IOException e) {
101              LOG.error("{} tableName: {}, regionNames: {}", ERROR_REOPEN_REIONS_MSG, tableName,
102                regionNames, e);
103            }
104          });
105        }
106      } else {
107        if (LOG.isDebugEnabled()) {
108          LOG.debug(
109            "Reopening regions with very high storeFileRefCount is disabled. "
110              + "Provide threshold value > 0 for {} to enable it.",
111            HConstants.STORE_FILE_REF_COUNT_THRESHOLD);
112        }
113      }
114    } catch (Exception e) {
115      LOG.error("Error while reopening regions based on storeRefCount threshold", e);
116    }
117    if (LOG.isTraceEnabled()) {
118      LOG.trace(
119        "Exiting Regions Recovery chore for reopening regions based on storeFileRefCount...");
120    }
121  }
122
123  private Map<TableName, List<byte[]>>
124    getTableToRegionsByRefCount(final Map<ServerName, ServerMetrics> serverMetricsMap) {
125    final Map<TableName, List<byte[]>> tableToReopenRegionsMap = new HashMap<>();
126    for (ServerMetrics serverMetrics : serverMetricsMap.values()) {
127      Map<byte[], RegionMetrics> regionMetricsMap = serverMetrics.getRegionMetrics();
128      for (RegionMetrics regionMetrics : regionMetricsMap.values()) {
129        // For each region, each compacted store file can have different ref counts
130        // We need to find maximum of all such ref counts and if that max count of compacted
131        // store files is beyond a threshold value, we should reopen the region.
132        // Here, we take max ref count of all compacted store files and not the cumulative
133        // count of all compacted store files
134        final int maxCompactedStoreFileRefCount = regionMetrics.getMaxCompactedStoreFileRefCount();
135
136        if (maxCompactedStoreFileRefCount > storeFileRefCountThreshold) {
137          final byte[] regionName = regionMetrics.getRegionName();
138          prepareTableToReopenRegionsMap(tableToReopenRegionsMap, regionName,
139            maxCompactedStoreFileRefCount);
140        }
141      }
142    }
143    return tableToReopenRegionsMap;
144  }
145
146  private void prepareTableToReopenRegionsMap(
147    final Map<TableName, List<byte[]>> tableToReopenRegionsMap, final byte[] regionName,
148    final int regionStoreRefCount) {
149    final RegionInfo regionInfo = hMaster.getAssignmentManager().getRegionInfo(regionName);
150    final TableName tableName = regionInfo.getTable();
151    if (TableName.isMetaTableName(tableName)) {
152      // Do not reopen regions of meta table even if it has
153      // high store file reference count
154      return;
155    }
156    LOG.warn("Region {} for Table {} has high storeFileRefCount {}, considering it for reopen..",
157      regionInfo.getRegionNameAsString(), tableName, regionStoreRefCount);
158    tableToReopenRegionsMap.computeIfAbsent(tableName, (key) -> new ArrayList<>()).add(regionName);
159  }
160}