001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master;
019
020import java.io.IOException;
021import java.util.ArrayList;
022import java.util.HashMap;
023import java.util.List;
024import java.util.Map;
025import org.apache.hadoop.conf.Configuration;
026import org.apache.hadoop.hbase.ClusterMetrics;
027import org.apache.hadoop.hbase.HConstants;
028import org.apache.hadoop.hbase.RegionMetrics;
029import org.apache.hadoop.hbase.ScheduledChore;
030import org.apache.hadoop.hbase.ServerMetrics;
031import org.apache.hadoop.hbase.ServerName;
032import org.apache.hadoop.hbase.Stoppable;
033import org.apache.hadoop.hbase.TableName;
034import org.apache.hadoop.hbase.client.PerClientRandomNonceGenerator;
035import org.apache.hadoop.hbase.client.RegionInfo;
036import org.apache.yetus.audience.InterfaceAudience;
037import org.slf4j.Logger;
038import org.slf4j.LoggerFactory;
039
040import org.apache.hbase.thirdparty.org.apache.commons.collections4.MapUtils;
041
042/**
043 * This chore, every time it runs, will try to recover regions with high store ref count by
044 * reopening them
045 */
046@InterfaceAudience.Private
047public class RegionsRecoveryChore extends ScheduledChore {
048
049  private static final Logger LOG = LoggerFactory.getLogger(RegionsRecoveryChore.class);
050
051  private static final String REGIONS_RECOVERY_CHORE_NAME = "RegionsRecoveryChore";
052
053  private static final String ERROR_REOPEN_REGIONS_MSG =
054    "Error reopening regions with high storeRefCount.";
055
056  private final HMaster hMaster;
057  private final int storeFileRefCountThreshold;
058
059  private static final PerClientRandomNonceGenerator NONCE_GENERATOR =
060    PerClientRandomNonceGenerator.get();
061
062  /**
063   * Construct RegionsRecoveryChore with provided params
064   * @param stopper       When {@link Stoppable#isStopped()} is true, this chore will cancel and
065   *                      cleanup
066   * @param configuration The configuration params to be used
067   * @param hMaster       HMaster instance to initiate RegionTableRegions
068   */
069  RegionsRecoveryChore(final Stoppable stopper, final Configuration configuration,
070    final HMaster hMaster) {
071    super(REGIONS_RECOVERY_CHORE_NAME, stopper, configuration
072      .getInt(HConstants.REGIONS_RECOVERY_INTERVAL, HConstants.DEFAULT_REGIONS_RECOVERY_INTERVAL));
073    this.hMaster = hMaster;
074    this.storeFileRefCountThreshold = configuration.getInt(
075      HConstants.STORE_FILE_REF_COUNT_THRESHOLD, HConstants.DEFAULT_STORE_FILE_REF_COUNT_THRESHOLD);
076  }
077
078  @Override
079  protected void chore() {
080    if (LOG.isTraceEnabled()) {
081      LOG.trace(
082        "Starting up Regions Recovery chore for reopening regions based on storeFileRefCount...");
083    }
084    try {
085      if (!hMaster.isInitialized()) {
086        if (LOG.isDebugEnabled()) {
087          LOG.debug("Skipping regions recovery chore because master is not initialized yet");
088        }
089        return;
090      }
091      // only if storeFileRefCountThreshold > 0, consider the feature turned on
092      if (storeFileRefCountThreshold > 0) {
093        final ClusterMetrics clusterMetrics = hMaster.getClusterMetrics();
094        final Map<ServerName, ServerMetrics> serverMetricsMap =
095          clusterMetrics.getLiveServerMetrics();
096        final Map<TableName, List<byte[]>> tableToReopenRegionsMap =
097          getTableToRegionsByRefCount(serverMetricsMap);
098        if (MapUtils.isNotEmpty(tableToReopenRegionsMap)) {
099          tableToReopenRegionsMap.forEach((tableName, regionNames) -> {
100            try {
101              LOG.warn("Reopening regions due to high storeFileRefCount. "
102                + "TableName: {}, numOfRegions: {}", tableName, regionNames.size());
103              hMaster.reopenRegionsThrottled(tableName, regionNames,
104                NONCE_GENERATOR.getNonceGroup(), NONCE_GENERATOR.newNonce());
105            } catch (IOException e) {
106              LOG.error("{} tableName: {}, regionNames: {}", ERROR_REOPEN_REGIONS_MSG, tableName,
107                regionNames, e);
108            }
109          });
110        }
111      } else {
112        if (LOG.isDebugEnabled()) {
113          LOG.debug(
114            "Reopening regions with very high storeFileRefCount is disabled. "
115              + "Provide threshold value > 0 for {} to enable it.",
116            HConstants.STORE_FILE_REF_COUNT_THRESHOLD);
117        }
118      }
119    } catch (Exception e) {
120      LOG.error("Error while reopening regions based on storeRefCount threshold", e);
121    }
122    if (LOG.isTraceEnabled()) {
123      LOG.trace(
124        "Exiting Regions Recovery chore for reopening regions based on storeFileRefCount...");
125    }
126  }
127
128  private Map<TableName, List<byte[]>>
129    getTableToRegionsByRefCount(final Map<ServerName, ServerMetrics> serverMetricsMap) {
130    final Map<TableName, List<byte[]>> tableToReopenRegionsMap = new HashMap<>();
131    for (ServerMetrics serverMetrics : serverMetricsMap.values()) {
132      Map<byte[], RegionMetrics> regionMetricsMap = serverMetrics.getRegionMetrics();
133      for (RegionMetrics regionMetrics : regionMetricsMap.values()) {
134        // For each region, each compacted store file can have different ref counts
135        // We need to find maximum of all such ref counts and if that max count of compacted
136        // store files is beyond a threshold value, we should reopen the region.
137        // Here, we take max ref count of all compacted store files and not the cumulative
138        // count of all compacted store files
139        final int maxCompactedStoreFileRefCount = regionMetrics.getMaxCompactedStoreFileRefCount();
140
141        if (maxCompactedStoreFileRefCount > storeFileRefCountThreshold) {
142          final byte[] regionName = regionMetrics.getRegionName();
143          prepareTableToReopenRegionsMap(tableToReopenRegionsMap, regionName,
144            maxCompactedStoreFileRefCount);
145        }
146      }
147    }
148    return tableToReopenRegionsMap;
149  }
150
151  private void prepareTableToReopenRegionsMap(
152    final Map<TableName, List<byte[]>> tableToReopenRegionsMap, final byte[] regionName,
153    final int regionStoreRefCount) {
154    final RegionInfo regionInfo = hMaster.getAssignmentManager().getRegionInfo(regionName);
155    final TableName tableName = regionInfo.getTable();
156    if (TableName.isMetaTableName(tableName)) {
157      // Do not reopen regions of meta table even if it has
158      // high store file reference count
159      return;
160    }
161    LOG.warn("Region {} for Table {} has high storeFileRefCount {}, considering it for reopen..",
162      regionInfo.getRegionNameAsString(), tableName, regionStoreRefCount);
163    tableToReopenRegionsMap.computeIfAbsent(tableName, (key) -> new ArrayList<>()).add(regionName);
164  }
165}