001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.hbck;
019
020import java.io.IOException;
021import java.time.Instant;
022import java.util.HashMap;
023import java.util.HashSet;
024import java.util.LinkedList;
025import java.util.List;
026import java.util.Map;
027import java.util.Set;
028import org.apache.hadoop.fs.FileSystem;
029import org.apache.hadoop.fs.Path;
030import org.apache.hadoop.hbase.MetaTableAccessor;
031import org.apache.hadoop.hbase.ScheduledChore;
032import org.apache.hadoop.hbase.ServerName;
033import org.apache.hadoop.hbase.client.RegionInfo;
034import org.apache.hadoop.hbase.client.TableState;
035import org.apache.hadoop.hbase.master.MasterServices;
036import org.apache.hadoop.hbase.master.RegionState;
037import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
038import org.apache.hadoop.hbase.util.FSUtils;
039import org.apache.hadoop.hbase.util.HbckRegionInfo;
040import org.apache.hadoop.hbase.util.Pair;
041import org.apache.yetus.audience.InterfaceAudience;
042import org.apache.yetus.audience.InterfaceStability;
043import org.slf4j.Logger;
044import org.slf4j.LoggerFactory;
045
046/**
047 * Used to do the hbck checking job at master side.
048 */
049@InterfaceAudience.Private
050@InterfaceStability.Evolving
051public class HbckChore extends ScheduledChore {
052  private static final Logger LOG = LoggerFactory.getLogger(HbckChore.class.getName());
053
054  private static final String HBCK_CHORE_INTERVAL = "hbase.master.hbck.chore.interval";
055  private static final int DEFAULT_HBCK_CHORE_INTERVAL = 60 * 60 * 1000;
056
057  private final MasterServices master;
058
059  /**
060   * Saved report from last time this chore ran. Check its date.
061   */
062  private volatile HbckReport lastReport = null;
063
064  /**
065   * When running, the "snapshot" may be changed when this round's checking finish.
066   */
067  private volatile boolean running = false;
068
069  private boolean disabled = false;
070
071  public HbckChore(MasterServices master) {
072    super("HbckChore-", master,
073      master.getConfiguration().getInt(HBCK_CHORE_INTERVAL, DEFAULT_HBCK_CHORE_INTERVAL));
074    this.master = master;
075    int interval =
076      master.getConfiguration().getInt(HBCK_CHORE_INTERVAL, DEFAULT_HBCK_CHORE_INTERVAL);
077    if (interval <= 0) {
078      LOG.warn(HBCK_CHORE_INTERVAL + " is <=0 hence disabling hbck chore");
079      disableChore();
080    }
081  }
082
083  /**
084   * Returns Returns last published Report that comes of last successful execution of this chore.
085   */
086  public HbckReport getLastReport() {
087    return lastReport;
088  }
089
090  @Override
091  protected synchronized void chore() {
092    if (isDisabled() || isRunning()) {
093      LOG.warn("hbckChore is either disabled or is already running. Can't run the chore");
094      return;
095    }
096    running = true;
097    final HbckReport report = new HbckReport();
098    report.setCheckingStartTimestamp(Instant.ofEpochMilli(EnvironmentEdgeManager.currentTime()));
099    try {
100      loadRegionsFromInMemoryState(report);
101      loadRegionsFromRSReport(report);
102      try {
103        loadRegionsFromFS(scanForMergedParentRegions(), report);
104      } catch (IOException e) {
105        LOG.warn("Failed to load the regions from filesystem", e);
106      }
107    } catch (Throwable t) {
108      LOG.warn("Unexpected", t);
109    }
110    report.setCheckingEndTimestamp(Instant.ofEpochMilli(EnvironmentEdgeManager.currentTime()));
111    this.lastReport = report;
112    running = false;
113    updateAssignmentManagerMetrics(report);
114  }
115
116  /**
117   * Request execution of this chore's action.
118   * @return {@code true} if the chore was executed, {@code false} if the chore is disabled or
119   *         already running.
120   */
121  public boolean runChore() {
122    // This function does the sanity checks of making sure the chore is not run when it is
123    // disabled or when it's already running. It returns whether the chore was actually run or not.
124    if (isDisabled() || isRunning()) {
125      if (isDisabled()) {
126        LOG.warn("hbck chore is disabled! Set " + HBCK_CHORE_INTERVAL + " > 0 to enable it.");
127      } else {
128        LOG.warn("hbck chore already running. Can't run till it finishes.");
129      }
130      return false;
131    }
132    chore();
133    return true;
134  }
135
136  private void disableChore() {
137    this.disabled = true;
138  }
139
140  public boolean isDisabled() {
141    return this.disabled;
142  }
143
144  /**
145   * Scan hbase:meta to get set of merged parent regions, this is a very heavy scan.
146   * @return Return generated {@link HashSet}
147   */
148  private HashSet<String> scanForMergedParentRegions() throws IOException {
149    HashSet<String> mergedParentRegions = new HashSet<>();
150    // Null tablename means scan all of meta.
151    MetaTableAccessor.scanMetaForTableRegions(this.master.getConnection(), r -> {
152      List<RegionInfo> mergeParents = MetaTableAccessor.getMergeRegions(r.rawCells());
153      if (mergeParents != null) {
154        for (RegionInfo mergeRegion : mergeParents) {
155          if (mergeRegion != null) {
156            // This region is already being merged
157            mergedParentRegions.add(mergeRegion.getEncodedName());
158          }
159        }
160      }
161      return true;
162    }, null);
163    return mergedParentRegions;
164  }
165
166  private void loadRegionsFromInMemoryState(final HbckReport report) {
167    List<RegionState> regionStates =
168      master.getAssignmentManager().getRegionStates().getRegionStates();
169    for (RegionState regionState : regionStates) {
170      RegionInfo regionInfo = regionState.getRegion();
171      if (
172        master.getTableStateManager().isTableState(regionInfo.getTable(), TableState.State.DISABLED)
173      ) {
174        report.getDisabledTableRegions().add(regionInfo.getRegionNameAsString());
175      }
176      // Check both state and regioninfo for split status, see HBASE-26383
177      if (regionState.isSplit() || regionInfo.isSplit()) {
178        report.getSplitParentRegions().add(regionInfo.getRegionNameAsString());
179      }
180      HbckRegionInfo.MetaEntry metaEntry = new HbckRegionInfo.MetaEntry(regionInfo,
181        regionState.getServerName(), regionState.getStamp());
182      report.getRegionInfoMap().put(regionInfo.getEncodedName(), new HbckRegionInfo(metaEntry));
183    }
184    LOG.info("Loaded {} regions ({} disabled, {} split parents) from in-memory state",
185      regionStates.size(), report.getDisabledTableRegions().size(),
186      report.getSplitParentRegions().size());
187    if (LOG.isDebugEnabled()) {
188      Map<RegionState.State, Integer> stateCountMap = new HashMap<>();
189      for (RegionState regionState : regionStates) {
190        stateCountMap.compute(regionState.getState(), (k, v) -> (v == null) ? 1 : v + 1);
191      }
192      StringBuffer sb = new StringBuffer();
193      sb.append("Regions by state: ");
194      stateCountMap.entrySet().forEach(e -> {
195        sb.append(e.getKey());
196        sb.append('=');
197        sb.append(e.getValue());
198        sb.append(' ');
199      });
200      LOG.debug(sb.toString());
201    }
202    if (LOG.isTraceEnabled()) {
203      for (RegionState regionState : regionStates) {
204        LOG.trace("{}: {}, serverName={}", regionState.getRegion(), regionState.getState(),
205          regionState.getServerName());
206      }
207    }
208  }
209
210  private void loadRegionsFromRSReport(final HbckReport report) {
211    int numRegions = 0;
212    Map<ServerName, Set<byte[]>> rsReports = master.getAssignmentManager().getRSReports();
213    for (Map.Entry<ServerName, Set<byte[]>> entry : rsReports.entrySet()) {
214      ServerName serverName = entry.getKey();
215      for (byte[] regionName : entry.getValue()) {
216        String encodedRegionName = RegionInfo.encodeRegionName(regionName);
217        HbckRegionInfo hri = report.getRegionInfoMap().get(encodedRegionName);
218        if (hri == null) {
219          report.getOrphanRegionsOnRS().put(RegionInfo.getRegionNameAsString(regionName),
220            serverName);
221          continue;
222        }
223        hri.addServer(hri.getMetaEntry(), serverName);
224      }
225      numRegions += entry.getValue().size();
226    }
227    LOG.info("Loaded {} regions from {} regionservers' reports and found {} orphan regions",
228      numRegions, rsReports.size(), report.getOrphanRegionsOnRS().size());
229
230    for (Map.Entry<String, HbckRegionInfo> entry : report.getRegionInfoMap().entrySet()) {
231      HbckRegionInfo hri = entry.getValue();
232      ServerName locationInMeta = hri.getMetaEntry().getRegionServer();
233      if (locationInMeta == null) {
234        continue;
235      }
236      if (hri.getDeployedOn().size() == 0) {
237        // skip the offline region which belong to disabled table.
238        if (report.getDisabledTableRegions().contains(hri.getRegionNameAsString())) {
239          continue;
240        }
241        // skip the split parent regions
242        if (report.getSplitParentRegions().contains(hri.getRegionNameAsString())) {
243          continue;
244        }
245        // Master thought this region opened, but no regionserver reported it.
246        report.getInconsistentRegions().put(hri.getRegionNameAsString(),
247          new Pair<>(locationInMeta, new LinkedList<>()));
248      } else if (hri.getDeployedOn().size() > 1) {
249        // More than one regionserver reported opened this region
250        report.getInconsistentRegions().put(hri.getRegionNameAsString(),
251          new Pair<>(locationInMeta, hri.getDeployedOn()));
252      } else if (!hri.getDeployedOn().get(0).equals(locationInMeta)) {
253        // Master thought this region opened on Server1, but regionserver reported Server2
254        report.getInconsistentRegions().put(hri.getRegionNameAsString(),
255          new Pair<>(locationInMeta, hri.getDeployedOn()));
256      }
257    }
258  }
259
260  private void loadRegionsFromFS(final HashSet<String> mergedParentRegions, final HbckReport report)
261    throws IOException {
262    Path rootDir = master.getMasterFileSystem().getRootDir();
263    FileSystem fs = master.getMasterFileSystem().getFileSystem();
264
265    int numRegions = 0;
266    List<Path> tableDirs = FSUtils.getTableDirs(fs, rootDir);
267    for (Path tableDir : tableDirs) {
268      List<Path> regionDirs = FSUtils.getRegionDirs(fs, tableDir);
269      for (Path regionDir : regionDirs) {
270        String encodedRegionName = regionDir.getName();
271        if (encodedRegionName == null) {
272          LOG.warn("Failed get of encoded name from {}", regionDir);
273          continue;
274        }
275        HbckRegionInfo hri = report.getRegionInfoMap().get(encodedRegionName);
276        // If it is not in in-memory database and not a merged region,
277        // report it as an orphan region.
278        if (hri == null && !mergedParentRegions.contains(encodedRegionName)) {
279          report.getOrphanRegionsOnFS().put(encodedRegionName, regionDir);
280          continue;
281        }
282      }
283      numRegions += regionDirs.size();
284    }
285    LOG.info("Loaded {} tables {} regions from filesystem and found {} orphan regions",
286      tableDirs.size(), numRegions, report.getOrphanRegionsOnFS().size());
287  }
288
289  private void updateAssignmentManagerMetrics(final HbckReport report) {
290    master.getAssignmentManager().getAssignmentManagerMetrics()
291      .updateOrphanRegionsOnRs(report.getOrphanRegionsOnRS().size());
292    master.getAssignmentManager().getAssignmentManagerMetrics()
293      .updateOrphanRegionsOnFs(report.getOrphanRegionsOnFS().size());
294    master.getAssignmentManager().getAssignmentManagerMetrics()
295      .updateInconsistentRegions(report.getInconsistentRegions().size());
296  }
297
298  /**
299   * When running, the HBCK report may be changed later.
300   */
301  public boolean isRunning() {
302    return running;
303  }
304}