001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.util.hbck;
019
020import java.io.FileNotFoundException;
021import java.io.IOException;
022import java.util.ArrayList;
023import java.util.Collection;
024import java.util.HashSet;
025import java.util.List;
026import java.util.Set;
027import java.util.concurrent.Callable;
028import java.util.concurrent.ConcurrentSkipListSet;
029import java.util.concurrent.ExecutionException;
030import java.util.concurrent.ExecutorService;
031import java.util.concurrent.Future;
032import java.util.concurrent.atomic.AtomicInteger;
033import org.apache.hadoop.conf.Configuration;
034import org.apache.hadoop.fs.FileStatus;
035import org.apache.hadoop.fs.FileSystem;
036import org.apache.hadoop.fs.Path;
037import org.apache.hadoop.hbase.HConstants;
038import org.apache.hadoop.hbase.TableName;
039import org.apache.hadoop.hbase.io.hfile.CacheConfig;
040import org.apache.hadoop.hbase.io.hfile.CorruptHFileException;
041import org.apache.hadoop.hbase.io.hfile.HFile;
042import org.apache.hadoop.hbase.mob.MobUtils;
043import org.apache.hadoop.hbase.util.CommonFSUtils;
044import org.apache.hadoop.hbase.util.FSUtils;
045import org.apache.hadoop.hbase.util.FSUtils.FamilyDirFilter;
046import org.apache.hadoop.hbase.util.FSUtils.HFileFilter;
047import org.apache.hadoop.hbase.util.FSUtils.RegionDirFilter;
048import org.apache.hadoop.hbase.util.HbckErrorReporter;
049import org.apache.yetus.audience.InterfaceAudience;
050import org.slf4j.Logger;
051import org.slf4j.LoggerFactory;
052
/**
 * This class marches through all of the region's hfiles and verifies that they are all valid files.
 * One just needs to instantiate the class, call {@link #checkTables(Collection)} and then retrieve
 * the corrupted hfiles (and the quarantined files if in quarantining mode). The implementation
 * currently parallelizes at the regionDir level.
 */
059@InterfaceAudience.Private
060public class HFileCorruptionChecker {
061  private static final Logger LOG = LoggerFactory.getLogger(HFileCorruptionChecker.class);
062
063  final Configuration conf;
064  final FileSystem fs;
065  final CacheConfig cacheConf;
066  final ExecutorService executor;
067  final Set<Path> corrupted = new ConcurrentSkipListSet<>();
068  final Set<Path> failures = new ConcurrentSkipListSet<>();
069  final Set<Path> quarantined = new ConcurrentSkipListSet<>();
070  final Set<Path> missing = new ConcurrentSkipListSet<>();
071  final Set<Path> corruptedMobFiles = new ConcurrentSkipListSet<>();
072  final Set<Path> failureMobFiles = new ConcurrentSkipListSet<>();
073  final Set<Path> missedMobFiles = new ConcurrentSkipListSet<>();
074  final Set<Path> quarantinedMobFiles = new ConcurrentSkipListSet<>();
075  final boolean inQuarantineMode;
076  final AtomicInteger hfilesChecked = new AtomicInteger();
077  final AtomicInteger mobFilesChecked = new AtomicInteger();
078
079  public HFileCorruptionChecker(Configuration conf, ExecutorService executor, boolean quarantine)
080    throws IOException {
081    this.conf = conf;
082    this.fs = FileSystem.get(conf);
083    this.cacheConf = CacheConfig.DISABLED;
084    this.executor = executor;
085    this.inQuarantineMode = quarantine;
086  }
087
088  /**
089   * Checks a path to see if it is a valid hfile. n * full Path to an HFile n * This is a
090   * connectivity related exception
091   */
092  protected void checkHFile(Path p) throws IOException {
093    HFile.Reader r = null;
094    try {
095      r = HFile.createReader(fs, p, cacheConf, true, conf);
096    } catch (CorruptHFileException che) {
097      LOG.warn("Found corrupt HFile " + p, che);
098      corrupted.add(p);
099      if (inQuarantineMode) {
100        Path dest = createQuarantinePath(p);
101        LOG.warn("Quarantining corrupt HFile " + p + " into " + dest);
102        boolean success = fs.mkdirs(dest.getParent());
103        success = success ? fs.rename(p, dest) : false;
104        if (!success) {
105          failures.add(p);
106        } else {
107          quarantined.add(dest);
108        }
109      }
110      return;
111    } catch (FileNotFoundException fnfe) {
112      LOG.warn("HFile " + p + " was missing.  Likely removed due to compaction/split?");
113      missing.add(p);
114    } finally {
115      hfilesChecked.addAndGet(1);
116      if (r != null) {
117        r.close(true);
118      }
119    }
120  }
121
122  /**
123   * Given a path, generates a new path to where we move a corrupted hfile (bad trailer, no
124   * trailer). n * Path to a corrupt hfile (assumes that it is HBASE_DIR/ table /region/cf/file)
125   * @return path to where corrupted files are stored. This should be
126   *         HBASE_DIR/.corrupt/table/region/cf/file.
127   */
128  Path createQuarantinePath(Path hFile) throws IOException {
129    // extract the normal dirs structure
130    Path cfDir = hFile.getParent();
131    Path regionDir = cfDir.getParent();
132    Path tableDir = regionDir.getParent();
133
134    // build up the corrupted dirs structure
135    Path corruptBaseDir = new Path(CommonFSUtils.getRootDir(conf), HConstants.CORRUPT_DIR_NAME);
136    if (conf.get("hbase.hfile.quarantine.dir") != null) {
137      LOG.warn("hbase.hfile.quarantine.dir is deprecated. Default to " + corruptBaseDir);
138    }
139    Path corruptTableDir = new Path(corruptBaseDir, tableDir.getName());
140    Path corruptRegionDir = new Path(corruptTableDir, regionDir.getName());
141    Path corruptFamilyDir = new Path(corruptRegionDir, cfDir.getName());
142    Path corruptHfile = new Path(corruptFamilyDir, hFile.getName());
143    return corruptHfile;
144  }
145
146  /**
147   * Check all files in a column family dir. n * column family directory n
148   */
149  protected void checkColFamDir(Path cfDir) throws IOException {
150    FileStatus[] statuses = null;
151    try {
152      statuses = fs.listStatus(cfDir); // use same filter as scanner.
153    } catch (FileNotFoundException fnfe) {
154      // Hadoop 0.23+ listStatus semantics throws an exception if the path does not exist.
155      LOG.warn("Colfam Directory " + cfDir
156        + " does not exist.  Likely due to concurrent split/compaction. Skipping.");
157      missing.add(cfDir);
158      return;
159    }
160
161    List<FileStatus> hfs = FSUtils.filterFileStatuses(statuses, new HFileFilter(fs));
162    // Hadoop 1.0 listStatus does not throw an exception if the path does not exist.
163    if (hfs.isEmpty() && !fs.exists(cfDir)) {
164      LOG.warn("Colfam Directory " + cfDir
165        + " does not exist.  Likely due to concurrent split/compaction. Skipping.");
166      missing.add(cfDir);
167      return;
168    }
169
170    LOG.info("Checking Column Family Directory {}. Number of entries = {}", cfDir, hfs.size());
171
172    for (FileStatus hfFs : hfs) {
173      Path hf = hfFs.getPath();
174      checkHFile(hf);
175    }
176  }
177
178  /**
179   * Check all files in a mob column family dir. n * mob column family directory n
180   */
181  protected void checkMobColFamDir(Path cfDir) throws IOException {
182    FileStatus[] statuses = null;
183    try {
184      statuses = fs.listStatus(cfDir); // use same filter as scanner.
185    } catch (FileNotFoundException fnfe) {
186      // Hadoop 0.23+ listStatus semantics throws an exception if the path does not exist.
187      LOG.warn("Mob colfam Directory " + cfDir
188        + " does not exist.  Likely the table is deleted. Skipping.");
189      missedMobFiles.add(cfDir);
190      return;
191    }
192
193    List<FileStatus> hfs = FSUtils.filterFileStatuses(statuses, new HFileFilter(fs));
194    // Hadoop 1.0 listStatus does not throw an exception if the path does not exist.
195    if (hfs.isEmpty() && !fs.exists(cfDir)) {
196      LOG.warn("Mob colfam Directory " + cfDir
197        + " does not exist.  Likely the table is deleted. Skipping.");
198      missedMobFiles.add(cfDir);
199      return;
200    }
201
202    LOG.info("Checking MOB Column Family Directory {}. Number of entries = {}", cfDir, hfs.size());
203
204    for (FileStatus hfFs : hfs) {
205      Path hf = hfFs.getPath();
206      checkMobFile(hf);
207    }
208  }
209
210  /**
211   * Checks a path to see if it is a valid mob file. n * full Path to a mob file. n * This is a
212   * connectivity related exception
213   */
214  protected void checkMobFile(Path p) throws IOException {
215    HFile.Reader r = null;
216    try {
217      r = HFile.createReader(fs, p, cacheConf, true, conf);
218    } catch (CorruptHFileException che) {
219      LOG.warn("Found corrupt mob file " + p, che);
220      corruptedMobFiles.add(p);
221      if (inQuarantineMode) {
222        Path dest = createQuarantinePath(p);
223        LOG.warn("Quarantining corrupt mob file " + p + " into " + dest);
224        boolean success = fs.mkdirs(dest.getParent());
225        success = success ? fs.rename(p, dest) : false;
226        if (!success) {
227          failureMobFiles.add(p);
228        } else {
229          quarantinedMobFiles.add(dest);
230        }
231      }
232      return;
233    } catch (FileNotFoundException fnfe) {
234      LOG.warn("Mob file " + p + " was missing.  Likely removed due to compaction?");
235      missedMobFiles.add(p);
236    } finally {
237      mobFilesChecked.addAndGet(1);
238      if (r != null) {
239        r.close(true);
240      }
241    }
242  }
243
244  /**
245   * Checks all the mob files of a table.
246   * @param regionDir The mob region directory n
247   */
248  private void checkMobRegionDir(Path regionDir) throws IOException {
249    if (!fs.exists(regionDir)) {
250      return;
251    }
252    FileStatus[] hfs = null;
253    try {
254      hfs = fs.listStatus(regionDir, new FamilyDirFilter(fs));
255    } catch (FileNotFoundException fnfe) {
256      // Hadoop 0.23+ listStatus semantics throws an exception if the path does not exist.
257      LOG.warn(
258        "Mob directory " + regionDir + " does not exist.  Likely the table is deleted. Skipping.");
259      missedMobFiles.add(regionDir);
260      return;
261    }
262
263    // Hadoop 1.0 listStatus does not throw an exception if the path does not exist.
264    if (hfs.length == 0 && !fs.exists(regionDir)) {
265      LOG.warn(
266        "Mob directory " + regionDir + " does not exist.  Likely the table is deleted. Skipping.");
267      missedMobFiles.add(regionDir);
268      return;
269    }
270
271    LOG.info("Checking MOB Region Directory {}. Number of entries = {}", regionDir, hfs.length);
272
273    for (FileStatus hfFs : hfs) {
274      Path hf = hfFs.getPath();
275      checkMobColFamDir(hf);
276    }
277  }
278
279  /**
280   * Check all column families in a region dir. n * region directory n
281   */
282  protected void checkRegionDir(Path regionDir) throws IOException {
283    FileStatus[] statuses = null;
284    try {
285      statuses = fs.listStatus(regionDir);
286    } catch (FileNotFoundException fnfe) {
287      // Hadoop 0.23+ listStatus semantics throws an exception if the path does not exist.
288      LOG.warn("Region Directory " + regionDir
289        + " does not exist.  Likely due to concurrent split/compaction. Skipping.");
290      missing.add(regionDir);
291      return;
292    }
293
294    List<FileStatus> cfs = FSUtils.filterFileStatuses(statuses, new FamilyDirFilter(fs));
295    // Hadoop 1.0 listStatus does not throw an exception if the path does not exist.
296    if (cfs.isEmpty() && !fs.exists(regionDir)) {
297      LOG.warn("Region Directory " + regionDir
298        + " does not exist.  Likely due to concurrent split/compaction. Skipping.");
299      missing.add(regionDir);
300      return;
301    }
302
303    LOG.info("Checking Region Directory {}. Number of entries = {}", regionDir, cfs.size());
304
305    for (FileStatus cfFs : cfs) {
306      Path cfDir = cfFs.getPath();
307      checkColFamDir(cfDir);
308    }
309  }
310
311  /**
312   * Check all the regiondirs in the specified tableDir n * path to a table n
313   */
314  void checkTableDir(Path tableDir) throws IOException {
315    List<FileStatus> rds =
316      FSUtils.listStatusWithStatusFilter(fs, tableDir, new RegionDirFilter(fs));
317    if (rds == null) {
318      if (!fs.exists(tableDir)) {
319        LOG.warn("Table Directory " + tableDir
320          + " does not exist.  Likely due to concurrent delete. Skipping.");
321        missing.add(tableDir);
322      }
323      return;
324    }
325
326    LOG.info("Checking Table Directory {}. Number of entries (including mob) = {}", tableDir,
327      rds.size() + 1);
328
329    // Parallelize check at the region dir level
330    List<RegionDirChecker> rdcs = new ArrayList<>(rds.size() + 1);
331    List<Future<Void>> rdFutures;
332
333    for (FileStatus rdFs : rds) {
334      Path rdDir = rdFs.getPath();
335      RegionDirChecker work = new RegionDirChecker(rdDir);
336      rdcs.add(work);
337    }
338
339    // add mob region
340    rdcs.add(createMobRegionDirChecker(tableDir));
341    // Submit and wait for completion
342    try {
343      rdFutures = executor.invokeAll(rdcs);
344    } catch (InterruptedException ie) {
345      Thread.currentThread().interrupt();
346      LOG.warn("Region dirs checking interrupted!", ie);
347      return;
348    }
349
350    for (int i = 0; i < rdFutures.size(); i++) {
351      Future<Void> f = rdFutures.get(i);
352      try {
353        f.get();
354      } catch (ExecutionException e) {
355        LOG.warn("Failed to quarantine an HFile in regiondir " + rdcs.get(i).regionDir,
356          e.getCause());
357        // rethrow IOExceptions
358        if (e.getCause() instanceof IOException) {
359          throw (IOException) e.getCause();
360        }
361
362        // rethrow RuntimeExceptions
363        if (e.getCause() instanceof RuntimeException) {
364          throw (RuntimeException) e.getCause();
365        }
366
367        // this should never happen
368        LOG.error("Unexpected exception encountered", e);
369        return; // bailing out.
370      } catch (InterruptedException ie) {
371        Thread.currentThread().interrupt();
372        LOG.warn("Region dirs check interrupted!", ie);
373        // bailing out
374        return;
375      }
376    }
377  }
378
379  /**
380   * An individual work item for parallelized regiondir processing. This is intentionally an inner
381   * class so it can use the shared error sets and fs.
382   */
383  private class RegionDirChecker implements Callable<Void> {
384    final Path regionDir;
385
386    RegionDirChecker(Path regionDir) {
387      this.regionDir = regionDir;
388    }
389
390    @Override
391    public Void call() throws IOException {
392      checkRegionDir(regionDir);
393      return null;
394    }
395  }
396
397  /**
398   * An individual work item for parallelized mob dir processing. This is intentionally an inner
399   * class so it can use the shared error sets and fs.
400   */
401  private class MobRegionDirChecker extends RegionDirChecker {
402
403    MobRegionDirChecker(Path regionDir) {
404      super(regionDir);
405    }
406
407    @Override
408    public Void call() throws IOException {
409      checkMobRegionDir(regionDir);
410      return null;
411    }
412  }
413
414  /**
415   * Creates an instance of MobRegionDirChecker.
416   * @param tableDir The current table directory.
417   * @return An instance of MobRegionDirChecker.
418   */
419  private MobRegionDirChecker createMobRegionDirChecker(Path tableDir) {
420    TableName tableName = CommonFSUtils.getTableName(tableDir);
421    Path mobDir = MobUtils.getMobRegionPath(conf, tableName);
422    return new MobRegionDirChecker(mobDir);
423  }
424
425  /**
426   * Check the specified table dirs for bad hfiles.
427   */
428  public void checkTables(Collection<Path> tables) throws IOException {
429    for (Path t : tables) {
430      checkTableDir(t);
431    }
432  }
433
434  /** Returns the set of check failure file paths after checkTables is called. */
435  public Collection<Path> getFailures() {
436    return new HashSet<>(failures);
437  }
438
439  /** Returns the set of corrupted file paths after checkTables is called. */
440  public Collection<Path> getCorrupted() {
441    return new HashSet<>(corrupted);
442  }
443
444  /** Returns number of hfiles checked in the last HfileCorruptionChecker run */
445  public int getHFilesChecked() {
446    return hfilesChecked.get();
447  }
448
449  /** Returns the set of successfully quarantined paths after checkTables is called. */
450  public Collection<Path> getQuarantined() {
451    return new HashSet<>(quarantined);
452  }
453
454  /**
455   * @return the set of paths that were missing. Likely due to deletion/moves from compaction or
456   *         flushes.
457   */
458  public Collection<Path> getMissing() {
459    return new HashSet<>(missing);
460  }
461
462  /** Returns the set of check failure mob file paths after checkTables is called. */
463  public Collection<Path> getFailureMobFiles() {
464    return new HashSet<>(failureMobFiles);
465  }
466
467  /** Returns the set of corrupted mob file paths after checkTables is called. */
468  public Collection<Path> getCorruptedMobFiles() {
469    return new HashSet<>(corruptedMobFiles);
470  }
471
472  /** Returns number of mob files checked in the last HfileCorruptionChecker run */
473  public int getMobFilesChecked() {
474    return mobFilesChecked.get();
475  }
476
477  /** Returns the set of successfully quarantined paths after checkTables is called. */
478  public Collection<Path> getQuarantinedMobFiles() {
479    return new HashSet<>(quarantinedMobFiles);
480  }
481
482  /**
483   * @return the set of paths that were missing. Likely due to table deletion or deletion/moves from
484   *         compaction.
485   */
486  public Collection<Path> getMissedMobFiles() {
487    return new HashSet<>(missedMobFiles);
488  }
489
490  /**
491   * Print a human readable summary of hfile quarantining operations. n
492   */
493  public void report(HbckErrorReporter out) {
494    out.print("Checked " + hfilesChecked.get() + " hfile for corruption");
495    out.print("  HFiles corrupted:                  " + corrupted.size());
496    if (inQuarantineMode) {
497      out.print("    HFiles successfully quarantined: " + quarantined.size());
498      for (Path sq : quarantined) {
499        out.print("      " + sq);
500      }
501      out.print("    HFiles failed quarantine:        " + failures.size());
502      for (Path fq : failures) {
503        out.print("      " + fq);
504      }
505    }
506    out.print("    HFiles moved while checking:     " + missing.size());
507    for (Path mq : missing) {
508      out.print("      " + mq);
509    }
510
511    String initialState = (corrupted.isEmpty()) ? "OK" : "CORRUPTED";
512    String fixedState = (corrupted.size() == quarantined.size()) ? "OK" : "CORRUPTED";
513
514    // print mob-related report
515    out.print("Checked " + mobFilesChecked.get() + " Mob files for corruption");
516    out.print("  Mob files corrupted:                  " + corruptedMobFiles.size());
517    if (inQuarantineMode) {
518      out.print("    Mob files successfully quarantined: " + quarantinedMobFiles.size());
519      for (Path sq : quarantinedMobFiles) {
520        out.print("      " + sq);
521      }
522      out.print("    Mob files failed quarantine:        " + failureMobFiles.size());
523      for (Path fq : failureMobFiles) {
524        out.print("      " + fq);
525      }
526    }
527    out.print("    Mob files moved while checking:     " + missedMobFiles.size());
528    for (Path mq : missedMobFiles) {
529      out.print("      " + mq);
530    }
531    String initialMobState = (corruptedMobFiles.isEmpty()) ? "OK" : "CORRUPTED";
532    String fixedMobState =
533      (corruptedMobFiles.size() == quarantinedMobFiles.size()) ? "OK" : "CORRUPTED";
534
535    if (inQuarantineMode) {
536      out.print("Summary: " + initialState + " => " + fixedState);
537      out.print("Mob summary: " + initialMobState + " => " + fixedMobState);
538    } else {
539      out.print("Summary: " + initialState);
540      out.print("Mob summary: " + initialMobState);
541    }
542  }
543}