001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.util;
019
020import static org.junit.jupiter.api.Assertions.assertEquals;
021import static org.junit.jupiter.api.Assertions.fail;
022
023import java.io.IOException;
024import java.util.ArrayList;
025import java.util.Collection;
026import java.util.EnumSet;
027import java.util.HashMap;
028import java.util.List;
029import java.util.Map;
030import java.util.Optional;
031import java.util.concurrent.CountDownLatch;
032import java.util.concurrent.ExecutorService;
033import java.util.concurrent.ScheduledThreadPoolExecutor;
034import org.apache.hadoop.conf.Configuration;
035import org.apache.hadoop.fs.FileStatus;
036import org.apache.hadoop.fs.FileSystem;
037import org.apache.hadoop.fs.Path;
038import org.apache.hadoop.hbase.ClusterMetrics;
039import org.apache.hadoop.hbase.ClusterMetrics.Option;
040import org.apache.hadoop.hbase.HBaseTestingUtil;
041import org.apache.hadoop.hbase.HConstants;
042import org.apache.hadoop.hbase.HRegionLocation;
043import org.apache.hadoop.hbase.MetaTableAccessor;
044import org.apache.hadoop.hbase.ServerName;
045import org.apache.hadoop.hbase.TableName;
046import org.apache.hadoop.hbase.client.Admin;
047import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
048import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
049import org.apache.hadoop.hbase.client.Connection;
050import org.apache.hadoop.hbase.client.ConnectionFactory;
051import org.apache.hadoop.hbase.client.Delete;
052import org.apache.hadoop.hbase.client.Put;
053import org.apache.hadoop.hbase.client.RegionInfo;
054import org.apache.hadoop.hbase.client.RegionLocator;
055import org.apache.hadoop.hbase.client.Scan;
056import org.apache.hadoop.hbase.client.Table;
057import org.apache.hadoop.hbase.client.TableDescriptor;
058import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
059import org.apache.hadoop.hbase.coprocessor.MasterCoprocessor;
060import org.apache.hadoop.hbase.coprocessor.MasterCoprocessorEnvironment;
061import org.apache.hadoop.hbase.coprocessor.MasterObserver;
062import org.apache.hadoop.hbase.coprocessor.ObserverContext;
063import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
064import org.apache.hadoop.hbase.master.assignment.RegionStates;
065import org.apache.hadoop.hbase.mob.MobFileName;
066import org.apache.hadoop.hbase.mob.MobUtils;
067import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
068import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
069import org.slf4j.Logger;
070import org.slf4j.LoggerFactory;
071
072/**
073 * This is the base class for HBaseFsck's ability to detect reasons for inconsistent tables. Actual
074 * tests are in : TestHBaseFsckTwoRS TestHBaseFsckOneRS TestHBaseFsckMOB TestHBaseFsckReplicas
075 */
076public class BaseTestHBaseFsck {
077  static final int POOL_SIZE = 7;
078  protected static final Logger LOG = LoggerFactory.getLogger(BaseTestHBaseFsck.class);
079  protected final static HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
080  protected final static Configuration conf = TEST_UTIL.getConfiguration();
081  protected final static String FAM_STR = "fam";
082  protected final static byte[] FAM = Bytes.toBytes(FAM_STR);
083  protected final static int REGION_ONLINE_TIMEOUT = 800;
084  protected static AssignmentManager assignmentManager;
085  protected static RegionStates regionStates;
086  protected static ExecutorService tableExecutorService;
087  protected static ScheduledThreadPoolExecutor hbfsckExecutorService;
088  protected static Connection connection;
089  protected static Admin admin;
090
091  // for the instance, reset every test run
092  protected Table tbl;
093  protected final static byte[][] SPLITS =
094    new byte[][] { Bytes.toBytes("A"), Bytes.toBytes("B"), Bytes.toBytes("C") };
095  // one row per region.
096  protected final static byte[][] ROWKEYS = new byte[][] { Bytes.toBytes("00"), Bytes.toBytes("50"),
097    Bytes.toBytes("A0"), Bytes.toBytes("A5"), Bytes.toBytes("B0"), Bytes.toBytes("B5"),
098    Bytes.toBytes("C0"), Bytes.toBytes("C5") };
099
100  /**
101   * Debugging method to dump the contents of meta.
102   */
103  protected void dumpMeta(TableName tableName) throws IOException {
104    List<RegionInfo> regions =
105      MetaTableAccessor.getTableRegions(TEST_UTIL.getConnection(), tableName);
106    for (RegionInfo region : regions) {
107      LOG.info(region.getRegionNameAsString());
108    }
109  }
110
111  /**
112   * This method is used to undeploy a region -- close it and attempt to remove its state from the
113   * Master.
114   */
115  protected void undeployRegion(Connection conn, ServerName sn, RegionInfo hri)
116    throws IOException, InterruptedException {
117    try {
118      HBaseFsckRepair.closeRegionSilentlyAndWait(conn, sn, hri);
119      if (!hri.isMetaRegion()) {
120        admin.offline(hri.getRegionName());
121      }
122    } catch (IOException ioe) {
123      LOG.warn(
124        "Got exception when attempting to offline region " + Bytes.toString(hri.getRegionName()),
125        ioe);
126    }
127  }
128
129  /**
130   * Delete a region from assignments, meta, or completely from hdfs.
131   * @param unassign if true unassign region if assigned
132   * @param metaRow  if true remove region's row from META
133   * @param hdfs     if true remove region's dir in HDFS
134   */
135  protected void deleteRegion(Configuration conf, final TableDescriptor htd, byte[] startKey,
136    byte[] endKey, boolean unassign, boolean metaRow, boolean hdfs)
137    throws IOException, InterruptedException {
138    deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, false,
139      RegionInfo.DEFAULT_REPLICA_ID);
140  }
141
142  /**
143   * Delete a region from assignments, meta, or completely from hdfs.
144   * @param unassign       if true unassign region if assigned
145   * @param metaRow        if true remove region's row from META
146   * @param hdfs           if true remove region's dir in HDFS
147   * @param regionInfoOnly if true remove a region dir's .regioninfo file
148   * @param replicaId      replica id
149   */
150  protected void deleteRegion(Configuration conf, final TableDescriptor htd, byte[] startKey,
151    byte[] endKey, boolean unassign, boolean metaRow, boolean hdfs, boolean regionInfoOnly,
152    int replicaId) throws IOException, InterruptedException {
153    LOG.info("** Before delete:");
154    dumpMeta(htd.getTableName());
155
156    List<HRegionLocation> locations;
157    try (RegionLocator rl = connection.getRegionLocator(tbl.getName())) {
158      locations = rl.getAllRegionLocations();
159    }
160
161    for (HRegionLocation location : locations) {
162      RegionInfo hri = location.getRegion();
163      ServerName hsa = location.getServerName();
164      if (
165        Bytes.compareTo(hri.getStartKey(), startKey) == 0
166          && Bytes.compareTo(hri.getEndKey(), endKey) == 0 && hri.getReplicaId() == replicaId
167      ) {
168
169        LOG.info("RegionName: " + hri.getRegionNameAsString());
170        byte[] deleteRow = hri.getRegionName();
171
172        if (unassign) {
173          LOG.info("Undeploying region " + hri + " from server " + hsa);
174          undeployRegion(connection, hsa, hri);
175        }
176
177        if (regionInfoOnly) {
178          LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
179          Path rootDir = CommonFSUtils.getRootDir(conf);
180          FileSystem fs = rootDir.getFileSystem(conf);
181          Path p =
182            new Path(CommonFSUtils.getTableDir(rootDir, htd.getTableName()), hri.getEncodedName());
183          Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
184          fs.delete(hriPath, true);
185        }
186
187        if (hdfs) {
188          LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
189          Path rootDir = CommonFSUtils.getRootDir(conf);
190          FileSystem fs = rootDir.getFileSystem(conf);
191          Path p =
192            new Path(CommonFSUtils.getTableDir(rootDir, htd.getTableName()), hri.getEncodedName());
193          HBaseFsck.debugLsr(conf, p);
194          boolean success = fs.delete(p, true);
195          LOG.info("Deleted " + p + " sucessfully? " + success);
196          HBaseFsck.debugLsr(conf, p);
197        }
198
199        if (metaRow) {
200          try (Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService)) {
201            Delete delete = new Delete(deleteRow);
202            meta.delete(delete);
203          }
204        }
205      }
206      LOG.info(hri.toString() + hsa.toString());
207    }
208
209    LOG.info("*** After delete:");
210    dumpMeta(htd.getTableName());
211  }
212
213  /**
214   * Setup a clean table before we start mucking with it. It will set tbl which needs to be closed
215   * after test
216   */
217  void setupTable(TableName tablename) throws Exception {
218    setupTableWithRegionReplica(tablename, 1);
219  }
220
221  /**
222   * Setup a clean table with a certain region_replica count It will set tbl which needs to be
223   * closed after test
224   */
225  void setupTableWithRegionReplica(TableName tablename, int replicaCount) throws Exception {
226    TableDescriptorBuilder tableDescriptorBuilder = TableDescriptorBuilder.newBuilder(tablename);
227    ColumnFamilyDescriptor columnFamilyDescriptor =
228      ColumnFamilyDescriptorBuilder.newBuilder(FAM).build();
229    tableDescriptorBuilder.setRegionReplication(replicaCount);
230    // If a table has no CF's it doesn't get checked
231    tableDescriptorBuilder.setColumnFamily(columnFamilyDescriptor);
232    createTable(TEST_UTIL, tableDescriptorBuilder.build(), SPLITS);
233
234    tbl = connection.getTable(tablename, tableExecutorService);
235    List<Put> puts = new ArrayList<>(ROWKEYS.length);
236    for (byte[] row : ROWKEYS) {
237      Put p = new Put(row);
238      p.addColumn(FAM, Bytes.toBytes("val"), row);
239      puts.add(p);
240    }
241    tbl.put(puts);
242  }
243
244  /**
245   * Setup a clean table with a mob-enabled column.
246   * @param tablename The name of a table to be created.
247   */
248  void setupMobTable(TableName tablename) throws Exception {
249    TableDescriptorBuilder tableDescriptorBuilder = TableDescriptorBuilder.newBuilder(tablename);
250    ColumnFamilyDescriptor columnFamilyDescriptor =
251      ColumnFamilyDescriptorBuilder.newBuilder(FAM).setMobEnabled(true).setMobThreshold(0).build();
252    // If a table has no CF's it doesn't get checked
253    tableDescriptorBuilder.setColumnFamily(columnFamilyDescriptor);
254    createTable(TEST_UTIL, tableDescriptorBuilder.build(), SPLITS);
255
256    tbl = connection.getTable(tablename, tableExecutorService);
257    List<Put> puts = new ArrayList<>(ROWKEYS.length);
258    for (byte[] row : ROWKEYS) {
259      Put p = new Put(row);
260      p.addColumn(FAM, Bytes.toBytes("val"), row);
261      puts.add(p);
262    }
263    tbl.put(puts);
264  }
265
266  /**
267   * Counts the number of rows to verify data loss or non-dataloss.
268   */
269  int countRows() throws IOException {
270    return TEST_UTIL.countRows(tbl);
271  }
272
273  /**
274   * Counts the number of rows to verify data loss or non-dataloss.
275   */
276  int countRows(byte[] start, byte[] end) throws IOException {
277    return TEST_UTIL.countRows(tbl, new Scan().withStartRow(start).withStopRow(end));
278  }
279
280  /**
281   * delete table in preparation for next test
282   */
283  void cleanupTable(TableName tablename) throws Exception {
284    if (tbl != null) {
285      tbl.close();
286      tbl = null;
287    }
288    connection.clearRegionLocationCache();
289    deleteTable(TEST_UTIL, tablename);
290  }
291
292  /**
293   * Get region info from local cluster.
294   */
295  Map<ServerName, List<String>> getDeployedHRIs(final Admin admin) throws IOException {
296    ClusterMetrics status = admin.getClusterMetrics(EnumSet.of(Option.LIVE_SERVERS));
297    Collection<ServerName> regionServers = status.getLiveServerMetrics().keySet();
298    Map<ServerName, List<String>> mm = new HashMap<>();
299    for (ServerName hsi : regionServers) {
300      // list all online regions from this region server
301      List<RegionInfo> regions = admin.getRegions(hsi);
302      List<String> regionNames = new ArrayList<>(regions.size());
303      for (RegionInfo hri : regions) {
304        regionNames.add(hri.getRegionNameAsString());
305      }
306      mm.put(hsi, regionNames);
307    }
308    return mm;
309  }
310
311  /**
312   * Returns the HSI a region info is on.
313   */
314  ServerName findDeployedHSI(Map<ServerName, List<String>> mm, RegionInfo hri) {
315    for (Map.Entry<ServerName, List<String>> e : mm.entrySet()) {
316      if (e.getValue().contains(hri.getRegionNameAsString())) {
317        return e.getKey();
318      }
319    }
320    return null;
321  }
322
323  public void deleteTableDir(TableName table) throws IOException {
324    Path rootDir = CommonFSUtils.getRootDir(conf);
325    FileSystem fs = rootDir.getFileSystem(conf);
326    Path p = CommonFSUtils.getTableDir(rootDir, table);
327    HBaseFsck.debugLsr(conf, p);
328    boolean success = fs.delete(p, true);
329    LOG.info("Deleted " + p + " sucessfully? " + success);
330  }
331
332  /**
333   * We don't have an easy way to verify that a flush completed, so we loop until we find a
334   * legitimate hfile and return it.
335   * @return Path of a flushed hfile.
336   */
337  Path getFlushedHFile(FileSystem fs, TableName table) throws IOException {
338    Path tableDir = CommonFSUtils.getTableDir(CommonFSUtils.getRootDir(conf), table);
339    Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
340    Path famDir = new Path(regionDir, FAM_STR);
341
342    // keep doing this until we get a legit hfile
343    while (true) {
344      FileStatus[] hfFss = fs.listStatus(famDir);
345      if (hfFss.length == 0) {
346        continue;
347      }
348      for (FileStatus hfs : hfFss) {
349        if (!hfs.isDirectory()) {
350          return hfs.getPath();
351        }
352      }
353    }
354  }
355
356  /**
357   * Gets flushed mob files.
358   * @param fs    The current file system.
359   * @param table The current table name.
360   * @return Path of a flushed hfile.
361   */
362  Path getFlushedMobFile(FileSystem fs, TableName table) throws IOException {
363    Path famDir = MobUtils.getMobFamilyPath(conf, table, FAM_STR);
364
365    // keep doing this until we get a legit hfile
366    while (true) {
367      FileStatus[] hfFss = fs.listStatus(famDir);
368      if (hfFss.length == 0) {
369        continue;
370      }
371      for (FileStatus hfs : hfFss) {
372        if (!hfs.isDirectory()) {
373          return hfs.getPath();
374        }
375      }
376    }
377  }
378
379  /**
380   * Creates a new mob file name by the old one.
381   * @param oldFileName The old mob file name.
382   * @return The new mob file name.
383   */
384  String createMobFileName(String oldFileName) {
385    MobFileName mobFileName = MobFileName.create(oldFileName);
386    String startKey = mobFileName.getStartKey();
387    String date = mobFileName.getDate();
388    return MobFileName
389      .create(startKey, date, TEST_UTIL.getRandomUUID().toString().replaceAll("-", ""), "abcdef")
390      .getFileName();
391  }
392
393  /**
394   * Test that use this should have a timeout, because this method could potentially wait forever.
395   */
396  protected void doQuarantineTest(TableName table, HBaseFsck hbck, int check, int corrupt, int fail,
397    int quar, int missing) throws Exception {
398    try {
399      setupTable(table);
400      assertEquals(ROWKEYS.length, countRows());
401      admin.flush(table); // flush is async.
402
403      // Mess it up by leaving a hole in the assignment, meta, and hdfs data
404      admin.disableTable(table);
405
406      String[] args = { "-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission",
407        table.getNameAsString() };
408      HBaseFsck res = hbck.exec(hbfsckExecutorService, args);
409
410      HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
411      assertEquals(hfcc.getHFilesChecked(), check);
412      assertEquals(hfcc.getCorrupted().size(), corrupt);
413      assertEquals(hfcc.getFailures().size(), fail);
414      assertEquals(hfcc.getQuarantined().size(), quar);
415      assertEquals(hfcc.getMissing().size(), missing);
416
417      // its been fixed, verify that we can enable
418      admin.enableTableAsync(table);
419      while (!admin.isTableEnabled(table)) {
420        try {
421          Thread.sleep(250);
422        } catch (InterruptedException e) {
423          e.printStackTrace();
424          fail("Interrupted when trying to enable table " + table);
425        }
426      }
427    } finally {
428      cleanupTable(table);
429    }
430  }
431
  /**
   * HbckErrorReporter stub that counts every callback invocation in the static
   * {@link #calledCount} field so tests can verify the reporter was exercised. Note the counter
   * is static and never reset here, so tests must account for accumulated counts.
   */
  static class MockErrorReporter implements HbckErrorReporter {
    // Bumped by every method below; summarize() also returns the incremented value.
    static int calledCount = 0;

    @Override
    public void clear() {
      calledCount++;
    }

    @Override
    public void report(String message) {
      calledCount++;
    }

    @Override
    public void reportError(String message) {
      calledCount++;
    }

    @Override
    public void reportError(ERROR_CODE errorCode, String message) {
      calledCount++;
    }

    @Override
    public void reportError(ERROR_CODE errorCode, String message, HbckTableInfo table) {
      calledCount++;
    }

    @Override
    public void reportError(ERROR_CODE errorCode, String message, HbckTableInfo table,
      HbckRegionInfo info) {
      calledCount++;
    }

    @Override
    public void reportError(ERROR_CODE errorCode, String message, HbckTableInfo table,
      HbckRegionInfo info1, HbckRegionInfo info2) {
      calledCount++;
    }

    @Override
    public int summarize() {
      return ++calledCount;
    }

    @Override
    public void detail(String details) {
      calledCount++;
    }

    @Override
    public ArrayList<ERROR_CODE> getErrorList() {
      // Always reports an empty error list, regardless of what was "reported" above.
      calledCount++;
      return new ArrayList<>();
    }

    @Override
    public void progress() {
      calledCount++;
    }

    @Override
    public void print(String message) {
      calledCount++;
    }

    @Override
    public void resetErrors() {
      calledCount++;
    }

    @Override
    public boolean tableHasErrors(HbckTableInfo table) {
      calledCount++;
      return false;
    }
  }
509
510  protected void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs,
511    boolean regionInfoOnly) throws IOException, InterruptedException {
512    HRegionLocation metaLocation = connection.getRegionLocator(TableName.META_TABLE_NAME)
513      .getRegionLocation(HConstants.EMPTY_START_ROW);
514    ServerName hsa = metaLocation.getServerName();
515    RegionInfo hri = metaLocation.getRegion();
516    if (unassign) {
517      LOG.info("Undeploying meta region " + hri + " from server " + hsa);
518      try (Connection unmanagedConnection = ConnectionFactory.createConnection(conf)) {
519        undeployRegion(unmanagedConnection, hsa, hri);
520      }
521    }
522
523    if (regionInfoOnly) {
524      LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
525      Path rootDir = CommonFSUtils.getRootDir(conf);
526      FileSystem fs = rootDir.getFileSystem(conf);
527      Path p =
528        new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(), hri.getEncodedName());
529      Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
530      fs.delete(hriPath, true);
531    }
532
533    if (hdfs) {
534      LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
535      Path rootDir = CommonFSUtils.getRootDir(conf);
536      FileSystem fs = rootDir.getFileSystem(conf);
537      Path p =
538        new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(), hri.getEncodedName());
539      HBaseFsck.debugLsr(conf, p);
540      boolean success = fs.delete(p, true);
541      LOG.info("Deleted " + p + " sucessfully? " + success);
542      HBaseFsck.debugLsr(conf, p);
543    }
544  }
545
546  protected String testMethodName;
547
  /**
   * Master coprocessor used to synchronize tests with async table create/delete: a test installs
   * a CountDownLatch on the relevant field before issuing the admin call, then awaits it; the
   * post-op hooks below count the latch down once the action completes on the master.
   */
  public static class MasterSyncCoprocessor implements MasterCoprocessor, MasterObserver {
    // Set by tests before the admin call; counted down by the post-op hooks. Volatile because
    // the master invokes the hooks on a different thread than the test that sets the latch.
    volatile CountDownLatch tableCreationLatch = null;
    volatile CountDownLatch tableDeletionLatch = null;

    @Override
    public Optional<MasterObserver> getMasterObserver() {
      return Optional.of(this);
    }

    @Override
    public void postCompletedCreateTableAction(
      final ObserverContext<MasterCoprocessorEnvironment> ctx, final TableDescriptor desc,
      final RegionInfo[] regions) throws IOException {
      // the AccessController test, some times calls only and directly the
      // postCompletedCreateTableAction(), so the latch may not have been installed
      if (tableCreationLatch != null) {
        tableCreationLatch.countDown();
      }
    }

    @Override
    public void postCompletedDeleteTableAction(
      final ObserverContext<MasterCoprocessorEnvironment> ctx, final TableName tableName)
      throws IOException {
      // the AccessController test, some times calls only and directly the
      // postCompletedDeleteTableAction(), so the latch may not have been installed
      if (tableDeletionLatch != null) {
        tableDeletionLatch.countDown();
      }
    }
  }
579
580  public static void createTable(HBaseTestingUtil testUtil, TableDescriptor tableDescriptor,
581    byte[][] splitKeys) throws Exception {
582    // NOTE: We need a latch because admin is not sync,
583    // so the postOp coprocessor method may be called after the admin operation returned.
584    MasterSyncCoprocessor coproc = testUtil.getHBaseCluster().getMaster().getMasterCoprocessorHost()
585      .findCoprocessor(MasterSyncCoprocessor.class);
586    coproc.tableCreationLatch = new CountDownLatch(1);
587    if (splitKeys != null) {
588      admin.createTable(tableDescriptor, splitKeys);
589    } else {
590      admin.createTable(tableDescriptor);
591    }
592    coproc.tableCreationLatch.await();
593    coproc.tableCreationLatch = null;
594    testUtil.waitUntilAllRegionsAssigned(tableDescriptor.getTableName());
595  }
596
597  public static void deleteTable(HBaseTestingUtil testUtil, TableName tableName) throws Exception {
598    // NOTE: We need a latch because admin is not sync,
599    // so the postOp coprocessor method may be called after the admin operation returned.
600    MasterSyncCoprocessor coproc = testUtil.getHBaseCluster().getMaster().getMasterCoprocessorHost()
601      .findCoprocessor(MasterSyncCoprocessor.class);
602    coproc.tableDeletionLatch = new CountDownLatch(1);
603    try {
604      admin.disableTable(tableName);
605    } catch (Exception e) {
606      LOG.debug("Table: " + tableName + " already disabled, so just deleting it.");
607    }
608    admin.deleteTable(tableName);
609    coproc.tableDeletionLatch.await();
610    coproc.tableDeletionLatch = null;
611  }
612}