001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.util;
019
020import static org.junit.Assert.assertEquals;
021import static org.junit.Assert.fail;
022
023import java.io.IOException;
024import java.util.ArrayList;
025import java.util.Collection;
026import java.util.EnumSet;
027import java.util.HashMap;
028import java.util.List;
029import java.util.Map;
030import java.util.Optional;
031import java.util.concurrent.CountDownLatch;
032import java.util.concurrent.ExecutorService;
033import java.util.concurrent.ScheduledThreadPoolExecutor;
034import org.apache.hadoop.conf.Configuration;
035import org.apache.hadoop.fs.FileStatus;
036import org.apache.hadoop.fs.FileSystem;
037import org.apache.hadoop.fs.Path;
038import org.apache.hadoop.hbase.ClusterMetrics;
039import org.apache.hadoop.hbase.ClusterMetrics.Option;
040import org.apache.hadoop.hbase.HBaseTestingUtil;
041import org.apache.hadoop.hbase.HConstants;
042import org.apache.hadoop.hbase.HRegionLocation;
043import org.apache.hadoop.hbase.MetaTableAccessor;
044import org.apache.hadoop.hbase.ServerName;
045import org.apache.hadoop.hbase.TableName;
046import org.apache.hadoop.hbase.client.Admin;
047import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
048import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
049import org.apache.hadoop.hbase.client.Connection;
050import org.apache.hadoop.hbase.client.ConnectionFactory;
051import org.apache.hadoop.hbase.client.Delete;
052import org.apache.hadoop.hbase.client.Put;
053import org.apache.hadoop.hbase.client.RegionInfo;
054import org.apache.hadoop.hbase.client.RegionLocator;
055import org.apache.hadoop.hbase.client.Scan;
056import org.apache.hadoop.hbase.client.Table;
057import org.apache.hadoop.hbase.client.TableDescriptor;
058import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
059import org.apache.hadoop.hbase.coprocessor.MasterCoprocessor;
060import org.apache.hadoop.hbase.coprocessor.MasterCoprocessorEnvironment;
061import org.apache.hadoop.hbase.coprocessor.MasterObserver;
062import org.apache.hadoop.hbase.coprocessor.ObserverContext;
063import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
064import org.apache.hadoop.hbase.master.assignment.RegionStates;
065import org.apache.hadoop.hbase.mob.MobFileName;
066import org.apache.hadoop.hbase.mob.MobUtils;
067import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
068import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
069import org.junit.rules.TestName;
070import org.slf4j.Logger;
071import org.slf4j.LoggerFactory;
072
/**
 * This is the base class for tests of HBaseFsck's ability to detect reasons for inconsistent
 * tables. The actual tests are in: TestHBaseFsckTwoRS, TestHBaseFsckOneRS, TestHBaseFsckMOB and
 * TestHBaseFsckReplicas.
 */
077public class BaseTestHBaseFsck {
078  static final int POOL_SIZE = 7;
079  protected static final Logger LOG = LoggerFactory.getLogger(BaseTestHBaseFsck.class);
080  protected final static HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
081  protected final static Configuration conf = TEST_UTIL.getConfiguration();
082  protected final static String FAM_STR = "fam";
083  protected final static byte[] FAM = Bytes.toBytes(FAM_STR);
084  protected final static int REGION_ONLINE_TIMEOUT = 800;
085  protected static AssignmentManager assignmentManager;
086  protected static RegionStates regionStates;
087  protected static ExecutorService tableExecutorService;
088  protected static ScheduledThreadPoolExecutor hbfsckExecutorService;
089  protected static Connection connection;
090  protected static Admin admin;
091
092  // for the instance, reset every test run
093  protected Table tbl;
094  protected final static byte[][] SPLITS =
095    new byte[][] { Bytes.toBytes("A"), Bytes.toBytes("B"), Bytes.toBytes("C") };
096  // one row per region.
097  protected final static byte[][] ROWKEYS = new byte[][] { Bytes.toBytes("00"), Bytes.toBytes("50"),
098    Bytes.toBytes("A0"), Bytes.toBytes("A5"), Bytes.toBytes("B0"), Bytes.toBytes("B5"),
099    Bytes.toBytes("C0"), Bytes.toBytes("C5") };
100
101  /**
102   * Debugging method to dump the contents of meta.
103   */
104  protected void dumpMeta(TableName tableName) throws IOException {
105    List<RegionInfo> regions =
106      MetaTableAccessor.getTableRegions(TEST_UTIL.getConnection(), tableName);
107    for (RegionInfo region : regions) {
108      LOG.info(region.getRegionNameAsString());
109    }
110  }
111
112  /**
113   * This method is used to undeploy a region -- close it and attempt to remove its state from the
114   * Master.
115   */
116  protected void undeployRegion(Connection conn, ServerName sn, RegionInfo hri)
117    throws IOException, InterruptedException {
118    try {
119      HBaseFsckRepair.closeRegionSilentlyAndWait(conn, sn, hri);
120      if (!hri.isMetaRegion()) {
121        admin.offline(hri.getRegionName());
122      }
123    } catch (IOException ioe) {
124      LOG.warn(
125        "Got exception when attempting to offline region " + Bytes.toString(hri.getRegionName()),
126        ioe);
127    }
128  }
129
130  /**
131   * Delete a region from assignments, meta, or completely from hdfs.
132   * @param unassign if true unassign region if assigned
133   * @param metaRow  if true remove region's row from META
134   * @param hdfs     if true remove region's dir in HDFS
135   */
136  protected void deleteRegion(Configuration conf, final TableDescriptor htd, byte[] startKey,
137    byte[] endKey, boolean unassign, boolean metaRow, boolean hdfs)
138    throws IOException, InterruptedException {
139    deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, false,
140      RegionInfo.DEFAULT_REPLICA_ID);
141  }
142
143  /**
144   * Delete a region from assignments, meta, or completely from hdfs.
145   * @param unassign       if true unassign region if assigned
146   * @param metaRow        if true remove region's row from META
147   * @param hdfs           if true remove region's dir in HDFS
148   * @param regionInfoOnly if true remove a region dir's .regioninfo file
149   * @param replicaId      replica id
150   */
151  protected void deleteRegion(Configuration conf, final TableDescriptor htd, byte[] startKey,
152    byte[] endKey, boolean unassign, boolean metaRow, boolean hdfs, boolean regionInfoOnly,
153    int replicaId) throws IOException, InterruptedException {
154    LOG.info("** Before delete:");
155    dumpMeta(htd.getTableName());
156
157    List<HRegionLocation> locations;
158    try (RegionLocator rl = connection.getRegionLocator(tbl.getName())) {
159      locations = rl.getAllRegionLocations();
160    }
161
162    for (HRegionLocation location : locations) {
163      RegionInfo hri = location.getRegion();
164      ServerName hsa = location.getServerName();
165      if (
166        Bytes.compareTo(hri.getStartKey(), startKey) == 0
167          && Bytes.compareTo(hri.getEndKey(), endKey) == 0 && hri.getReplicaId() == replicaId
168      ) {
169
170        LOG.info("RegionName: " + hri.getRegionNameAsString());
171        byte[] deleteRow = hri.getRegionName();
172
173        if (unassign) {
174          LOG.info("Undeploying region " + hri + " from server " + hsa);
175          undeployRegion(connection, hsa, hri);
176        }
177
178        if (regionInfoOnly) {
179          LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
180          Path rootDir = CommonFSUtils.getRootDir(conf);
181          FileSystem fs = rootDir.getFileSystem(conf);
182          Path p =
183            new Path(CommonFSUtils.getTableDir(rootDir, htd.getTableName()), hri.getEncodedName());
184          Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
185          fs.delete(hriPath, true);
186        }
187
188        if (hdfs) {
189          LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
190          Path rootDir = CommonFSUtils.getRootDir(conf);
191          FileSystem fs = rootDir.getFileSystem(conf);
192          Path p =
193            new Path(CommonFSUtils.getTableDir(rootDir, htd.getTableName()), hri.getEncodedName());
194          HBaseFsck.debugLsr(conf, p);
195          boolean success = fs.delete(p, true);
196          LOG.info("Deleted " + p + " sucessfully? " + success);
197          HBaseFsck.debugLsr(conf, p);
198        }
199
200        if (metaRow) {
201          try (Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService)) {
202            Delete delete = new Delete(deleteRow);
203            meta.delete(delete);
204          }
205        }
206      }
207      LOG.info(hri.toString() + hsa.toString());
208    }
209
210    LOG.info("*** After delete:");
211    dumpMeta(htd.getTableName());
212  }
213
214  /**
215   * Setup a clean table before we start mucking with it. It will set tbl which needs to be closed
216   * after test
217   */
218  void setupTable(TableName tablename) throws Exception {
219    setupTableWithRegionReplica(tablename, 1);
220  }
221
222  /**
223   * Setup a clean table with a certain region_replica count It will set tbl which needs to be
224   * closed after test
225   */
226  void setupTableWithRegionReplica(TableName tablename, int replicaCount) throws Exception {
227    TableDescriptorBuilder tableDescriptorBuilder = TableDescriptorBuilder.newBuilder(tablename);
228    ColumnFamilyDescriptor columnFamilyDescriptor =
229      ColumnFamilyDescriptorBuilder.newBuilder(FAM).build();
230    tableDescriptorBuilder.setRegionReplication(replicaCount);
231    // If a table has no CF's it doesn't get checked
232    tableDescriptorBuilder.setColumnFamily(columnFamilyDescriptor);
233    createTable(TEST_UTIL, tableDescriptorBuilder.build(), SPLITS);
234
235    tbl = connection.getTable(tablename, tableExecutorService);
236    List<Put> puts = new ArrayList<>(ROWKEYS.length);
237    for (byte[] row : ROWKEYS) {
238      Put p = new Put(row);
239      p.addColumn(FAM, Bytes.toBytes("val"), row);
240      puts.add(p);
241    }
242    tbl.put(puts);
243  }
244
245  /**
246   * Setup a clean table with a mob-enabled column.
247   * @param tablename The name of a table to be created.
248   */
249  void setupMobTable(TableName tablename) throws Exception {
250    TableDescriptorBuilder tableDescriptorBuilder = TableDescriptorBuilder.newBuilder(tablename);
251    ColumnFamilyDescriptor columnFamilyDescriptor =
252      ColumnFamilyDescriptorBuilder.newBuilder(FAM).setMobEnabled(true).setMobThreshold(0).build();
253    // If a table has no CF's it doesn't get checked
254    tableDescriptorBuilder.setColumnFamily(columnFamilyDescriptor);
255    createTable(TEST_UTIL, tableDescriptorBuilder.build(), SPLITS);
256
257    tbl = connection.getTable(tablename, tableExecutorService);
258    List<Put> puts = new ArrayList<>(ROWKEYS.length);
259    for (byte[] row : ROWKEYS) {
260      Put p = new Put(row);
261      p.addColumn(FAM, Bytes.toBytes("val"), row);
262      puts.add(p);
263    }
264    tbl.put(puts);
265  }
266
267  /**
268   * Counts the number of rows to verify data loss or non-dataloss.
269   */
270  int countRows() throws IOException {
271    return TEST_UTIL.countRows(tbl);
272  }
273
274  /**
275   * Counts the number of rows to verify data loss or non-dataloss.
276   */
277  int countRows(byte[] start, byte[] end) throws IOException {
278    return TEST_UTIL.countRows(tbl, new Scan().withStartRow(start).withStopRow(end));
279  }
280
281  /**
282   * delete table in preparation for next test
283   */
284  void cleanupTable(TableName tablename) throws Exception {
285    if (tbl != null) {
286      tbl.close();
287      tbl = null;
288    }
289    connection.clearRegionLocationCache();
290    deleteTable(TEST_UTIL, tablename);
291  }
292
293  /**
294   * Get region info from local cluster.
295   */
296  Map<ServerName, List<String>> getDeployedHRIs(final Admin admin) throws IOException {
297    ClusterMetrics status = admin.getClusterMetrics(EnumSet.of(Option.LIVE_SERVERS));
298    Collection<ServerName> regionServers = status.getLiveServerMetrics().keySet();
299    Map<ServerName, List<String>> mm = new HashMap<>();
300    for (ServerName hsi : regionServers) {
301      // list all online regions from this region server
302      List<RegionInfo> regions = admin.getRegions(hsi);
303      List<String> regionNames = new ArrayList<>(regions.size());
304      for (RegionInfo hri : regions) {
305        regionNames.add(hri.getRegionNameAsString());
306      }
307      mm.put(hsi, regionNames);
308    }
309    return mm;
310  }
311
312  /**
313   * Returns the HSI a region info is on.
314   */
315  ServerName findDeployedHSI(Map<ServerName, List<String>> mm, RegionInfo hri) {
316    for (Map.Entry<ServerName, List<String>> e : mm.entrySet()) {
317      if (e.getValue().contains(hri.getRegionNameAsString())) {
318        return e.getKey();
319      }
320    }
321    return null;
322  }
323
324  public void deleteTableDir(TableName table) throws IOException {
325    Path rootDir = CommonFSUtils.getRootDir(conf);
326    FileSystem fs = rootDir.getFileSystem(conf);
327    Path p = CommonFSUtils.getTableDir(rootDir, table);
328    HBaseFsck.debugLsr(conf, p);
329    boolean success = fs.delete(p, true);
330    LOG.info("Deleted " + p + " sucessfully? " + success);
331  }
332
333  /**
334   * We don't have an easy way to verify that a flush completed, so we loop until we find a
335   * legitimate hfile and return it.
336   * @return Path of a flushed hfile.
337   */
338  Path getFlushedHFile(FileSystem fs, TableName table) throws IOException {
339    Path tableDir = CommonFSUtils.getTableDir(CommonFSUtils.getRootDir(conf), table);
340    Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
341    Path famDir = new Path(regionDir, FAM_STR);
342
343    // keep doing this until we get a legit hfile
344    while (true) {
345      FileStatus[] hfFss = fs.listStatus(famDir);
346      if (hfFss.length == 0) {
347        continue;
348      }
349      for (FileStatus hfs : hfFss) {
350        if (!hfs.isDirectory()) {
351          return hfs.getPath();
352        }
353      }
354    }
355  }
356
357  /**
358   * Gets flushed mob files.
359   * @param fs    The current file system.
360   * @param table The current table name.
361   * @return Path of a flushed hfile.
362   */
363  Path getFlushedMobFile(FileSystem fs, TableName table) throws IOException {
364    Path famDir = MobUtils.getMobFamilyPath(conf, table, FAM_STR);
365
366    // keep doing this until we get a legit hfile
367    while (true) {
368      FileStatus[] hfFss = fs.listStatus(famDir);
369      if (hfFss.length == 0) {
370        continue;
371      }
372      for (FileStatus hfs : hfFss) {
373        if (!hfs.isDirectory()) {
374          return hfs.getPath();
375        }
376      }
377    }
378  }
379
380  /**
381   * Creates a new mob file name by the old one.
382   * @param oldFileName The old mob file name.
383   * @return The new mob file name.
384   */
385  String createMobFileName(String oldFileName) {
386    MobFileName mobFileName = MobFileName.create(oldFileName);
387    String startKey = mobFileName.getStartKey();
388    String date = mobFileName.getDate();
389    return MobFileName
390      .create(startKey, date, TEST_UTIL.getRandomUUID().toString().replaceAll("-", ""), "abcdef")
391      .getFileName();
392  }
393
394  /**
395   * Test that use this should have a timeout, because this method could potentially wait forever.
396   */
397  protected void doQuarantineTest(TableName table, HBaseFsck hbck, int check, int corrupt, int fail,
398    int quar, int missing) throws Exception {
399    try {
400      setupTable(table);
401      assertEquals(ROWKEYS.length, countRows());
402      admin.flush(table); // flush is async.
403
404      // Mess it up by leaving a hole in the assignment, meta, and hdfs data
405      admin.disableTable(table);
406
407      String[] args = { "-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission",
408        table.getNameAsString() };
409      HBaseFsck res = hbck.exec(hbfsckExecutorService, args);
410
411      HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
412      assertEquals(hfcc.getHFilesChecked(), check);
413      assertEquals(hfcc.getCorrupted().size(), corrupt);
414      assertEquals(hfcc.getFailures().size(), fail);
415      assertEquals(hfcc.getQuarantined().size(), quar);
416      assertEquals(hfcc.getMissing().size(), missing);
417
418      // its been fixed, verify that we can enable
419      admin.enableTableAsync(table);
420      while (!admin.isTableEnabled(table)) {
421        try {
422          Thread.sleep(250);
423        } catch (InterruptedException e) {
424          e.printStackTrace();
425          fail("Interrupted when trying to enable table " + table);
426        }
427      }
428    } finally {
429      cleanupTable(table);
430    }
431  }
432
433  static class MockErrorReporter implements HbckErrorReporter {
434    static int calledCount = 0;
435
436    @Override
437    public void clear() {
438      calledCount++;
439    }
440
441    @Override
442    public void report(String message) {
443      calledCount++;
444    }
445
446    @Override
447    public void reportError(String message) {
448      calledCount++;
449    }
450
451    @Override
452    public void reportError(ERROR_CODE errorCode, String message) {
453      calledCount++;
454    }
455
456    @Override
457    public void reportError(ERROR_CODE errorCode, String message, HbckTableInfo table) {
458      calledCount++;
459    }
460
461    @Override
462    public void reportError(ERROR_CODE errorCode, String message, HbckTableInfo table,
463      HbckRegionInfo info) {
464      calledCount++;
465    }
466
467    @Override
468    public void reportError(ERROR_CODE errorCode, String message, HbckTableInfo table,
469      HbckRegionInfo info1, HbckRegionInfo info2) {
470      calledCount++;
471    }
472
473    @Override
474    public int summarize() {
475      return ++calledCount;
476    }
477
478    @Override
479    public void detail(String details) {
480      calledCount++;
481    }
482
483    @Override
484    public ArrayList<ERROR_CODE> getErrorList() {
485      calledCount++;
486      return new ArrayList<>();
487    }
488
489    @Override
490    public void progress() {
491      calledCount++;
492    }
493
494    @Override
495    public void print(String message) {
496      calledCount++;
497    }
498
499    @Override
500    public void resetErrors() {
501      calledCount++;
502    }
503
504    @Override
505    public boolean tableHasErrors(HbckTableInfo table) {
506      calledCount++;
507      return false;
508    }
509  }
510
511  protected void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs,
512    boolean regionInfoOnly) throws IOException, InterruptedException {
513    HRegionLocation metaLocation = connection.getRegionLocator(TableName.META_TABLE_NAME)
514      .getRegionLocation(HConstants.EMPTY_START_ROW);
515    ServerName hsa = metaLocation.getServerName();
516    RegionInfo hri = metaLocation.getRegion();
517    if (unassign) {
518      LOG.info("Undeploying meta region " + hri + " from server " + hsa);
519      try (Connection unmanagedConnection = ConnectionFactory.createConnection(conf)) {
520        undeployRegion(unmanagedConnection, hsa, hri);
521      }
522    }
523
524    if (regionInfoOnly) {
525      LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
526      Path rootDir = CommonFSUtils.getRootDir(conf);
527      FileSystem fs = rootDir.getFileSystem(conf);
528      Path p =
529        new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(), hri.getEncodedName());
530      Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
531      fs.delete(hriPath, true);
532    }
533
534    if (hdfs) {
535      LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
536      Path rootDir = CommonFSUtils.getRootDir(conf);
537      FileSystem fs = rootDir.getFileSystem(conf);
538      Path p =
539        new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(), hri.getEncodedName());
540      HBaseFsck.debugLsr(conf, p);
541      boolean success = fs.delete(p, true);
542      LOG.info("Deleted " + p + " sucessfully? " + success);
543      HBaseFsck.debugLsr(conf, p);
544    }
545  }
546
547  @org.junit.Rule
548  public TestName name = new TestName();
549
550  public static class MasterSyncCoprocessor implements MasterCoprocessor, MasterObserver {
551    volatile CountDownLatch tableCreationLatch = null;
552    volatile CountDownLatch tableDeletionLatch = null;
553
554    @Override
555    public Optional<MasterObserver> getMasterObserver() {
556      return Optional.of(this);
557    }
558
559    @Override
560    public void postCompletedCreateTableAction(
561      final ObserverContext<MasterCoprocessorEnvironment> ctx, final TableDescriptor desc,
562      final RegionInfo[] regions) throws IOException {
563      // the AccessController test, some times calls only and directly the
564      // postCompletedCreateTableAction()
565      if (tableCreationLatch != null) {
566        tableCreationLatch.countDown();
567      }
568    }
569
570    @Override
571    public void postCompletedDeleteTableAction(
572      final ObserverContext<MasterCoprocessorEnvironment> ctx, final TableName tableName)
573      throws IOException {
574      // the AccessController test, some times calls only and directly the
575      // postCompletedDeleteTableAction()
576      if (tableDeletionLatch != null) {
577        tableDeletionLatch.countDown();
578      }
579    }
580  }
581
582  public static void createTable(HBaseTestingUtil testUtil, TableDescriptor tableDescriptor,
583    byte[][] splitKeys) throws Exception {
584    // NOTE: We need a latch because admin is not sync,
585    // so the postOp coprocessor method may be called after the admin operation returned.
586    MasterSyncCoprocessor coproc = testUtil.getHBaseCluster().getMaster().getMasterCoprocessorHost()
587      .findCoprocessor(MasterSyncCoprocessor.class);
588    coproc.tableCreationLatch = new CountDownLatch(1);
589    if (splitKeys != null) {
590      admin.createTable(tableDescriptor, splitKeys);
591    } else {
592      admin.createTable(tableDescriptor);
593    }
594    coproc.tableCreationLatch.await();
595    coproc.tableCreationLatch = null;
596    testUtil.waitUntilAllRegionsAssigned(tableDescriptor.getTableName());
597  }
598
599  public static void deleteTable(HBaseTestingUtil testUtil, TableName tableName) throws Exception {
600    // NOTE: We need a latch because admin is not sync,
601    // so the postOp coprocessor method may be called after the admin operation returned.
602    MasterSyncCoprocessor coproc = testUtil.getHBaseCluster().getMaster().getMasterCoprocessorHost()
603      .findCoprocessor(MasterSyncCoprocessor.class);
604    coproc.tableDeletionLatch = new CountDownLatch(1);
605    try {
606      admin.disableTable(tableName);
607    } catch (Exception e) {
608      LOG.debug("Table: " + tableName + " already disabled, so just deleting it.");
609    }
610    admin.deleteTable(tableName);
611    coproc.tableDeletionLatch.await();
612    coproc.tableDeletionLatch = null;
613  }
614}