001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.util;
019
020import static org.junit.Assert.assertEquals;
021import static org.junit.Assert.fail;
022
023import java.io.IOException;
024import java.util.ArrayList;
025import java.util.Collection;
026import java.util.EnumSet;
027import java.util.HashMap;
028import java.util.List;
029import java.util.Map;
030import java.util.Optional;
031import java.util.concurrent.CountDownLatch;
032import java.util.concurrent.ExecutorService;
033import java.util.concurrent.ScheduledThreadPoolExecutor;
034import org.apache.hadoop.conf.Configuration;
035import org.apache.hadoop.fs.FileStatus;
036import org.apache.hadoop.fs.FileSystem;
037import org.apache.hadoop.fs.Path;
038import org.apache.hadoop.hbase.ClusterMetrics;
039import org.apache.hadoop.hbase.ClusterMetrics.Option;
040import org.apache.hadoop.hbase.HBaseTestingUtility;
041import org.apache.hadoop.hbase.HColumnDescriptor;
042import org.apache.hadoop.hbase.HConstants;
043import org.apache.hadoop.hbase.HRegionLocation;
044import org.apache.hadoop.hbase.HTableDescriptor;
045import org.apache.hadoop.hbase.ServerName;
046import org.apache.hadoop.hbase.TableName;
047import org.apache.hadoop.hbase.client.Admin;
048import org.apache.hadoop.hbase.client.ClusterConnection;
049import org.apache.hadoop.hbase.client.Connection;
050import org.apache.hadoop.hbase.client.ConnectionFactory;
051import org.apache.hadoop.hbase.client.Delete;
052import org.apache.hadoop.hbase.client.Put;
053import org.apache.hadoop.hbase.client.RegionInfo;
054import org.apache.hadoop.hbase.client.RegionLocator;
055import org.apache.hadoop.hbase.client.Scan;
056import org.apache.hadoop.hbase.client.Table;
057import org.apache.hadoop.hbase.client.TableDescriptor;
058import org.apache.hadoop.hbase.coprocessor.MasterCoprocessor;
059import org.apache.hadoop.hbase.coprocessor.MasterCoprocessorEnvironment;
060import org.apache.hadoop.hbase.coprocessor.MasterObserver;
061import org.apache.hadoop.hbase.coprocessor.ObserverContext;
062import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
063import org.apache.hadoop.hbase.master.assignment.RegionStates;
064import org.apache.hadoop.hbase.mob.MobFileName;
065import org.apache.hadoop.hbase.mob.MobUtils;
066import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
067import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
068import org.junit.rules.TestName;
069import org.slf4j.Logger;
070import org.slf4j.LoggerFactory;
071
072import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
073import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos;
074
/**
 * This is the base class for HBaseFsck's ability to detect reasons for inconsistent tables. The
 * actual tests are in: TestHBaseFsckTwoRS, TestHBaseFsckOneRS, TestHBaseFsckMOB and
 * TestHBaseFsckReplicas.
 */
079public class BaseTestHBaseFsck {
080  static final int POOL_SIZE = 7;
081  protected static final Logger LOG = LoggerFactory.getLogger(BaseTestHBaseFsck.class);
082  protected final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
083  protected final static Configuration conf = TEST_UTIL.getConfiguration();
084  protected final static String FAM_STR = "fam";
085  protected final static byte[] FAM = Bytes.toBytes(FAM_STR);
086  protected final static int REGION_ONLINE_TIMEOUT = 800;
087  protected static AssignmentManager assignmentManager;
088  protected static RegionStates regionStates;
089  protected static ExecutorService tableExecutorService;
090  protected static ScheduledThreadPoolExecutor hbfsckExecutorService;
091  protected static ClusterConnection connection;
092  protected static Admin admin;
093
094  // for the instance, reset every test run
095  protected Table tbl;
096  protected final static byte[][] SPLITS =
097    new byte[][] { Bytes.toBytes("A"), Bytes.toBytes("B"), Bytes.toBytes("C") };
098  // one row per region.
099  protected final static byte[][] ROWKEYS = new byte[][] { Bytes.toBytes("00"), Bytes.toBytes("50"),
100    Bytes.toBytes("A0"), Bytes.toBytes("A5"), Bytes.toBytes("B0"), Bytes.toBytes("B5"),
101    Bytes.toBytes("C0"), Bytes.toBytes("C5") };
102
103  /**
104   * Debugging method to dump the contents of meta.
105   */
106  protected void dumpMeta(TableName tableName) throws IOException {
107    List<byte[]> metaRows = TEST_UTIL.getMetaTableRows(tableName);
108    for (byte[] row : metaRows) {
109      LOG.info(Bytes.toString(row));
110    }
111  }
112
113  /**
114   * This method is used to undeploy a region -- close it and attempt to remove its state from the
115   * Master.
116   */
117  protected void undeployRegion(Connection conn, ServerName sn, RegionInfo hri)
118    throws IOException, InterruptedException {
119    try {
120      HBaseFsckRepair.closeRegionSilentlyAndWait(conn, sn, hri);
121      if (!hri.isMetaRegion()) {
122        admin.offline(hri.getRegionName());
123      }
124    } catch (IOException ioe) {
125      LOG.warn(
126        "Got exception when attempting to offline region " + Bytes.toString(hri.getRegionName()),
127        ioe);
128    }
129  }
130
131  /**
132   * Delete a region from assignments, meta, or completely from hdfs.
133   * @param unassign if true unassign region if assigned
134   * @param metaRow  if true remove region's row from META
135   * @param hdfs     if true remove region's dir in HDFS
136   */
137  protected void deleteRegion(Configuration conf, final HTableDescriptor htd, byte[] startKey,
138    byte[] endKey, boolean unassign, boolean metaRow, boolean hdfs)
139    throws IOException, InterruptedException {
140    deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, false,
141      RegionInfo.DEFAULT_REPLICA_ID);
142  }
143
144  /**
145   * Delete a region from assignments, meta, or completely from hdfs.
146   * @param unassign       if true unassign region if assigned
147   * @param metaRow        if true remove region's row from META
148   * @param hdfs           if true remove region's dir in HDFS
149   * @param regionInfoOnly if true remove a region dir's .regioninfo file
150   * @param replicaId      replica id
151   */
152  protected void deleteRegion(Configuration conf, final HTableDescriptor htd, byte[] startKey,
153    byte[] endKey, boolean unassign, boolean metaRow, boolean hdfs, boolean regionInfoOnly,
154    int replicaId) throws IOException, InterruptedException {
155    LOG.info("** Before delete:");
156    dumpMeta(htd.getTableName());
157
158    List<HRegionLocation> locations;
159    try (RegionLocator rl = connection.getRegionLocator(tbl.getName())) {
160      locations = rl.getAllRegionLocations();
161    }
162
163    for (HRegionLocation location : locations) {
164      RegionInfo hri = location.getRegionInfo();
165      ServerName hsa = location.getServerName();
166      if (
167        Bytes.compareTo(hri.getStartKey(), startKey) == 0
168          && Bytes.compareTo(hri.getEndKey(), endKey) == 0 && hri.getReplicaId() == replicaId
169      ) {
170
171        LOG.info("RegionName: " + hri.getRegionNameAsString());
172        byte[] deleteRow = hri.getRegionName();
173
174        if (unassign) {
175          LOG.info("Undeploying region " + hri + " from server " + hsa);
176          undeployRegion(connection, hsa, hri);
177        }
178
179        if (regionInfoOnly) {
180          LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
181          Path rootDir = CommonFSUtils.getRootDir(conf);
182          FileSystem fs = rootDir.getFileSystem(conf);
183          Path p =
184            new Path(CommonFSUtils.getTableDir(rootDir, htd.getTableName()), hri.getEncodedName());
185          Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
186          fs.delete(hriPath, true);
187        }
188
189        if (hdfs) {
190          LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
191          Path rootDir = CommonFSUtils.getRootDir(conf);
192          FileSystem fs = rootDir.getFileSystem(conf);
193          Path p =
194            new Path(CommonFSUtils.getTableDir(rootDir, htd.getTableName()), hri.getEncodedName());
195          HBaseFsck.debugLsr(conf, p);
196          boolean success = fs.delete(p, true);
197          LOG.info("Deleted " + p + " sucessfully? " + success);
198          HBaseFsck.debugLsr(conf, p);
199        }
200
201        if (metaRow) {
202          try (Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService)) {
203            Delete delete = new Delete(deleteRow);
204            meta.delete(delete);
205          }
206        }
207      }
208      LOG.info(hri.toString() + hsa.toString());
209    }
210
211    TEST_UTIL.getMetaTableRows(htd.getTableName());
212    LOG.info("*** After delete:");
213    dumpMeta(htd.getTableName());
214  }
215
216  /**
217   * Setup a clean table before we start mucking with it. It will set tbl which needs to be closed
218   * after test nnn
219   */
220  void setupTable(TableName tablename) throws Exception {
221    setupTableWithRegionReplica(tablename, 1);
222  }
223
224  /**
225   * Setup a clean table with a certain region_replica count It will set tbl which needs to be
226   * closed after test n
227   */
228  void setupTableWithRegionReplica(TableName tablename, int replicaCount) throws Exception {
229    HTableDescriptor desc = new HTableDescriptor(tablename);
230    desc.setRegionReplication(replicaCount);
231    HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
232    desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
233    createTable(TEST_UTIL, desc, SPLITS);
234
235    tbl = connection.getTable(tablename, tableExecutorService);
236    List<Put> puts = new ArrayList<>(ROWKEYS.length);
237    for (byte[] row : ROWKEYS) {
238      Put p = new Put(row);
239      p.addColumn(FAM, Bytes.toBytes("val"), row);
240      puts.add(p);
241    }
242    tbl.put(puts);
243  }
244
245  /**
246   * Setup a clean table with a mob-enabled column.
247   * @param tablename The name of a table to be created. n
248   */
249  void setupMobTable(TableName tablename) throws Exception {
250    HTableDescriptor desc = new HTableDescriptor(tablename);
251    HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
252    hcd.setMobEnabled(true);
253    hcd.setMobThreshold(0);
254    desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
255    createTable(TEST_UTIL, desc, SPLITS);
256
257    tbl = connection.getTable(tablename, tableExecutorService);
258    List<Put> puts = new ArrayList<>(ROWKEYS.length);
259    for (byte[] row : ROWKEYS) {
260      Put p = new Put(row);
261      p.addColumn(FAM, Bytes.toBytes("val"), row);
262      puts.add(p);
263    }
264    tbl.put(puts);
265  }
266
267  /**
268   * Counts the number of rows to verify data loss or non-dataloss.
269   */
270  int countRows() throws IOException {
271    return TEST_UTIL.countRows(tbl);
272  }
273
274  /**
275   * Counts the number of rows to verify data loss or non-dataloss.
276   */
277  int countRows(byte[] start, byte[] end) throws IOException {
278    return TEST_UTIL.countRows(tbl, new Scan(start, end));
279  }
280
281  /**
282   * delete table in preparation for next test nn
283   */
284  void cleanupTable(TableName tablename) throws Exception {
285    if (tbl != null) {
286      tbl.close();
287      tbl = null;
288    }
289
290    ((ClusterConnection) connection).clearRegionLocationCache();
291    deleteTable(TEST_UTIL, tablename);
292  }
293
294  /**
295   * Get region info from local cluster.
296   */
297  Map<ServerName, List<String>> getDeployedHRIs(final Admin admin) throws IOException {
298    ClusterMetrics status = admin.getClusterMetrics(EnumSet.of(Option.LIVE_SERVERS));
299    Collection<ServerName> regionServers = status.getLiveServerMetrics().keySet();
300    Map<ServerName, List<String>> mm = new HashMap<>();
301    for (ServerName hsi : regionServers) {
302      AdminProtos.AdminService.BlockingInterface server = connection.getAdmin(hsi);
303
304      // list all online regions from this region server
305      List<RegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
306      List<String> regionNames = new ArrayList<>(regions.size());
307      for (RegionInfo hri : regions) {
308        regionNames.add(hri.getRegionNameAsString());
309      }
310      mm.put(hsi, regionNames);
311    }
312    return mm;
313  }
314
315  /**
316   * Returns the HSI a region info is on.
317   */
318  ServerName findDeployedHSI(Map<ServerName, List<String>> mm, RegionInfo hri) {
319    for (Map.Entry<ServerName, List<String>> e : mm.entrySet()) {
320      if (e.getValue().contains(hri.getRegionNameAsString())) {
321        return e.getKey();
322      }
323    }
324    return null;
325  }
326
327  public void deleteTableDir(TableName table) throws IOException {
328    Path rootDir = CommonFSUtils.getRootDir(conf);
329    FileSystem fs = rootDir.getFileSystem(conf);
330    Path p = CommonFSUtils.getTableDir(rootDir, table);
331    HBaseFsck.debugLsr(conf, p);
332    boolean success = fs.delete(p, true);
333    LOG.info("Deleted " + p + " sucessfully? " + success);
334  }
335
336  /**
337   * We don't have an easy way to verify that a flush completed, so we loop until we find a
338   * legitimate hfile and return it. nn * @return Path of a flushed hfile. n
339   */
340  Path getFlushedHFile(FileSystem fs, TableName table) throws IOException {
341    Path tableDir = CommonFSUtils.getTableDir(CommonFSUtils.getRootDir(conf), table);
342    Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
343    Path famDir = new Path(regionDir, FAM_STR);
344
345    // keep doing this until we get a legit hfile
346    while (true) {
347      FileStatus[] hfFss = fs.listStatus(famDir);
348      if (hfFss.length == 0) {
349        continue;
350      }
351      for (FileStatus hfs : hfFss) {
352        if (!hfs.isDirectory()) {
353          return hfs.getPath();
354        }
355      }
356    }
357  }
358
359  /**
360   * Gets flushed mob files.
361   * @param fs    The current file system.
362   * @param table The current table name.
363   * @return Path of a flushed hfile. n
364   */
365  Path getFlushedMobFile(FileSystem fs, TableName table) throws IOException {
366    Path famDir = MobUtils.getMobFamilyPath(conf, table, FAM_STR);
367
368    // keep doing this until we get a legit hfile
369    while (true) {
370      FileStatus[] hfFss = fs.listStatus(famDir);
371      if (hfFss.length == 0) {
372        continue;
373      }
374      for (FileStatus hfs : hfFss) {
375        if (!hfs.isDirectory()) {
376          return hfs.getPath();
377        }
378      }
379    }
380  }
381
382  /**
383   * Creates a new mob file name by the old one.
384   * @param oldFileName The old mob file name.
385   * @return The new mob file name.
386   */
387  String createMobFileName(String oldFileName) {
388    MobFileName mobFileName = MobFileName.create(oldFileName);
389    String startKey = mobFileName.getStartKey();
390    String date = mobFileName.getDate();
391    return MobFileName
392      .create(startKey, date, TEST_UTIL.getRandomUUID().toString().replaceAll("-", ""), "abcdef")
393      .getFileName();
394  }
395
396  /**
397   * Test that use this should have a timeout, because this method could potentially wait forever.
398   */
399  protected void doQuarantineTest(TableName table, HBaseFsck hbck, int check, int corrupt, int fail,
400    int quar, int missing) throws Exception {
401    try {
402      setupTable(table);
403      assertEquals(ROWKEYS.length, countRows());
404      admin.flush(table); // flush is async.
405
406      // Mess it up by leaving a hole in the assignment, meta, and hdfs data
407      admin.disableTable(table);
408
409      String[] args = { "-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission",
410        table.getNameAsString() };
411      HBaseFsck res = hbck.exec(hbfsckExecutorService, args);
412
413      HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
414      assertEquals(hfcc.getHFilesChecked(), check);
415      assertEquals(hfcc.getCorrupted().size(), corrupt);
416      assertEquals(hfcc.getFailures().size(), fail);
417      assertEquals(hfcc.getQuarantined().size(), quar);
418      assertEquals(hfcc.getMissing().size(), missing);
419
420      // its been fixed, verify that we can enable
421      admin.enableTableAsync(table);
422      while (!admin.isTableEnabled(table)) {
423        try {
424          Thread.sleep(250);
425        } catch (InterruptedException e) {
426          e.printStackTrace();
427          fail("Interrupted when trying to enable table " + table);
428        }
429      }
430    } finally {
431      cleanupTable(table);
432    }
433  }
434
435  static class MockErrorReporter implements HbckErrorReporter {
436    static int calledCount = 0;
437
438    @Override
439    public void clear() {
440      calledCount++;
441    }
442
443    @Override
444    public void report(String message) {
445      calledCount++;
446    }
447
448    @Override
449    public void reportError(String message) {
450      calledCount++;
451    }
452
453    @Override
454    public void reportError(ERROR_CODE errorCode, String message) {
455      calledCount++;
456    }
457
458    @Override
459    public void reportError(ERROR_CODE errorCode, String message, HbckTableInfo table) {
460      calledCount++;
461    }
462
463    @Override
464    public void reportError(ERROR_CODE errorCode, String message, HbckTableInfo table,
465      HbckRegionInfo info) {
466      calledCount++;
467    }
468
469    @Override
470    public void reportError(ERROR_CODE errorCode, String message, HbckTableInfo table,
471      HbckRegionInfo info1, HbckRegionInfo info2) {
472      calledCount++;
473    }
474
475    @Override
476    public int summarize() {
477      return ++calledCount;
478    }
479
480    @Override
481    public void detail(String details) {
482      calledCount++;
483    }
484
485    @Override
486    public ArrayList<ERROR_CODE> getErrorList() {
487      calledCount++;
488      return new ArrayList<>();
489    }
490
491    @Override
492    public void progress() {
493      calledCount++;
494    }
495
496    @Override
497    public void print(String message) {
498      calledCount++;
499    }
500
501    @Override
502    public void resetErrors() {
503      calledCount++;
504    }
505
506    @Override
507    public boolean tableHasErrors(HbckTableInfo table) {
508      calledCount++;
509      return false;
510    }
511  }
512
513  protected void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs,
514    boolean regionInfoOnly) throws IOException, InterruptedException {
515    HRegionLocation metaLocation = connection.getRegionLocator(TableName.META_TABLE_NAME)
516      .getRegionLocation(HConstants.EMPTY_START_ROW);
517    ServerName hsa = metaLocation.getServerName();
518    RegionInfo hri = metaLocation.getRegionInfo();
519    if (unassign) {
520      LOG.info("Undeploying meta region " + hri + " from server " + hsa);
521      try (Connection unmanagedConnection = ConnectionFactory.createConnection(conf)) {
522        undeployRegion(unmanagedConnection, hsa, hri);
523      }
524    }
525
526    if (regionInfoOnly) {
527      LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
528      Path rootDir = CommonFSUtils.getRootDir(conf);
529      FileSystem fs = rootDir.getFileSystem(conf);
530      Path p =
531        new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(), hri.getEncodedName());
532      Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
533      fs.delete(hriPath, true);
534    }
535
536    if (hdfs) {
537      LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
538      Path rootDir = CommonFSUtils.getRootDir(conf);
539      FileSystem fs = rootDir.getFileSystem(conf);
540      Path p =
541        new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(), hri.getEncodedName());
542      HBaseFsck.debugLsr(conf, p);
543      boolean success = fs.delete(p, true);
544      LOG.info("Deleted " + p + " sucessfully? " + success);
545      HBaseFsck.debugLsr(conf, p);
546    }
547  }
548
549  @org.junit.Rule
550  public TestName name = new TestName();
551
552  public static class MasterSyncCoprocessor implements MasterCoprocessor, MasterObserver {
553    volatile CountDownLatch tableCreationLatch = null;
554    volatile CountDownLatch tableDeletionLatch = null;
555
556    @Override
557    public Optional<MasterObserver> getMasterObserver() {
558      return Optional.of(this);
559    }
560
561    @Override
562    public void postCompletedCreateTableAction(
563      final ObserverContext<MasterCoprocessorEnvironment> ctx, final TableDescriptor desc,
564      final RegionInfo[] regions) throws IOException {
565      // the AccessController test, some times calls only and directly the
566      // postCompletedCreateTableAction()
567      if (tableCreationLatch != null) {
568        tableCreationLatch.countDown();
569      }
570    }
571
572    @Override
573    public void postCompletedDeleteTableAction(
574      final ObserverContext<MasterCoprocessorEnvironment> ctx, final TableName tableName)
575      throws IOException {
576      // the AccessController test, some times calls only and directly the
577      // postCompletedDeleteTableAction()
578      if (tableDeletionLatch != null) {
579        tableDeletionLatch.countDown();
580      }
581    }
582  }
583
584  public static void createTable(HBaseTestingUtility testUtil, HTableDescriptor htd,
585    byte[][] splitKeys) throws Exception {
586    // NOTE: We need a latch because admin is not sync,
587    // so the postOp coprocessor method may be called after the admin operation returned.
588    MasterSyncCoprocessor coproc = testUtil.getHBaseCluster().getMaster().getMasterCoprocessorHost()
589      .findCoprocessor(MasterSyncCoprocessor.class);
590    coproc.tableCreationLatch = new CountDownLatch(1);
591    if (splitKeys != null) {
592      admin.createTable(htd, splitKeys);
593    } else {
594      admin.createTable(htd);
595    }
596    coproc.tableCreationLatch.await();
597    coproc.tableCreationLatch = null;
598    testUtil.waitUntilAllRegionsAssigned(htd.getTableName());
599  }
600
601  public static void deleteTable(HBaseTestingUtility testUtil, TableName tableName)
602    throws Exception {
603    // NOTE: We need a latch because admin is not sync,
604    // so the postOp coprocessor method may be called after the admin operation returned.
605    MasterSyncCoprocessor coproc = testUtil.getHBaseCluster().getMaster().getMasterCoprocessorHost()
606      .findCoprocessor(MasterSyncCoprocessor.class);
607    coproc.tableDeletionLatch = new CountDownLatch(1);
608    try {
609      admin.disableTable(tableName);
610    } catch (Exception e) {
611      LOG.debug("Table: " + tableName + " already disabled, so just deleting it.");
612    }
613    admin.deleteTable(tableName);
614    coproc.tableDeletionLatch.await();
615    coproc.tableDeletionLatch = null;
616  }
617}