001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.util; 019 020import static org.junit.Assert.assertEquals; 021import static org.junit.Assert.fail; 022 023import java.io.IOException; 024import java.util.ArrayList; 025import java.util.Collection; 026import java.util.EnumSet; 027import java.util.HashMap; 028import java.util.List; 029import java.util.Map; 030import java.util.Optional; 031import java.util.concurrent.CountDownLatch; 032import java.util.concurrent.ExecutorService; 033import java.util.concurrent.ScheduledThreadPoolExecutor; 034import org.apache.hadoop.conf.Configuration; 035import org.apache.hadoop.fs.FileStatus; 036import org.apache.hadoop.fs.FileSystem; 037import org.apache.hadoop.fs.Path; 038import org.apache.hadoop.hbase.ClusterMetrics; 039import org.apache.hadoop.hbase.ClusterMetrics.Option; 040import org.apache.hadoop.hbase.HBaseTestingUtility; 041import org.apache.hadoop.hbase.HColumnDescriptor; 042import org.apache.hadoop.hbase.HConstants; 043import org.apache.hadoop.hbase.HRegionLocation; 044import org.apache.hadoop.hbase.HTableDescriptor; 045import org.apache.hadoop.hbase.ServerName; 046import org.apache.hadoop.hbase.TableName; 047import org.apache.hadoop.hbase.client.Admin; 048import org.apache.hadoop.hbase.client.ClusterConnection; 049import org.apache.hadoop.hbase.client.Connection; 050import org.apache.hadoop.hbase.client.ConnectionFactory; 051import org.apache.hadoop.hbase.client.Delete; 052import org.apache.hadoop.hbase.client.Put; 053import org.apache.hadoop.hbase.client.RegionInfo; 054import org.apache.hadoop.hbase.client.RegionLocator; 055import org.apache.hadoop.hbase.client.Scan; 056import org.apache.hadoop.hbase.client.Table; 057import org.apache.hadoop.hbase.client.TableDescriptor; 058import org.apache.hadoop.hbase.coprocessor.MasterCoprocessor; 059import org.apache.hadoop.hbase.coprocessor.MasterCoprocessorEnvironment; 060import org.apache.hadoop.hbase.coprocessor.MasterObserver; 061import org.apache.hadoop.hbase.coprocessor.ObserverContext; 062import org.apache.hadoop.hbase.master.assignment.AssignmentManager; 063import org.apache.hadoop.hbase.master.assignment.RegionStates; 064import org.apache.hadoop.hbase.mob.MobFileName; 065import org.apache.hadoop.hbase.mob.MobUtils; 066import org.apache.hadoop.hbase.regionserver.HRegionFileSystem; 067import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker; 068import org.junit.rules.TestName; 069import org.slf4j.Logger; 070import org.slf4j.LoggerFactory; 071 072import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; 073import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos; 074 075/** 076 * This is the base class for HBaseFsck's ability to detect reasons for inconsistent tables. Actual 077 * tests are in : TestHBaseFsckTwoRS TestHBaseFsckOneRS TestHBaseFsckMOB TestHBaseFsckReplicas 078 */ 079public class BaseTestHBaseFsck { 080 static final int POOL_SIZE = 7; 081 protected static final Logger LOG = LoggerFactory.getLogger(BaseTestHBaseFsck.class); 082 protected final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); 083 protected final static Configuration conf = TEST_UTIL.getConfiguration(); 084 protected final static String FAM_STR = "fam"; 085 protected final static byte[] FAM = Bytes.toBytes(FAM_STR); 086 protected final static int REGION_ONLINE_TIMEOUT = 800; 087 protected static AssignmentManager assignmentManager; 088 protected static RegionStates regionStates; 089 protected static ExecutorService tableExecutorService; 090 protected static ScheduledThreadPoolExecutor hbfsckExecutorService; 091 protected static ClusterConnection connection; 092 protected static Admin admin; 093 094 // for the instance, reset every test run 095 protected Table tbl; 096 protected final static byte[][] SPLITS = 097 new byte[][] { Bytes.toBytes("A"), Bytes.toBytes("B"), Bytes.toBytes("C") }; 098 // one row per region. 099 protected final static byte[][] ROWKEYS = new byte[][] { Bytes.toBytes("00"), Bytes.toBytes("50"), 100 Bytes.toBytes("A0"), Bytes.toBytes("A5"), Bytes.toBytes("B0"), Bytes.toBytes("B5"), 101 Bytes.toBytes("C0"), Bytes.toBytes("C5") }; 102 103 /** 104 * Debugging method to dump the contents of meta. 105 */ 106 protected void dumpMeta(TableName tableName) throws IOException { 107 List<byte[]> metaRows = TEST_UTIL.getMetaTableRows(tableName); 108 for (byte[] row : metaRows) { 109 LOG.info(Bytes.toString(row)); 110 } 111 } 112 113 /** 114 * This method is used to undeploy a region -- close it and attempt to remove its state from the 115 * Master. 116 */ 117 protected void undeployRegion(Connection conn, ServerName sn, RegionInfo hri) 118 throws IOException, InterruptedException { 119 try { 120 HBaseFsckRepair.closeRegionSilentlyAndWait(conn, sn, hri); 121 if (!hri.isMetaRegion()) { 122 admin.offline(hri.getRegionName()); 123 } 124 } catch (IOException ioe) { 125 LOG.warn( 126 "Got exception when attempting to offline region " + Bytes.toString(hri.getRegionName()), 127 ioe); 128 } 129 } 130 131 /** 132 * Delete a region from assignments, meta, or completely from hdfs. 133 * @param unassign if true unassign region if assigned 134 * @param metaRow if true remove region's row from META 135 * @param hdfs if true remove region's dir in HDFS 136 */ 137 protected void deleteRegion(Configuration conf, final HTableDescriptor htd, byte[] startKey, 138 byte[] endKey, boolean unassign, boolean metaRow, boolean hdfs) 139 throws IOException, InterruptedException { 140 deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, false, 141 RegionInfo.DEFAULT_REPLICA_ID); 142 } 143 144 /** 145 * Delete a region from assignments, meta, or completely from hdfs. 146 * @param unassign if true unassign region if assigned 147 * @param metaRow if true remove region's row from META 148 * @param hdfs if true remove region's dir in HDFS 149 * @param regionInfoOnly if true remove a region dir's .regioninfo file 150 * @param replicaId replica id 151 */ 152 protected void deleteRegion(Configuration conf, final HTableDescriptor htd, byte[] startKey, 153 byte[] endKey, boolean unassign, boolean metaRow, boolean hdfs, boolean regionInfoOnly, 154 int replicaId) throws IOException, InterruptedException { 155 LOG.info("** Before delete:"); 156 dumpMeta(htd.getTableName()); 157 158 List<HRegionLocation> locations; 159 try (RegionLocator rl = connection.getRegionLocator(tbl.getName())) { 160 locations = rl.getAllRegionLocations(); 161 } 162 163 for (HRegionLocation location : locations) { 164 RegionInfo hri = location.getRegionInfo(); 165 ServerName hsa = location.getServerName(); 166 if ( 167 Bytes.compareTo(hri.getStartKey(), startKey) == 0 168 && Bytes.compareTo(hri.getEndKey(), endKey) == 0 && hri.getReplicaId() == replicaId 169 ) { 170 171 LOG.info("RegionName: " + hri.getRegionNameAsString()); 172 byte[] deleteRow = hri.getRegionName(); 173 174 if (unassign) { 175 LOG.info("Undeploying region " + hri + " from server " + hsa); 176 undeployRegion(connection, hsa, hri); 177 } 178 179 if (regionInfoOnly) { 180 LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString()); 181 Path rootDir = CommonFSUtils.getRootDir(conf); 182 FileSystem fs = rootDir.getFileSystem(conf); 183 Path p = 184 new Path(CommonFSUtils.getTableDir(rootDir, htd.getTableName()), hri.getEncodedName()); 185 Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE); 186 fs.delete(hriPath, true); 187 } 188 189 if (hdfs) { 190 LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString()); 191 Path rootDir = CommonFSUtils.getRootDir(conf); 192 FileSystem fs = rootDir.getFileSystem(conf); 193 Path p = 194 new Path(CommonFSUtils.getTableDir(rootDir, htd.getTableName()), hri.getEncodedName()); 195 HBaseFsck.debugLsr(conf, p); 196 boolean success = fs.delete(p, true); 197 LOG.info("Deleted " + p + " sucessfully? " + success); 198 HBaseFsck.debugLsr(conf, p); 199 } 200 201 if (metaRow) { 202 try (Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService)) { 203 Delete delete = new Delete(deleteRow); 204 meta.delete(delete); 205 } 206 } 207 } 208 LOG.info(hri.toString() + hsa.toString()); 209 } 210 211 TEST_UTIL.getMetaTableRows(htd.getTableName()); 212 LOG.info("*** After delete:"); 213 dumpMeta(htd.getTableName()); 214 } 215 216 /** 217 * Setup a clean table before we start mucking with it. It will set tbl which needs to be closed 218 * after test nnn 219 */ 220 void setupTable(TableName tablename) throws Exception { 221 setupTableWithRegionReplica(tablename, 1); 222 } 223 224 /** 225 * Setup a clean table with a certain region_replica count It will set tbl which needs to be 226 * closed after test n 227 */ 228 void setupTableWithRegionReplica(TableName tablename, int replicaCount) throws Exception { 229 HTableDescriptor desc = new HTableDescriptor(tablename); 230 desc.setRegionReplication(replicaCount); 231 HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM)); 232 desc.addFamily(hcd); // If a table has no CF's it doesn't get checked 233 createTable(TEST_UTIL, desc, SPLITS); 234 235 tbl = connection.getTable(tablename, tableExecutorService); 236 List<Put> puts = new ArrayList<>(ROWKEYS.length); 237 for (byte[] row : ROWKEYS) { 238 Put p = new Put(row); 239 p.addColumn(FAM, Bytes.toBytes("val"), row); 240 puts.add(p); 241 } 242 tbl.put(puts); 243 } 244 245 /** 246 * Setup a clean table with a mob-enabled column. 247 * @param tablename The name of a table to be created. n 248 */ 249 void setupMobTable(TableName tablename) throws Exception { 250 HTableDescriptor desc = new HTableDescriptor(tablename); 251 HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM)); 252 hcd.setMobEnabled(true); 253 hcd.setMobThreshold(0); 254 desc.addFamily(hcd); // If a table has no CF's it doesn't get checked 255 createTable(TEST_UTIL, desc, SPLITS); 256 257 tbl = connection.getTable(tablename, tableExecutorService); 258 List<Put> puts = new ArrayList<>(ROWKEYS.length); 259 for (byte[] row : ROWKEYS) { 260 Put p = new Put(row); 261 p.addColumn(FAM, Bytes.toBytes("val"), row); 262 puts.add(p); 263 } 264 tbl.put(puts); 265 } 266 267 /** 268 * Counts the number of rows to verify data loss or non-dataloss. 269 */ 270 int countRows() throws IOException { 271 return TEST_UTIL.countRows(tbl); 272 } 273 274 /** 275 * Counts the number of rows to verify data loss or non-dataloss. 276 */ 277 int countRows(byte[] start, byte[] end) throws IOException { 278 return TEST_UTIL.countRows(tbl, new Scan(start, end)); 279 } 280 281 /** 282 * delete table in preparation for next test nn 283 */ 284 void cleanupTable(TableName tablename) throws Exception { 285 if (tbl != null) { 286 tbl.close(); 287 tbl = null; 288 } 289 290 ((ClusterConnection) connection).clearRegionLocationCache(); 291 deleteTable(TEST_UTIL, tablename); 292 } 293 294 /** 295 * Get region info from local cluster. 296 */ 297 Map<ServerName, List<String>> getDeployedHRIs(final Admin admin) throws IOException { 298 ClusterMetrics status = admin.getClusterMetrics(EnumSet.of(Option.LIVE_SERVERS)); 299 Collection<ServerName> regionServers = status.getLiveServerMetrics().keySet(); 300 Map<ServerName, List<String>> mm = new HashMap<>(); 301 for (ServerName hsi : regionServers) { 302 AdminProtos.AdminService.BlockingInterface server = connection.getAdmin(hsi); 303 304 // list all online regions from this region server 305 List<RegionInfo> regions = ProtobufUtil.getOnlineRegions(server); 306 List<String> regionNames = new ArrayList<>(regions.size()); 307 for (RegionInfo hri : regions) { 308 regionNames.add(hri.getRegionNameAsString()); 309 } 310 mm.put(hsi, regionNames); 311 } 312 return mm; 313 } 314 315 /** 316 * Returns the HSI a region info is on. 317 */ 318 ServerName findDeployedHSI(Map<ServerName, List<String>> mm, RegionInfo hri) { 319 for (Map.Entry<ServerName, List<String>> e : mm.entrySet()) { 320 if (e.getValue().contains(hri.getRegionNameAsString())) { 321 return e.getKey(); 322 } 323 } 324 return null; 325 } 326 327 public void deleteTableDir(TableName table) throws IOException { 328 Path rootDir = CommonFSUtils.getRootDir(conf); 329 FileSystem fs = rootDir.getFileSystem(conf); 330 Path p = CommonFSUtils.getTableDir(rootDir, table); 331 HBaseFsck.debugLsr(conf, p); 332 boolean success = fs.delete(p, true); 333 LOG.info("Deleted " + p + " sucessfully? " + success); 334 } 335 336 /** 337 * We don't have an easy way to verify that a flush completed, so we loop until we find a 338 * legitimate hfile and return it. nn * @return Path of a flushed hfile. n 339 */ 340 Path getFlushedHFile(FileSystem fs, TableName table) throws IOException { 341 Path tableDir = CommonFSUtils.getTableDir(CommonFSUtils.getRootDir(conf), table); 342 Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0); 343 Path famDir = new Path(regionDir, FAM_STR); 344 345 // keep doing this until we get a legit hfile 346 while (true) { 347 FileStatus[] hfFss = fs.listStatus(famDir); 348 if (hfFss.length == 0) { 349 continue; 350 } 351 for (FileStatus hfs : hfFss) { 352 if (!hfs.isDirectory()) { 353 return hfs.getPath(); 354 } 355 } 356 } 357 } 358 359 /** 360 * Gets flushed mob files. 361 * @param fs The current file system. 362 * @param table The current table name. 363 * @return Path of a flushed hfile. n 364 */ 365 Path getFlushedMobFile(FileSystem fs, TableName table) throws IOException { 366 Path famDir = MobUtils.getMobFamilyPath(conf, table, FAM_STR); 367 368 // keep doing this until we get a legit hfile 369 while (true) { 370 FileStatus[] hfFss = fs.listStatus(famDir); 371 if (hfFss.length == 0) { 372 continue; 373 } 374 for (FileStatus hfs : hfFss) { 375 if (!hfs.isDirectory()) { 376 return hfs.getPath(); 377 } 378 } 379 } 380 } 381 382 /** 383 * Creates a new mob file name by the old one. 384 * @param oldFileName The old mob file name. 385 * @return The new mob file name. 386 */ 387 String createMobFileName(String oldFileName) { 388 MobFileName mobFileName = MobFileName.create(oldFileName); 389 String startKey = mobFileName.getStartKey(); 390 String date = mobFileName.getDate(); 391 return MobFileName 392 .create(startKey, date, TEST_UTIL.getRandomUUID().toString().replaceAll("-", ""), "abcdef") 393 .getFileName(); 394 } 395 396 /** 397 * Test that use this should have a timeout, because this method could potentially wait forever. 398 */ 399 protected void doQuarantineTest(TableName table, HBaseFsck hbck, int check, int corrupt, int fail, 400 int quar, int missing) throws Exception { 401 try { 402 setupTable(table); 403 assertEquals(ROWKEYS.length, countRows()); 404 admin.flush(table); // flush is async. 405 406 // Mess it up by leaving a hole in the assignment, meta, and hdfs data 407 admin.disableTable(table); 408 409 String[] args = { "-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission", 410 table.getNameAsString() }; 411 HBaseFsck res = hbck.exec(hbfsckExecutorService, args); 412 413 HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker(); 414 assertEquals(hfcc.getHFilesChecked(), check); 415 assertEquals(hfcc.getCorrupted().size(), corrupt); 416 assertEquals(hfcc.getFailures().size(), fail); 417 assertEquals(hfcc.getQuarantined().size(), quar); 418 assertEquals(hfcc.getMissing().size(), missing); 419 420 // its been fixed, verify that we can enable 421 admin.enableTableAsync(table); 422 while (!admin.isTableEnabled(table)) { 423 try { 424 Thread.sleep(250); 425 } catch (InterruptedException e) { 426 e.printStackTrace(); 427 fail("Interrupted when trying to enable table " + table); 428 } 429 } 430 } finally { 431 cleanupTable(table); 432 } 433 } 434 435 static class MockErrorReporter implements HbckErrorReporter { 436 static int calledCount = 0; 437 438 @Override 439 public void clear() { 440 calledCount++; 441 } 442 443 @Override 444 public void report(String message) { 445 calledCount++; 446 } 447 448 @Override 449 public void reportError(String message) { 450 calledCount++; 451 } 452 453 @Override 454 public void reportError(ERROR_CODE errorCode, String message) { 455 calledCount++; 456 } 457 458 @Override 459 public void reportError(ERROR_CODE errorCode, String message, HbckTableInfo table) { 460 calledCount++; 461 } 462 463 @Override 464 public void reportError(ERROR_CODE errorCode, String message, HbckTableInfo table, 465 HbckRegionInfo info) { 466 calledCount++; 467 } 468 469 @Override 470 public void reportError(ERROR_CODE errorCode, String message, HbckTableInfo table, 471 HbckRegionInfo info1, HbckRegionInfo info2) { 472 calledCount++; 473 } 474 475 @Override 476 public int summarize() { 477 return ++calledCount; 478 } 479 480 @Override 481 public void detail(String details) { 482 calledCount++; 483 } 484 485 @Override 486 public ArrayList<ERROR_CODE> getErrorList() { 487 calledCount++; 488 return new ArrayList<>(); 489 } 490 491 @Override 492 public void progress() { 493 calledCount++; 494 } 495 496 @Override 497 public void print(String message) { 498 calledCount++; 499 } 500 501 @Override 502 public void resetErrors() { 503 calledCount++; 504 } 505 506 @Override 507 public boolean tableHasErrors(HbckTableInfo table) { 508 calledCount++; 509 return false; 510 } 511 } 512 513 protected void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs, 514 boolean regionInfoOnly) throws IOException, InterruptedException { 515 HRegionLocation metaLocation = connection.getRegionLocator(TableName.META_TABLE_NAME) 516 .getRegionLocation(HConstants.EMPTY_START_ROW); 517 ServerName hsa = metaLocation.getServerName(); 518 RegionInfo hri = metaLocation.getRegionInfo(); 519 if (unassign) { 520 LOG.info("Undeploying meta region " + hri + " from server " + hsa); 521 try (Connection unmanagedConnection = ConnectionFactory.createConnection(conf)) { 522 undeployRegion(unmanagedConnection, hsa, hri); 523 } 524 } 525 526 if (regionInfoOnly) { 527 LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString()); 528 Path rootDir = CommonFSUtils.getRootDir(conf); 529 FileSystem fs = rootDir.getFileSystem(conf); 530 Path p = 531 new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(), hri.getEncodedName()); 532 Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE); 533 fs.delete(hriPath, true); 534 } 535 536 if (hdfs) { 537 LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString()); 538 Path rootDir = CommonFSUtils.getRootDir(conf); 539 FileSystem fs = rootDir.getFileSystem(conf); 540 Path p = 541 new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(), hri.getEncodedName()); 542 HBaseFsck.debugLsr(conf, p); 543 boolean success = fs.delete(p, true); 544 LOG.info("Deleted " + p + " sucessfully? " + success); 545 HBaseFsck.debugLsr(conf, p); 546 } 547 } 548 549 @org.junit.Rule 550 public TestName name = new TestName(); 551 552 public static class MasterSyncCoprocessor implements MasterCoprocessor, MasterObserver { 553 volatile CountDownLatch tableCreationLatch = null; 554 volatile CountDownLatch tableDeletionLatch = null; 555 556 @Override 557 public Optional<MasterObserver> getMasterObserver() { 558 return Optional.of(this); 559 } 560 561 @Override 562 public void postCompletedCreateTableAction( 563 final ObserverContext<MasterCoprocessorEnvironment> ctx, final TableDescriptor desc, 564 final RegionInfo[] regions) throws IOException { 565 // the AccessController test, some times calls only and directly the 566 // postCompletedCreateTableAction() 567 if (tableCreationLatch != null) { 568 tableCreationLatch.countDown(); 569 } 570 } 571 572 @Override 573 public void postCompletedDeleteTableAction( 574 final ObserverContext<MasterCoprocessorEnvironment> ctx, final TableName tableName) 575 throws IOException { 576 // the AccessController test, some times calls only and directly the 577 // postCompletedDeleteTableAction() 578 if (tableDeletionLatch != null) { 579 tableDeletionLatch.countDown(); 580 } 581 } 582 } 583 584 public static void createTable(HBaseTestingUtility testUtil, HTableDescriptor htd, 585 byte[][] splitKeys) throws Exception { 586 // NOTE: We need a latch because admin is not sync, 587 // so the postOp coprocessor method may be called after the admin operation returned. 588 MasterSyncCoprocessor coproc = testUtil.getHBaseCluster().getMaster().getMasterCoprocessorHost() 589 .findCoprocessor(MasterSyncCoprocessor.class); 590 coproc.tableCreationLatch = new CountDownLatch(1); 591 if (splitKeys != null) { 592 admin.createTable(htd, splitKeys); 593 } else { 594 admin.createTable(htd); 595 } 596 coproc.tableCreationLatch.await(); 597 coproc.tableCreationLatch = null; 598 testUtil.waitUntilAllRegionsAssigned(htd.getTableName()); 599 } 600 601 public static void deleteTable(HBaseTestingUtility testUtil, TableName tableName) 602 throws Exception { 603 // NOTE: We need a latch because admin is not sync, 604 // so the postOp coprocessor method may be called after the admin operation returned. 605 MasterSyncCoprocessor coproc = testUtil.getHBaseCluster().getMaster().getMasterCoprocessorHost() 606 .findCoprocessor(MasterSyncCoprocessor.class); 607 coproc.tableDeletionLatch = new CountDownLatch(1); 608 try { 609 admin.disableTable(tableName); 610 } catch (Exception e) { 611 LOG.debug("Table: " + tableName + " already disabled, so just deleting it."); 612 } 613 admin.deleteTable(tableName); 614 coproc.tableDeletionLatch.await(); 615 coproc.tableDeletionLatch = null; 616 } 617}