/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.  See the License for the specific language governing
 * permissions and limitations under the License.
 */
package org.apache.hadoop.hbase.util;

import java.io.Closeable;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.InetAddress;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;

import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;
import com.google.common.collect.Ordering;
import com.google.common.collect.TreeMultimap;
import com.google.protobuf.ServiceException;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hbase.Abortable;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.ClusterStatus;
import org.apache.hadoop.hbase.CoordinatedStateException;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseInterfaceAudience;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.RegionLocations;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ClusterConnection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HConnectable;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.MetaScanner;
import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitorBase;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.RowMutations;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.master.MasterFileSystem;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService.BlockingInterface;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
import org.apache.hadoop.hbase.regionserver.StoreFileInfo;
import org.apache.hadoop.hbase.security.AccessDeniedException;
import org.apache.hadoop.hbase.security.UserProvider;
import org.apache.hadoop.hbase.util.Bytes.ByteArrayComparator;
import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
import org.apache.hadoop.hbase.util.hbck.TableIntegrityErrorHandler;
import org.apache.hadoop.hbase.util.hbck.TableIntegrityErrorHandlerImpl;
import org.apache.hadoop.hbase.util.hbck.TableLockChecker;
import org.apache.hadoop.hbase.wal.WALSplitter;
import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
import org.apache.hadoop.hbase.zookeeper.ZKTableStateClientSideReader;
import org.apache.hadoop.hbase.zookeeper.ZKTableStateManager;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.zookeeper.KeeperException;
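
/**
 * HBaseFsck (hbck) is a tool for checking and repairing region consistency and
 * table integrity problems in a corrupted HBase.
 * <p>
 * Region consistency checks verify that hbase:meta, region deployment on region
 * servers and the state of data in HDFS (.regioninfo files) all are in
 * accordance.
 * <p>
 * Table integrity checks verify that all possible row keys resolve to exactly
 * one region of a table.  This means there are no individual degenerate
 * or backwards regions; no holes between regions; and that there are no
 * overlapping regions.
 * <p>
 * The general repair strategy works in two phases:
 * <ol>
 * <li> Repair Table Integrity on HDFS. (merge or fabricate regions)
 * <li> Repair Region Consistency with hbase:meta and assignments
 * </ol>
 * <p>
 * Table integrity repairs deal solely with HDFS and can be done offline -- the
 * HBase region servers or master do not need to be running.  If there are any
 * orphan regions (regions with no .regioninfo file) or holes, new regions are
 * fabricated; overlapping regions are merged.
 * <p>
 * Region consistency requires three conditions -- 1) a valid .regioninfo file
 * present in an HDFS region dir, 2) a valid row with .regioninfo data in META,
 * and 3) a region is deployed only on the regionserver that it was assigned to.
 * Region consistency repairs require HBase to be online so that hbck can
 * contact the HBase master and region servers; the {@link #connect()} method
 * must first be called successfully.
 */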
@InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.TOOLS)
@InterfaceStability.Evolving
public class HBaseFsck extends Configured implements Closeable {
  public static final long DEFAULT_TIME_LAG = 60000; // default value of 1 minute
  public static final long DEFAULT_SLEEP_BEFORE_RERUN = 10000;
  private static final int MAX_NUM_THREADS = 50; // #threads to contact regions
  private static boolean rsSupportsOffline = true;
  private static final int DEFAULT_OVERLAPS_TO_SIDELINE = 2;
  private static final int DEFAULT_MAX_MERGE = 5;
  private static final String TO_BE_LOADED = "to_be_loaded";
  private static final String HBCK_LOCK_FILE = "hbase-hbck.lock";
  private static final int DEFAULT_MAX_LOCK_FILE_ATTEMPTS = 5;
  private static final int DEFAULT_LOCK_FILE_ATTEMPT_SLEEP_INTERVAL = 200; // milliseconds
  private static final int DEFAULT_LOCK_FILE_ATTEMPT_MAX_SLEEP_TIME = 5000; // milliseconds
  // Time (in seconds) to wait for the hbck lock file to be created before giving up
  private static final int DEFAULT_WAIT_FOR_LOCK_TIMEOUT = 80; // seconds

  /**********************
   * Internal resources
   **********************/
  private static final Log LOG = LogFactory.getLog(HBaseFsck.class.getName());
  private ClusterStatus status;
  private ClusterConnection connection;
  private Admin admin;
  private Table meta;
  // threads to do parallelizable tasks: retrieve data from regionservers, handle overlapping regions
  protected ExecutorService executor;
  private long startMillis = EnvironmentEdgeManager.currentTime();
  private HFileCorruptionChecker hfcc;
  private int retcode = 0;
  private Path HBCK_LOCK_PATH;
  private FSDataOutputStream hbckOutFd;
  // Records whether this instance created the hbck lock file and is therefore
  // responsible for removing it on exit.
  private final AtomicBoolean hbckLockCleanup = new AtomicBoolean(false);

  /***********
   * Options
   ***********/
  private static boolean details = false; // do we display the full report
  private long timelag = DEFAULT_TIME_LAG; // tables whose modtime is older
  private boolean fixAssignments = false; // fix assignment errors?
  private boolean fixMeta = false; // fix meta errors?
  private boolean checkHdfs = true; // load and check fs consistency?
  private boolean fixHdfsHoles = false; // fix fs holes?
  private boolean fixHdfsOverlaps = false; // fix fs overlaps (risky)
  private boolean fixHdfsOrphans = false; // fix fs holes (missing .regioninfo)
  private boolean fixTableOrphans = false; // fix fs holes (missing .tableinfo)
  private boolean fixVersionFile = false; // fix missing hbase.version file in hdfs
  private boolean fixSplitParents = false; // fix lingering split parents
  private boolean fixReferenceFiles = false; // fix lingering reference store files
  private boolean fixEmptyMetaCells = false; // fix (remove) empty REGIONINFO_QUALIFIER rows
  private boolean fixTableLocks = false; // fix table locks which are expired
  private boolean fixTableZNodes = false; // fix table znodes which are orphaned
  private boolean fixAny = false; // set to true if any of the fix options is requested

  // limit checking/fixes to listed tables, if empty attempt to check/fix all
  // hbase:meta is always checked
  private Set<TableName> tablesIncluded = new HashSet<TableName>();
  private int maxMerge = DEFAULT_MAX_MERGE; // maximum number of overlapping regions to merge
  // maximum number of overlapping regions to sideline
  private int maxOverlapsToSideline = DEFAULT_OVERLAPS_TO_SIDELINE;
  private boolean sidelineBigOverlaps = false; // sideline overlaps with >maxMerge regions
  private Path sidelineDir = null; // directory to sideline regions

  private boolean rerun = false; // if we tried to fix something, rerun hbck
  private static boolean summary = false; // if we want to print less output
  private boolean checkMetaOnly = false;
  private boolean checkRegionBoundaries = false;
  private boolean ignorePreCheckPermission = false; // if pre-check permission

  /*********
   * State
   *********/
  final private ErrorReporter errors;
  int fixes = 0;

  /**
   * This map contains the state of all hbck items.  It maps from encoded region
   * name to HbckInfo structure.  The information contained in HbckInfo is used
   * to detect and correct consistency (hdfs/meta/deployment) problems.
   */
  private TreeMap<String, HbckInfo> regionInfoMap = new TreeMap<String, HbckInfo>();
  private TreeSet<TableName> disabledTables =
      new TreeSet<TableName>();
  // Empty regioninfo qualifiers in hbase:meta
  private Set<Result> emptyRegionInfoQualifiers = new HashSet<Result>();

  /**
   * This map from Tablename -> TableInfo contains the structures necessary to
   * detect table consistency problems (holes, dupes, overlaps).  It is sorted
   * to prevent dupes.
   *
   * If tablesIncluded is empty, this map contains all tables.
   * Otherwise, it contains only meta tables and tables in tablesIncluded,
   * unless checkMetaOnly is specified, in which case, it contains only
   * the meta table.
   */
  private SortedMap<TableName, TableInfo> tablesInfo =
      new ConcurrentSkipListMap<TableName, TableInfo>();

  /**
   * When initially looking at HDFS, we attempt to find any orphaned data.
   */
  private List<HbckInfo> orphanHdfsDirs = Collections.synchronizedList(new ArrayList<HbckInfo>());

  private Map<TableName, Set<String>> orphanTableDirs =
      new HashMap<TableName, Set<String>>();

  private Map<TableName, Set<String>> skippedRegions = new HashMap<TableName, Set<String>>();

  /**
   * List of orphaned table ZNodes
   */
  private Set<TableName> orphanedTableZNodes = new HashSet<TableName>();
  private final RetryCounterFactory lockFileRetryCounterFactory;
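
  /**
   * Constructor
   *
   * @param conf Configuration object
   * @throws MasterNotRunningException if the master is not running
   * @throws ZooKeeperConnectionException if unable to connect to ZooKeeper
   */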
  public HBaseFsck(Configuration conf) throws MasterNotRunningException,
      ZooKeeperConnectionException, IOException, ClassNotFoundException {
    this(conf, createThreadPool(conf));
  }

  private static ExecutorService createThreadPool(Configuration conf) {
    int numThreads = conf.getInt("hbasefsck.numthreads", MAX_NUM_THREADS);
    return new ScheduledThreadPoolExecutor(numThreads, Threads.newDaemonThreadFactory("hbasefsck"));
  }
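
  /**
   * Constructor
   *
   * @param conf Configuration object
   * @param exec executor service used for parallelizable tasks
   * @throws MasterNotRunningException if the master is not running
   * @throws ZooKeeperConnectionException if unable to connect to ZooKeeper
   */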
  public HBaseFsck(Configuration conf, ExecutorService exec) throws MasterNotRunningException,
      ZooKeeperConnectionException, IOException, ClassNotFoundException {
    super(conf);
    errors = getErrorReporter(getConf());
    this.executor = exec;
    lockFileRetryCounterFactory = new RetryCounterFactory(
      getConf().getInt("hbase.hbck.lockfile.attempts", DEFAULT_MAX_LOCK_FILE_ATTEMPTS),
      getConf().getInt(
        "hbase.hbck.lockfile.attempt.sleep.interval", DEFAULT_LOCK_FILE_ATTEMPT_SLEEP_INTERVAL),
      getConf().getInt(
        "hbase.hbck.lockfile.attempt.maxsleeptime", DEFAULT_LOCK_FILE_ATTEMPT_MAX_SLEEP_TIME));
  }

  private class FileLockCallable implements Callable<FSDataOutputStream> {
    RetryCounter retryCounter;

    public FileLockCallable(RetryCounter retryCounter) {
      this.retryCounter = retryCounter;
    }

    @Override
    public FSDataOutputStream call() throws IOException {
      try {
        FileSystem fs = FSUtils.getCurrentFileSystem(getConf());
        FsPermission defaultPerms = FSUtils.getFilePermissions(fs, getConf(),
            HConstants.DATA_FILE_UMASK_KEY);
        Path tmpDir = new Path(FSUtils.getRootDir(getConf()), HConstants.HBASE_TEMP_DIRECTORY);
        fs.mkdirs(tmpDir);
        HBCK_LOCK_PATH = new Path(tmpDir, HBCK_LOCK_FILE);
        final FSDataOutputStream out = createFileWithRetries(fs, HBCK_LOCK_PATH, defaultPerms);
        out.writeBytes(InetAddress.getLocalHost().toString());
        out.flush();
        return out;
      } catch (RemoteException e) {
        if (AlreadyBeingCreatedException.class.getName().equals(e.getClassName())) {
          return null;
        } else {
          throw e;
        }
      }
    }

    private FSDataOutputStream createFileWithRetries(final FileSystem fs,
        final Path hbckLockFilePath, final FsPermission defaultPerms)
        throws IOException {

      IOException exception = null;
      do {
        try {
          return FSUtils.create(fs, hbckLockFilePath, defaultPerms, false);
        } catch (IOException ioe) {
          LOG.info("Failed to create lock file " + hbckLockFilePath.getName()
              + ", try=" + (retryCounter.getAttemptTimes() + 1) + " of "
              + retryCounter.getMaxAttempts());
          LOG.debug("Failed to create lock file " + hbckLockFilePath.getName(),
              ioe);
          try {
            exception = ioe;
            retryCounter.sleepUntilNextRetry();
          } catch (InterruptedException ie) {
            throw (InterruptedIOException) new InterruptedIOException(
                "Can't create lock file " + hbckLockFilePath.getName())
                .initCause(ie);
          }
        }
      } while (retryCounter.shouldRetry());

      throw exception;
    }
  }
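
  /**
   * This method maintains a lock using a file. If the creation fails we return null.
   *
   * @return FSDataOutputStream object corresponding to the newly opened lock file
   * @throws IOException if IO failure occurs
   */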
  private FSDataOutputStream checkAndMarkRunningHbck() throws IOException {
    RetryCounter retryCounter = lockFileRetryCounterFactory.create();
    FileLockCallable callable = new FileLockCallable(retryCounter);
    ExecutorService executor = Executors.newFixedThreadPool(1);
    FutureTask<FSDataOutputStream> futureTask = new FutureTask<FSDataOutputStream>(callable);
    executor.execute(futureTask);
    final int timeoutInSeconds = getConf().getInt(
      "hbase.hbck.lockfile.maxwaittime", DEFAULT_WAIT_FOR_LOCK_TIMEOUT);
    FSDataOutputStream stream = null;
    try {
      stream = futureTask.get(timeoutInSeconds, TimeUnit.SECONDS);
    } catch (ExecutionException ee) {
      LOG.warn("Encountered exception when opening lock file", ee);
    } catch (InterruptedException ie) {
      LOG.warn("Interrupted when opening lock file", ie);
      Thread.currentThread().interrupt();
    } catch (TimeoutException exception) {
      // took too long to obtain lock
      LOG.warn("Took more than " + timeoutInSeconds + " seconds in obtaining lock");
      futureTask.cancel(true);
    } finally {
      executor.shutdownNow();
    }
    return stream;
  }

  private void unlockHbck() {
    // Release the hbck lock, retrying the delete a bounded number of times
    if (hbckLockCleanup.compareAndSet(true, false)) {
      RetryCounter retryCounter = lockFileRetryCounterFactory.create();
      do {
        try {
          IOUtils.closeStream(hbckOutFd);
          FSUtils.delete(FSUtils.getCurrentFileSystem(getConf()),
              HBCK_LOCK_PATH, true);
          LOG.info("Finishing hbck");
          return;
        } catch (IOException ioe) {
          LOG.info("Failed to delete " + HBCK_LOCK_PATH + ", try="
              + (retryCounter.getAttemptTimes() + 1) + " of "
              + retryCounter.getMaxAttempts());
          LOG.debug("Failed to delete " + HBCK_LOCK_PATH, ioe);
          try {
            retryCounter.sleepUntilNextRetry();
          } catch (InterruptedException ie) {
            Thread.currentThread().interrupt();
            LOG.warn("Interrupted while deleting lock file " +
                HBCK_LOCK_PATH);
            return;
          }
        }
      } while (retryCounter.shouldRetry());
    }
  }
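
  /**
   * To repair region consistency, one must call connect() in order to repair
   * online state.
   */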
  public void connect() throws IOException {

    // Check if another instance of hbck is running by trying to create the lock file.
    hbckOutFd = checkAndMarkRunningHbck();
    if (hbckOutFd == null) {
      setRetCode(-1);
      LOG.error("Another instance of hbck is running, exiting this instance. [If you are sure" +
          " no other instance is running, delete the lock file " +
          HBCK_LOCK_PATH + " and rerun the tool]");
      throw new IOException("Duplicate hbck - Abort");
    }

    // Make sure to cleanup the lock
    hbckLockCleanup.set(true);

    // Add a shutdown hook to this thread, in case user tries to
    // kill the hbck with a ctrl-c, we want to cleanup the lock so that
    // it can be started again.
    Runtime.getRuntime().addShutdownHook(new Thread() {
      @Override
      public void run() {
        IOUtils.closeStream(HBaseFsck.this);
        unlockHbck();
      }
    });

    LOG.info("Launching hbck");

    connection = (ClusterConnection) ConnectionFactory.createConnection(getConf());
    admin = connection.getAdmin();
    meta = connection.getTable(TableName.META_TABLE_NAME);
    status = admin.getClusterStatus();
  }
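
  /**
   * Get deployed regions according to the region servers.
   */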
  private void loadDeployedRegions() throws IOException, InterruptedException {
    // From the master, get a list of all known live region servers
    Collection<ServerName> regionServers = status.getServers();
    errors.print("Number of live region servers: " + regionServers.size());
    if (details) {
      for (ServerName rsinfo: regionServers) {
        errors.print("  " + rsinfo.getServerName());
      }
    }

    // From the master, get a list of all dead region servers
    Collection<ServerName> deadRegionServers = status.getDeadServerNames();
    errors.print("Number of dead region servers: " + deadRegionServers.size());
    if (details) {
      for (ServerName name: deadRegionServers) {
        errors.print("  " + name);
      }
    }

    // Print the current master name and state
    errors.print("Master: " + status.getMaster());

    // Print the list of all backup masters
    Collection<ServerName> backupMasters = status.getBackupMasters();
    errors.print("Number of backup masters: " + backupMasters.size());
    if (details) {
      for (ServerName name: backupMasters) {
        errors.print("  " + name);
      }
    }

    errors.print("Average load: " + status.getAverageLoad());
    errors.print("Number of requests: " + status.getRequestsCount());
    errors.print("Number of regions: " + status.getRegionsCount());

    Map<String, RegionState> rits = status.getRegionsInTransition();
    errors.print("Number of regions in transition: " + rits.size());
    if (details) {
      for (RegionState state: rits.values()) {
        errors.print("  " + state.toDescriptiveString());
      }
    }

    // Determine what's deployed
    processRegionServers(regionServers);
  }
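
  /**
   * Clear the current state of hbck.
   */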
  private void clearState() {
    // Make sure regionInfo is empty before starting
    fixes = 0;
    regionInfoMap.clear();
    emptyRegionInfoQualifiers.clear();
    disabledTables.clear();
    errors.clear();
    tablesInfo.clear();
    orphanHdfsDirs.clear();
    skippedRegions.clear();
  }
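
  /**
   * This repair method analyzes hbase data in hdfs and repairs it to satisfy
   * the table integrity rules.  HBase doesn't need to be online for this
   * operation to work.
   */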
  public void offlineHdfsIntegrityRepair() throws IOException, InterruptedException {
    // Initial pass to fix orphans.
    if (shouldCheckHdfs() && (shouldFixHdfsOrphans() || shouldFixHdfsHoles()
        || shouldFixHdfsOverlaps() || shouldFixTableOrphans())) {
      LOG.info("Loading regioninfos from HDFS");
      // Iterate until no more fixes are made or the iteration limit is reached.
      int maxIterations = getConf().getInt("hbase.hbck.integrityrepair.iterations.max", 3);
      int curIter = 0;
      do {
        clearState(); // clears hbck state and resets fixes to 0.
        // repair what's on HDFS
        restoreHdfsIntegrity();
        curIter++; // limit the number of iterations.
      } while (fixes > 0 && curIter <= maxIterations);

      // Repairs should be done in the first iteration and verification in the second.
      // If there are more than 2 passes, something funny has happened.
      if (curIter > 2) {
        if (curIter == maxIterations) {
          LOG.warn("Exiting integrity repairs after max " + curIter + " iterations. "
              + "Tables integrity may not be fully repaired!");
        } else {
          LOG.info("Successfully exiting integrity repairs after " + curIter + " iterations");
        }
      }
    }
  }
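
  /**
   * This repair method requires the cluster to be online since it contacts
   * region servers and the masters.  It makes each region's state in HDFS, in
   * hbase:meta, and deployments consistent.
   *
   * @return if &gt; 0, the number of errors detected; if &lt; 0, there was an
   *     unrecoverable error.  If 0, we have a clean hbase.
   */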
  public int onlineConsistencyRepair() throws IOException, KeeperException,
      InterruptedException {
    clearState();

    // get regions according to what is online on each RegionServer
    loadDeployedRegions();
    // check whether hbase:meta is deployed and online
    recordMetaRegion();
    // Check if hbase:meta is found only once and in the right place
    if (!checkMetaRegion()) {
      String errorMsg = "hbase:meta table is not consistent. ";
      if (shouldFixAssignments()) {
        errorMsg += "HBCK will try fixing it. Rerun once hbase:meta is back to consistent state.";
      } else {
        errorMsg += "Run HBCK with proper fix options to fix hbase:meta inconsistency.";
      }
      errors.reportError(errorMsg + " Exiting...");
      return -2;
    }
    // Not going with further consistency checks for tables when hbase:meta itself
    // is not consistent.
    LOG.info("Loading regioninfos from the hbase:meta table");
    boolean success = loadMetaEntries();
    if (!success) return -1;

    // Check if hbase:meta has any rows with an empty REGIONINFO_QUALIFIER
    reportEmptyMetaCells();

    // Clean them up if requested
    if (shouldFixEmptyMetaCells()) {
      fixEmptyMetaCells();
    }

    // get a list of all tables that have not changed recently.
    if (!checkMetaOnly) {
      reportTablesInFlux();
    }

    // load regiondirs and regioninfos from HDFS
    if (shouldCheckHdfs()) {
      LOG.info("Loading region directories from HDFS");
      loadHdfsRegionDirs();
      LOG.info("Loading region information from HDFS");
      loadHdfsRegionInfos();
    }

    // Get disabled tables from ZooKeeper
    loadDisabledTables();

    // fix the orphan tables
    fixOrphanTables();

    LOG.info("Checking and fixing region consistency");
    // Check and fix consistency
    checkAndFixConsistency();

    // Check integrity (does not fix)
    checkIntegrity();
    return errors.getErrorList().size();
  }
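
  /**
   * Contacts the master and prints out cluster-wide information
   * @return 0 on success, non-zero on failure
   */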
  public int onlineHbck()
      throws IOException, KeeperException, InterruptedException, ServiceException {
    // print hbase server version
    errors.print("Version: " + status.getHBaseVersion());
    offlineHdfsIntegrityRepair();

    // turn the balancer off while repairing so regions don't move underneath us;
    // restore its previous state afterwards
    boolean oldBalancer = admin.setBalancerRunning(false, true);
    try {
      onlineConsistencyRepair();
    }
    finally {
      admin.setBalancerRunning(oldBalancer, false);
    }

    if (checkRegionBoundaries) {
      checkRegionBoundaries();
    }

    offlineReferenceFileRepair();

    checkAndFixTableLocks();

    // Check (and fix if requested) orphaned table ZNodes
    checkAndFixOrphanedTableZNodes();

    // Remove the hbck lock
    unlockHbck();

    // Print table summary
    printTableSummary(tablesInfo);
    return errors.summarize();
  }

  public static byte[] keyOnly(byte[] b) {
    if (b == null)
      return b;
    int rowlength = Bytes.toShort(b, 0);
    byte[] result = new byte[rowlength];
    System.arraycopy(b, Bytes.SIZEOF_SHORT, result, 0, rowlength);
    return result;
  }

  @Override
  public void close() throws IOException {
    IOUtils.cleanup(null, admin, meta, connection);
  }

  private static class RegionBoundariesInformation {
    public byte [] regionName;
    public byte [] metaFirstKey;
    public byte [] metaLastKey;
    public byte [] storesFirstKey;
    public byte [] storesLastKey;
    @Override
    public String toString() {
      return "regionName=" + Bytes.toStringBinary(regionName) +
          "\nmetaFirstKey=" + Bytes.toStringBinary(metaFirstKey) +
          "\nmetaLastKey=" + Bytes.toStringBinary(metaLastKey) +
          "\nstoresFirstKey=" + Bytes.toStringBinary(storesFirstKey) +
          "\nstoresLastKey=" + Bytes.toStringBinary(storesLastKey);
    }
  }

  public void checkRegionBoundaries() {
    try {
      ByteArrayComparator comparator = new ByteArrayComparator();
      List<HRegionInfo> regions = MetaScanner.listAllRegions(getConf(), connection, false);
      final RegionBoundariesInformation currentRegionBoundariesInformation =
          new RegionBoundariesInformation();
      Path hbaseRoot = FSUtils.getRootDir(getConf());
      for (HRegionInfo regionInfo : regions) {
        Path tableDir = FSUtils.getTableDir(hbaseRoot, regionInfo.getTable());
        currentRegionBoundariesInformation.regionName = regionInfo.getRegionName();
        // For each region, get the start and stop key from the META and compare them to the
        // same information from the Stores.
        Path path = new Path(tableDir, regionInfo.getEncodedName());
        FileSystem fs = path.getFileSystem(getConf());
        FileStatus[] files = fs.listStatus(path);
        // For all the column families in this region...
        byte[] storeFirstKey = null;
        byte[] storeLastKey = null;
        for (FileStatus file : files) {
          String fileName = file.getPath().toString();
          fileName = fileName.substring(fileName.lastIndexOf("/") + 1);
          if (!fileName.startsWith(".") && !fileName.endsWith("recovered.edits")) {
            FileStatus[] storeFiles = fs.listStatus(file.getPath());
            // For all the stores in this column family.
            for (FileStatus storeFile : storeFiles) {
              HFile.Reader reader = HFile.createReader(fs, storeFile.getPath(), new CacheConfig(
                  getConf()), getConf());
              if ((reader.getFirstKey() != null)
                  && ((storeFirstKey == null) || (comparator.compare(storeFirstKey,
                      reader.getFirstKey()) > 0))) {
                storeFirstKey = reader.getFirstKey();
              }
              if ((reader.getLastKey() != null)
                  && ((storeLastKey == null) || (comparator.compare(storeLastKey,
                      reader.getLastKey())) < 0)) {
                storeLastKey = reader.getLastKey();
              }
              reader.close();
            }
          }
        }
        currentRegionBoundariesInformation.metaFirstKey = regionInfo.getStartKey();
        currentRegionBoundariesInformation.metaLastKey = regionInfo.getEndKey();
        currentRegionBoundariesInformation.storesFirstKey = keyOnly(storeFirstKey);
        currentRegionBoundariesInformation.storesLastKey = keyOnly(storeLastKey);
        if (currentRegionBoundariesInformation.metaFirstKey.length == 0)
          currentRegionBoundariesInformation.metaFirstKey = null;
        if (currentRegionBoundariesInformation.metaLastKey.length == 0)
          currentRegionBoundariesInformation.metaLastKey = null;

        // For a region to be correct, we need the META start key to be smaller or
        // equal to the smallest start key from all the stores, and the end key from
        // META to be bigger than the last key from all the stores.  The first region's
        // start key is null and the last region's end key is null; some regions can be
        // empty and not have any store.
        boolean valid = true;
        // Checking start key.
        if ((currentRegionBoundariesInformation.storesFirstKey != null)
            && (currentRegionBoundariesInformation.metaFirstKey != null)) {
          valid = valid
              && comparator.compare(currentRegionBoundariesInformation.storesFirstKey,
                  currentRegionBoundariesInformation.metaFirstKey) >= 0;
        }
        // Checking stop key.
        if ((currentRegionBoundariesInformation.storesLastKey != null)
            && (currentRegionBoundariesInformation.metaLastKey != null)) {
          valid = valid
              && comparator.compare(currentRegionBoundariesInformation.storesLastKey,
                  currentRegionBoundariesInformation.metaLastKey) < 0;
        }
        if (!valid) {
          errors.reportError(ERROR_CODE.BOUNDARIES_ERROR, "Found issues with region boundaries",
              tablesInfo.get(regionInfo.getTable()));
          LOG.warn("Region's boundaries not aligned between stores and META for:");
          LOG.warn(currentRegionBoundariesInformation);
        }
      }
    } catch (IOException e) {
      LOG.error(e);
    }
  }
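
  /**
   * Iterates through the list of all orphan/invalid regiondirs.
   */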
  private void adoptHdfsOrphans(Collection<HbckInfo> orphanHdfsDirs) throws IOException {
    for (HbckInfo hi : orphanHdfsDirs) {
      LOG.info("Attempting to handle orphan hdfs dir: " + hi.getHdfsRegionDir());
      adoptHdfsOrphan(hi);
    }
  }
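
  /**
   * Orphaned regions are regions without a .regioninfo file in them.  We "adopt"
   * these orphans by creating a new region, and moving the column families,
   * recovered edits, WALs, into the new region dir.  We determine the region
   * startkey and endkeys by looking at all of the hfiles inside the column
   * families to identify the min and max keys. The resulting region will
   * likely violate table integrity but will be dealt with by merging
   * overlapping regions.
   */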
  @SuppressWarnings("deprecation")
  private void adoptHdfsOrphan(HbckInfo hi) throws IOException {
    Path p = hi.getHdfsRegionDir();
    FileSystem fs = p.getFileSystem(getConf());
    FileStatus[] dirs = fs.listStatus(p);
    if (dirs == null) {
      LOG.warn("Attempt to adopt orphan hdfs region skipped because no files present in " +
          p + ". This dir could probably be deleted.");
      return;
    }

    TableName tableName = hi.getTableName();
    TableInfo tableInfo = tablesInfo.get(tableName);
    Preconditions.checkNotNull(tableInfo, "Table '" + tableName + "' not present!");
    HTableDescriptor template = tableInfo.getHTD();

    // find min and max key values
    Pair<byte[],byte[]> orphanRegionRange = null;
    for (FileStatus cf : dirs) {
      String cfName = cf.getPath().getName();
      // skip special dirs like split logs
      if (cfName.startsWith(".") || cfName.equals(HConstants.SPLIT_LOGDIR_NAME)) continue;

      FileStatus[] hfiles = fs.listStatus(cf.getPath());
      for (FileStatus hfile : hfiles) {
        byte[] start, end;
        HFile.Reader hf = null;
        try {
          CacheConfig cacheConf = new CacheConfig(getConf());
          hf = HFile.createReader(fs, hfile.getPath(), cacheConf, getConf());
          hf.loadFileInfo();
          KeyValue startKv = KeyValue.createKeyValueFromKey(hf.getFirstKey());
          start = startKv.getRow();
          KeyValue endKv = KeyValue.createKeyValueFromKey(hf.getLastKey());
          end = endKv.getRow();
        } catch (IOException ioe) {
          LOG.warn("Problem reading orphan file " + hfile + ", skipping");
          continue;
        } catch (NullPointerException ioe) {
          LOG.warn("Orphan file " + hfile + " is possibly corrupted HFile, skipping");
          continue;
        } finally {
          if (hf != null) {
            hf.close();
          }
        }

        // expand the range to include the range of all hfiles
        if (orphanRegionRange == null) {
          // first range
          orphanRegionRange = new Pair<byte[], byte[]>(start, end);
        } else {
          // expand range only if the hfile is wider.
          if (Bytes.compareTo(orphanRegionRange.getFirst(), start) > 0) {
            orphanRegionRange.setFirst(start);
          }
          if (Bytes.compareTo(orphanRegionRange.getSecond(), end) < 0) {
            orphanRegionRange.setSecond(end);
          }
        }
      }
    }
    if (orphanRegionRange == null) {
      LOG.warn("No data in dir " + p + ", sidelining data");
      fixes++;
      sidelineRegionDir(fs, hi);
      return;
    }
    LOG.info("Min max keys are : [" + Bytes.toString(orphanRegionRange.getFirst()) + ", " +
        Bytes.toString(orphanRegionRange.getSecond()) + ")");

    // create new region on hdfs. move data into place.
    HRegionInfo hri = new HRegionInfo(template.getTableName(), orphanRegionRange.getFirst(),
        Bytes.add(orphanRegionRange.getSecond(), new byte[1]));
    LOG.info("Creating new region : " + hri);
    HRegion region = HBaseFsckRepair.createHDFSRegionDir(getConf(), hri, template);
    Path target = region.getRegionFileSystem().getRegionDir();

    // rename all the data to new region
    mergeRegionDirs(target, hi);
    fixes++;
  }
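
  /**
   * This method determines if there are table integrity errors in HDFS.  If
   * there are errors and the appropriate "fix" options are enabled, the method
   * will first correct orphan regions making them into legit regiondirs, and
   * then reload to merge potentially overlapping regions.
   *
   * @return number of table integrity errors found
   */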
  private int restoreHdfsIntegrity() throws IOException, InterruptedException {
    // Determine what's on HDFS
    LOG.info("Loading HBase regioninfo from HDFS...");
    loadHdfsRegionDirs(); // populating regioninfo table.

    int errs = errors.getErrorList().size();
    // First pass just gathers suggestions.
    tablesInfo = loadHdfsRegionInfos(); // update tableInfos based on region info in fs.
    checkHdfsIntegrity(false, false);

    if (errors.getErrorList().size() == errs) {
      LOG.info("No integrity errors. We are done with this phase. Glorious.");
      return 0;
    }

    if (shouldFixHdfsOrphans() && orphanHdfsDirs.size() > 0) {
      adoptHdfsOrphans(orphanHdfsDirs);
      // TODO optimize by incrementally adding instead of reloading.
    }

    // Make sure there are no holes now.
    if (shouldFixHdfsHoles()) {
      clearState(); // this also resets # fixes.
      loadHdfsRegionDirs();
      tablesInfo = loadHdfsRegionInfos();
      tablesInfo = checkHdfsIntegrity(shouldFixHdfsHoles(), false);
    }

    // Now we fix overlaps
    if (shouldFixHdfsOverlaps()) {
      // second pass we fix overlaps.
      clearState(); // this also resets # fixes.
      loadHdfsRegionDirs();
      tablesInfo = loadHdfsRegionInfos();
      tablesInfo = checkHdfsIntegrity(false, shouldFixHdfsOverlaps());
    }

    return errors.getErrorList().size();
  }
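
  /**
   * Scan all the store file names to find any lingering reference files,
   * which refer to some non-existing files. If "fix" option is enabled,
   * any lingering reference file will be sidelined if found.
   * <p>
   * Lingering reference file prevents a region from opening. It has to
   * be fixed before a cluster can start properly.
   */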
  private void offlineReferenceFileRepair() throws IOException {
    Configuration conf = getConf();
    Path hbaseRoot = FSUtils.getRootDir(conf);
    FileSystem fs = hbaseRoot.getFileSystem(conf);
    LOG.info("Computing mapping of all store files");
    Map<String, Path> allFiles = FSUtils.getTableStoreFilePathMap(fs, hbaseRoot, errors);
    errors.print("");
    LOG.info("Validating mapping using HDFS state");
    for (Path path: allFiles.values()) {
      boolean isReference = false;
      try {
        isReference = StoreFileInfo.isReference(path);
      } catch (Throwable t) {
        // Ignore. Some files may not be store files at all.
        // For example, files under .oldlogs folder in hbase:meta.
        // A warning message is already logged by StoreFileInfo#isReference.
      }
      if (!isReference) continue;

      Path referredToFile = StoreFileInfo.getReferredToFile(path);
      if (fs.exists(referredToFile)) continue; // good, expected

      // Found a lingering reference file
      errors.reportError(ERROR_CODE.LINGERING_REFERENCE_HFILE,
        "Found lingering reference file " + path);
      if (!shouldFixReferenceFiles()) continue;

      // Now, trying to fix it since requested
      boolean success = false;
      String pathStr = path.toString();

      // A reference file path should be like
      // ${hbase.rootdir}/data/namespace/table_name/region_id/family_name/referred_file.region_name
      // Up 5 directories to get the root folder, so the file will be sidelined
      // to a similar folder structure.
      int index = pathStr.lastIndexOf(Path.SEPARATOR_CHAR);
      for (int i = 0; index > 0 && i < 5; i++) {
        index = pathStr.lastIndexOf(Path.SEPARATOR_CHAR, index - 1);
      }
      if (index > 0) {
        Path rootDir = getSidelineDir();
        Path dst = new Path(rootDir, pathStr.substring(index + 1));
        fs.mkdirs(dst.getParent());
        LOG.info("Trying to sideline reference file "
          + path + " to " + dst);
        setShouldRerun();

        success = fs.rename(path, dst);
      }
      if (!success) {
        LOG.error("Failed to sideline reference file " + path);
      }
    }
  }
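
  /**
   * Report the rows found in hbase:meta with an empty REGIONINFO_QUALIFIER.
   */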
  private void reportEmptyMetaCells() {
    errors.print("Number of empty REGIONINFO_QUALIFIER rows in hbase:meta: " +
      emptyRegionInfoQualifiers.size());
    if (details) {
      for (Result r: emptyRegionInfoQualifiers) {
        errors.print("  " + r);
      }
    }
  }
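
  /**
   * Report the current set of tables, noting any that appear to be in flux.
   */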
  private void reportTablesInFlux() {
    AtomicInteger numSkipped = new AtomicInteger(0);
    HTableDescriptor[] allTables = getTables(numSkipped);
    errors.print("Number of Tables: " + allTables.length);
    if (details) {
      if (numSkipped.get() > 0) {
        errors.detail("Number of Tables in flux: " + numSkipped.get());
      }
      for (HTableDescriptor td : allTables) {
        errors.detail("  Table: " + td.getTableName() + "\t" +
            (td.isReadOnly() ? "ro" : "rw") + "\t" +
            (td.isMetaRegion() ? "META" : "    ") + "\t" +
            " families: " + td.getFamilies().size());
      }
    }
  }

  public ErrorReporter getErrors() {
    return errors;
  }
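
  /**
   * Read the .regioninfo file from the file system.  If there is no
   * .regioninfo, add it to the orphan hdfs region list.
   */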
  private void loadHdfsRegioninfo(HbckInfo hbi) throws IOException {
    Path regionDir = hbi.getHdfsRegionDir();
    if (regionDir == null) {
      if (hbi.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
        // Log warning only for default/primary replica with no region dir
        LOG.warn("No HDFS region dir found: " + hbi + " meta=" + hbi.metaEntry);
      }
      return;
    }

    if (hbi.hdfsEntry.hri != null) {
      // already loaded data
      return;
    }

    FileSystem fs = FileSystem.get(getConf());
    HRegionInfo hri = HRegionFileSystem.loadRegionInfoFileContent(fs, regionDir);
    LOG.debug("HRegionInfo read: " + hri.toString());
    hbi.hdfsEntry.hri = hri;
  }
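
  /**
   * Exception thrown when a region repair operation fails in an
   * unresolvable way.
   */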
  public static class RegionRepairException extends IOException {
    private static final long serialVersionUID = 1L;
    final IOException ioe;
    public RegionRepairException(String s, IOException ioe) {
      super(s);
      this.ioe = ioe;
    }
  }
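
  /**
   * Populate hbi's from regionInfos loaded from file system.
   */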
  private SortedMap<TableName, TableInfo> loadHdfsRegionInfos()
      throws IOException, InterruptedException {
    tablesInfo.clear(); // regenerating the data
    // generate region split structure
    Collection<HbckInfo> hbckInfos = regionInfoMap.values();

    // Parallelized read of .regioninfo files.
    List<WorkItemHdfsRegionInfo> hbis = new ArrayList<WorkItemHdfsRegionInfo>(hbckInfos.size());
    List<Future<Void>> hbiFutures;

    for (HbckInfo hbi : hbckInfos) {
      WorkItemHdfsRegionInfo work = new WorkItemHdfsRegionInfo(hbi, this, errors);
      hbis.add(work);
    }

    // Submit and wait for completion
    hbiFutures = executor.invokeAll(hbis);

    for (int i = 0; i < hbiFutures.size(); i++) {
      WorkItemHdfsRegionInfo work = hbis.get(i);
      Future<Void> f = hbiFutures.get(i);
      try {
        f.get();
      } catch (ExecutionException e) {
        LOG.warn("Failed to read .regioninfo file for region " +
            work.hbi.getRegionNameAsString(), e.getCause());
      }
    }

    Path hbaseRoot = FSUtils.getRootDir(getConf());
    FileSystem fs = hbaseRoot.getFileSystem(getConf());
    // serialized table info gathering.
    for (HbckInfo hbi: hbckInfos) {

      if (hbi.getHdfsHRI() == null) {
        // was an orphan
        continue;
      }

      // get table name from hdfs, populate various HBaseFsck tables.
      TableName tableName = hbi.getTableName();
      if (tableName == null) {
        // There was an entry in hbase:meta not in the HDFS?
        LOG.warn("tableName was null for: " + hbi);
        continue;
      }

      TableInfo modTInfo = tablesInfo.get(tableName);
      if (modTInfo == null) {
        // only executed once per table.
        modTInfo = new TableInfo(tableName);
        tablesInfo.put(tableName, modTInfo);
        try {
          HTableDescriptor htd =
              FSTableDescriptors.getTableDescriptorFromFs(fs, hbaseRoot, tableName);
          modTInfo.htds.add(htd);
        } catch (IOException ioe) {
          if (!orphanTableDirs.containsKey(tableName)) {
            LOG.warn("Unable to read .tableinfo from " + hbaseRoot, ioe);
            // should only report once for each table
            errors.reportError(ERROR_CODE.NO_TABLEINFO_FILE,
                "Unable to read .tableinfo from " + hbaseRoot + "/" + tableName);
            Set<String> columns = new HashSet<String>();
            orphanTableDirs.put(tableName, getColumnFamilyList(columns, hbi));
          }
        }
      }
      if (!hbi.isSkipChecks()) {
        modTInfo.addRegionInfo(hbi);
      }
    }

    loadTableInfosForTablesWithNoRegion();
    errors.print("");

    return tablesInfo;
  }
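
  /**
   * To get the column family list according to the column family dirs
   * @param columns set to fill with the column family names found
   * @param hbi the region whose directory is scanned
   * @return a set of column families
   * @throws IOException if listing the family dirs fails
   */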
  private Set<String> getColumnFamilyList(Set<String> columns, HbckInfo hbi) throws IOException {
    Path regionDir = hbi.getHdfsRegionDir();
    FileSystem fs = regionDir.getFileSystem(getConf());
    FileStatus[] subDirs = fs.listStatus(regionDir, new FSUtils.FamilyDirFilter(fs));
    for (FileStatus subdir : subDirs) {
      String columnfamily = subdir.getPath().getName();
      columns.add(columnfamily);
    }
    return columns;
  }
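
  /**
   * To fabricate a .tableinfo file with the following contents:
   * 1. the correct table name
   * 2. the correct column family list
   * 3. the default properties for both {@link HTableDescriptor} and {@link HColumnDescriptor}
   * @throws IOException if the descriptor cannot be written
   */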
  private boolean fabricateTableInfo(FSTableDescriptors fstd, TableName tableName,
      Set<String> columns) throws IOException {
    if (columns == null || columns.isEmpty()) return false;
    HTableDescriptor htd = new HTableDescriptor(tableName);
    for (String columnfamily : columns) {
      htd.addFamily(new HColumnDescriptor(columnfamily));
    }
    fstd.createTableDescriptor(htd, true);
    return true;
  }
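
  /**
   * To fix the empty REGIONINFO_QUALIFIER rows from hbase:meta
   * @throws IOException if the delete fails
   */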
  public void fixEmptyMetaCells() throws IOException {
    if (shouldFixEmptyMetaCells() && !emptyRegionInfoQualifiers.isEmpty()) {
      LOG.info("Trying to fix empty REGIONINFO_QUALIFIER hbase:meta rows.");
      for (Result region : emptyRegionInfoQualifiers) {
        deleteMetaRegion(region.getRow());
        errors.getErrorList().remove(ERROR_CODE.EMPTY_META_CELL);
      }
      emptyRegionInfoQualifiers.clear();
    }
  }
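
  /**
   * To fix an orphan table by creating a .tableinfo file under its tableDir:
   * 1. if the TableInfo is cached, recover the .tableinfo from it accordingly
   * 2. else create a default .tableinfo file with the correct table name, the
   *    correct column family list, and the default properties for both
   *    {@link HTableDescriptor} and {@link HColumnDescriptor}
   * @throws IOException if a descriptor cannot be written
   */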
  public void fixOrphanTables() throws IOException {
    if (shouldFixTableOrphans() && !orphanTableDirs.isEmpty()) {

      List<TableName> tmpList = new ArrayList<TableName>();
      tmpList.addAll(orphanTableDirs.keySet());
      HTableDescriptor[] htds = getHTableDescriptors(tmpList);
      Iterator<Entry<TableName, Set<String>>> iter =
          orphanTableDirs.entrySet().iterator();
      int j = 0;
      int numFailedCase = 0;
      FSTableDescriptors fstd = new FSTableDescriptors(getConf());
      while (iter.hasNext()) {
        Entry<TableName, Set<String>> entry =
            iter.next();
        TableName tableName = entry.getKey();
        LOG.info("Trying to fix orphan table error: " + tableName);
        if (j < htds.length) {
          if (tableName.equals(htds[j].getTableName())) {
            HTableDescriptor htd = htds[j];
            LOG.info("fixing orphan table: " + tableName + " from cache");
            fstd.createTableDescriptor(htd, true);
            j++;
            iter.remove();
          }
        } else {
          if (fabricateTableInfo(fstd, tableName, entry.getValue())) {
            LOG.warn("fixing orphan table: " + tableName + " with a default .tableinfo file");
            LOG.warn("Strongly recommend modifying the HTableDescriptor if necessary for: "
                + tableName);
            iter.remove();
          } else {
            LOG.error("Unable to create default .tableinfo for " + tableName
                + " while missing column family information");
            numFailedCase++;
          }
        }
        fixes++;
      }

      if (orphanTableDirs.isEmpty()) {
        // all orphanTableDirs are luckily recovered
        // re-run doFsck after recovering the .tableinfo file
        setShouldRerun();
        LOG.warn("Strongly recommend re-running hbck manually after all orphanTableDirs"
            + " have been fixed");
      } else if (numFailedCase > 0) {
        LOG.error("Failed to fix " + numFailedCase
            + " OrphanTables with default .tableinfo files");
      }

    }
    // cleanup the list
    orphanTableDirs.clear();

  }
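
  /**
   * Creates a new hbase:meta region on HDFS, borrowing the bootstrap logic from
   * MasterFileSystem.  Be sure to close the returned region when finished with it.
   *
   * @return an open hbase:meta HRegion
   */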
  private HRegion createNewMeta() throws IOException {
    Path rootdir = FSUtils.getRootDir(getConf());
    Configuration c = getConf();
    HRegionInfo metaHRI = new HRegionInfo(HRegionInfo.FIRST_META_REGIONINFO);
    HTableDescriptor metaDescriptor = new FSTableDescriptors(c).get(TableName.META_TABLE_NAME);
    MasterFileSystem.setInfoFamilyCachingForMeta(metaDescriptor, false);
    HRegion meta = HRegion.createHRegion(metaHRI, rootdir, c, metaDescriptor);
    MasterFileSystem.setInfoFamilyCachingForMeta(metaDescriptor, true);
    return meta;
  }
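
  /**
   * Generate set of puts to add to new meta.  This expects the tables to be
   * clean with no overlaps or holes.  If there are any problems it returns null.
   *
   * @return An array list of puts to do in bulk, null if tables have problems
   */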
  private ArrayList<Put> generatePuts(
      SortedMap<TableName, TableInfo> tablesInfo) throws IOException {
    ArrayList<Put> puts = new ArrayList<Put>();
    boolean hasProblems = false;
    for (Entry<TableName, TableInfo> e : tablesInfo.entrySet()) {
      TableName name = e.getKey();

      // skip "hbase:meta"
      if (name.compareTo(TableName.META_TABLE_NAME) == 0) {
        continue;
      }

      TableInfo ti = e.getValue();
      for (Entry<byte[], Collection<HbckInfo>> spl : ti.sc.getStarts().asMap()
          .entrySet()) {
        Collection<HbckInfo> his = spl.getValue();
        int sz = his.size();
        if (sz != 1) {
          // problem
          LOG.error("Split starting at " + Bytes.toStringBinary(spl.getKey())
              + " had " + sz + " regions instead of exactly 1." );
          hasProblems = true;
          continue;
        }

        // add the row directly to meta.
        HbckInfo hi = his.iterator().next();
        HRegionInfo hri = hi.getHdfsHRI();
        Put p = MetaTableAccessor.makePutFromRegionInfo(hri);
        puts.add(p);
      }
    }
    return hasProblems ? null : puts;
  }
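
  /**
   * Suggest fixes for each table
   */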
  private void suggestFixes(
      SortedMap<TableName, TableInfo> tablesInfo) throws IOException {
    logParallelMerge();
    for (TableInfo tInfo : tablesInfo.values()) {
      TableIntegrityErrorHandler handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
      tInfo.checkRegionChain(handler);
    }
  }
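
  /**
   * Rebuilds meta from information in hdfs/fs.  Depends on configuration settings passed into
   * hbck constructor to point to a particular fs/dir. Assumes HBase is OFFLINE.
   *
   * @param fix flag that determines if method should attempt to fix holes
   * @return true if successful, false if attempt failed.
   */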
  public boolean rebuildMeta(boolean fix) throws IOException,
      InterruptedException {

    // TODO check to make sure hbase is offline. (or at least the table
    // currently being worked on is offline)

    // Determine what's on HDFS
    LOG.info("Loading HBase regioninfo from HDFS...");
    loadHdfsRegionDirs();

    int errs = errors.getErrorList().size();
    tablesInfo = loadHdfsRegionInfos(); // update tableInfos based on region info in fs.
    checkHdfsIntegrity(false, false);

    // make sure ok.
    if (errors.getErrorList().size() != errs) {
      // While in an error state, iterate until no more fixes are possible
      while (true) {
        fixes = 0;
        suggestFixes(tablesInfo);
        errors.clear();
        loadHdfsRegionInfos(); // update tableInfos based on region info in fs.
        checkHdfsIntegrity(shouldFixHdfsHoles(), shouldFixHdfsOverlaps());

        int errCount = errors.getErrorList().size();

        if (fixes == 0) {
          if (errCount > 0) {
            return false; // failed to fix problems.
          } else {
            break; // no fixes and no problems? drop out and rebuild.
          }
        }
      }
    }

    // we can rebuild, move old meta out of the way and start
    LOG.info("HDFS regioninfos seem good. Sidelining old hbase:meta");
    Path backupDir = sidelineOldMeta();

    LOG.info("Creating new hbase:meta");
    HRegion meta = createNewMeta();

    // populate meta
    List<Put> puts = generatePuts(tablesInfo);
    if (puts == null) {
      LOG.fatal("Problem encountered when creating new hbase:meta entries. " +
          "You may need to restore the previously sidelined hbase:meta");
      return false;
    }
    meta.batchMutate(puts.toArray(new Put[puts.size()]));
    HRegion.closeHRegion(meta);
    LOG.info("Success! hbase:meta table rebuilt.");
    LOG.info("Old hbase:meta is moved into " + backupDir);
    return true;
  }
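
  /**
   * Log an appropriate message about whether or not overlapping merges are computed in parallel.
   */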
  private void logParallelMerge() {
    if (getConf().getBoolean("hbasefsck.overlap.merge.parallel", true)) {
      LOG.info("Handling overlap merges in parallel. Set hbasefsck.overlap.merge.parallel to" +
          " false to run serially.");
    } else {
      LOG.info("Handling overlap merges serially. Set hbasefsck.overlap.merge.parallel to" +
          " true to run in parallel.");
    }
  }

  private SortedMap<TableName, TableInfo> checkHdfsIntegrity(boolean fixHoles,
      boolean fixOverlaps) throws IOException {
    LOG.info("Checking HBase region split map from HDFS data...");
    logParallelMerge();
    for (TableInfo tInfo : tablesInfo.values()) {
      TableIntegrityErrorHandler handler;
      if (fixHoles || fixOverlaps) {
        handler = tInfo.new HDFSIntegrityFixer(tInfo, errors, getConf(),
          fixHoles, fixOverlaps);
      } else {
        handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
      }
      if (!tInfo.checkRegionChain(handler)) {
        // should dump info as well.
        errors.report("Found inconsistency in table " + tInfo.getName());
      }
    }
    return tablesInfo;
  }

  private Path getSidelineDir() throws IOException {
    if (sidelineDir == null) {
      Path hbaseDir = FSUtils.getRootDir(getConf());
      Path hbckDir = new Path(hbaseDir, HConstants.HBCK_SIDELINEDIR_NAME);
      sidelineDir = new Path(hbckDir, hbaseDir.getName() + "-"
        + startMillis);
    }
    return sidelineDir;
  }
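
  /**
   * Sideline a region dir (instead of deleting it)
   */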
  Path sidelineRegionDir(FileSystem fs, HbckInfo hi) throws IOException {
    return sidelineRegionDir(fs, null, hi);
  }
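
  /**
   * Sideline a region dir (instead of deleting it)
   *
   * @param parentDir if specified, the region will be sidelined to a folder like
   *     {@literal .../parentDir/<table name>/<region name>}. The purpose is to group together
   *     similar regions sidelined, for example, those regions that should be bulk loaded back
   *     later on. If NULL, it is ignored.
   */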
  Path sidelineRegionDir(FileSystem fs,
      String parentDir, HbckInfo hi) throws IOException {
    TableName tableName = hi.getTableName();
    Path regionDir = hi.getHdfsRegionDir();

    if (!fs.exists(regionDir)) {
      LOG.warn("No previous " + regionDir + " exists. Continuing.");
      return null;
    }

    Path rootDir = getSidelineDir();
    if (parentDir != null) {
      rootDir = new Path(rootDir, parentDir);
    }
    Path sidelineTableDir = FSUtils.getTableDir(rootDir, tableName);
    Path sidelineRegionDir = new Path(sidelineTableDir, regionDir.getName());
    fs.mkdirs(sidelineRegionDir);
    boolean success = false;
    FileStatus[] cfs = fs.listStatus(regionDir);
    if (cfs == null) {
      LOG.info("Region dir is empty: " + regionDir);
    } else {
      for (FileStatus cf : cfs) {
        Path src = cf.getPath();
        Path dst = new Path(sidelineRegionDir, src.getName());
        if (fs.isFile(src)) {
          // simple file
          success = fs.rename(src, dst);
          if (!success) {
            String msg = "Unable to rename file " + src + " to " + dst;
            LOG.error(msg);
            throw new IOException(msg);
          }
          continue;
        }

        // is a directory.
        fs.mkdirs(dst);

        LOG.info("Sidelining files from " + src + " into containing region " + dst);
        // FileSystem.rename is inconsistent with directories -- if the
        // dst (foo/a) exists and is a dir, and the src (foo/b) is a dir,
        // it moves the src into the dst dir resulting in (foo/a/b).  If
        // the dst does not exist, and the src a dir, src becomes dst. (foo/b)
        FileStatus[] hfiles = fs.listStatus(src);
        if (hfiles != null && hfiles.length > 0) {
          for (FileStatus hfile : hfiles) {
            success = fs.rename(hfile.getPath(), dst);
            if (!success) {
              String msg = "Unable to rename file " + src + " to " + dst;
              LOG.error(msg);
              throw new IOException(msg);
            }
          }
        }
        LOG.debug("Sideline directory contents:");
        debugLsr(sidelineRegionDir);
      }
    }

    LOG.info("Removing old region dir: " + regionDir);
    success = fs.delete(regionDir, true);
    if (!success) {
      String msg = "Unable to delete dir " + regionDir;
      LOG.error(msg);
      throw new IOException(msg);
    }
    return sidelineRegionDir;
  }
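
  /**
   * Side line an entire table.
   */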
  void sidelineTable(FileSystem fs, TableName tableName, Path hbaseDir,
      Path backupHbaseDir) throws IOException {
    Path tableDir = FSUtils.getTableDir(hbaseDir, tableName);
    if (fs.exists(tableDir)) {
      Path backupTableDir = FSUtils.getTableDir(backupHbaseDir, tableName);
      fs.mkdirs(backupTableDir.getParent());
      boolean success = fs.rename(tableDir, backupTableDir);
      if (!success) {
        throw new IOException("Failed to move " + tableName + " from "
            + tableDir + " to " + backupTableDir);
      }
    } else {
      LOG.info("No previous " + tableName + " exists. Continuing.");
    }
  }
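
  /**
   * Sideline the current hbase:meta out of the way.
   * @return Path to the backup of the original directory
   */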
  Path sidelineOldMeta() throws IOException {
    // put current hbase:meta aside.
    Path hbaseDir = FSUtils.getRootDir(getConf());
    FileSystem fs = hbaseDir.getFileSystem(getConf());
    Path backupDir = getSidelineDir();
    fs.mkdirs(backupDir);

    try {
      sidelineTable(fs, TableName.META_TABLE_NAME, hbaseDir, backupDir);
    } catch (IOException e) {
      LOG.fatal("... failed to sideline meta. Currently in inconsistent state. To restore "
          + "try to rename hbase:meta in " + backupDir.getName() + " to "
          + hbaseDir.getName() + ".", e);
      throw e; // throw original exception
    }
    return backupDir;
  }
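
  /**
   * Load the list of disabled tables in ZK into local set.
   * @throws ZooKeeperConnectionException if unable to connect to ZooKeeper
   * @throws IOException if reading the table states fails
   */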
  private void loadDisabledTables()
      throws ZooKeeperConnectionException, IOException {
    HConnectionManager.execute(new HConnectable<Void>(getConf()) {
      @Override
      public Void connect(HConnection connection) throws IOException {
        ZooKeeperWatcher zkw = createZooKeeperWatcher();
        try {
          for (TableName tableName :
              ZKTableStateClientSideReader.getDisabledOrDisablingTables(zkw)) {
            disabledTables.add(tableName);
          }
        } catch (KeeperException ke) {
          throw new IOException(ke);
        } catch (InterruptedException e) {
          throw new InterruptedIOException();
        } finally {
          zkw.close();
        }
        return null;
      }
    });
  }
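
  /**
   * Check if the specified region's table is disabled.
   */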
  private boolean isTableDisabled(HRegionInfo regionInfo) {
    return disabledTables.contains(regionInfo.getTable());
  }
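
  /**
   * Scan HDFS for all regions, recording their information into
   * regionInfoMap.
   */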
  public void loadHdfsRegionDirs() throws IOException, InterruptedException {
    Path rootDir = FSUtils.getRootDir(getConf());
    FileSystem fs = rootDir.getFileSystem(getConf());

    // list all tables from HDFS
    List<FileStatus> tableDirs = Lists.newArrayList();

    boolean foundVersionFile = fs.exists(new Path(rootDir, HConstants.VERSION_FILE_NAME));

    List<Path> paths = FSUtils.getTableDirs(fs, rootDir);
    for (Path path : paths) {
      TableName tableName = FSUtils.getTableName(path);
      if ((!checkMetaOnly &&
          isTableIncluded(tableName)) ||
          tableName.equals(TableName.META_TABLE_NAME)) {
        tableDirs.add(fs.getFileStatus(path));
      }
    }

    // verify that version file exists
    if (!foundVersionFile) {
      errors.reportError(ERROR_CODE.NO_VERSION_FILE,
          "Version file does not exist in root dir " + rootDir);
      if (shouldFixVersionFile()) {
        LOG.info("Trying to create a new " + HConstants.VERSION_FILE_NAME
            + " file.");
        setShouldRerun();
        FSUtils.setVersion(fs, rootDir, getConf().getInt(
            HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000), getConf().getInt(
            HConstants.VERSION_FILE_WRITE_ATTEMPTS,
            HConstants.DEFAULT_VERSION_FILE_WRITE_ATTEMPTS));
      }
    }

    // Scan the collected table dirs in parallel, one work item per table.
    List<WorkItemHdfsDir> dirs = new ArrayList<WorkItemHdfsDir>(tableDirs.size());
    List<Future<Void>> dirsFutures;

    for (FileStatus tableDir : tableDirs) {
      LOG.debug("Loading region dirs from " + tableDir.getPath());
      dirs.add(new WorkItemHdfsDir(this, fs, errors, tableDir));
    }

    // Invoke and wait for Callables to complete
    dirsFutures = executor.invokeAll(dirs);

    for (Future<Void> f : dirsFutures) {
      try {
        f.get();
      } catch (ExecutionException e) {
        LOG.warn("Could not load region dir ", e.getCause());
      }
    }
    errors.print("");
  }
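
  /**
   * Record the location of the hbase:meta region as found in ZooKeeper.
   */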
  private boolean recordMetaRegion() throws IOException {
    RegionLocations rl = ((ClusterConnection)connection).locateRegion(TableName.META_TABLE_NAME,
        HConstants.EMPTY_START_ROW, false, false);
    if (rl == null) {
      errors.reportError(ERROR_CODE.NULL_META_REGION,
          "META region or some of its attributes are null.");
      return false;
    }
    for (HRegionLocation metaLocation : rl.getRegionLocations()) {
      // Check if Meta region is valid and existing
      if (metaLocation == null || metaLocation.getRegionInfo() == null ||
          metaLocation.getHostname() == null) {
        errors.reportError(ERROR_CODE.NULL_META_REGION,
            "META region or some of its attributes are null.");
        return false;
      }
      ServerName sn = metaLocation.getServerName();
      MetaEntry m = new MetaEntry(metaLocation.getRegionInfo(), sn,
          EnvironmentEdgeManager.currentTime());
      HbckInfo hbckInfo = regionInfoMap.get(metaLocation.getRegionInfo().getEncodedName());
      if (hbckInfo == null) {
        regionInfoMap.put(metaLocation.getRegionInfo().getEncodedName(), new HbckInfo(m));
      } else {
        hbckInfo.metaEntry = m;
      }
    }
    return true;
  }

  private ZooKeeperWatcher createZooKeeperWatcher() throws IOException {
    return new ZooKeeperWatcher(getConf(), "hbase Fsck", new Abortable() {
      @Override
      public void abort(String why, Throwable e) {
        LOG.error(why, e);
        System.exit(1);
      }

      @Override
      public boolean isAborted() {
        return false;
      }

    });
  }

  private ServerName getMetaRegionServerName(int replicaId)
      throws IOException, KeeperException {
    ZooKeeperWatcher zkw = createZooKeeperWatcher();
    ServerName sn = null;
    try {
      sn = new MetaTableLocator().getMetaRegionLocation(zkw, replicaId);
    } finally {
      zkw.close();
    }
    return sn;
  }
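
  /**
   * Contacts each regionserver and fetches metadata about regions.
   * @param regionServerList - the list of region servers to connect to
   * @throws IOException if a remote or network exception occurs
   */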
  void processRegionServers(Collection<ServerName> regionServerList)
      throws IOException, InterruptedException {

    List<WorkItemRegion> workItems = new ArrayList<WorkItemRegion>(regionServerList.size());
    List<Future<Void>> workFutures;

    // loop to contact each region server in parallel
    for (ServerName rsinfo: regionServerList) {
      workItems.add(new WorkItemRegion(this, rsinfo, errors, connection));
    }

    workFutures = executor.invokeAll(workItems);

    for (int i = 0; i < workFutures.size(); i++) {
      WorkItemRegion item = workItems.get(i);
      Future<Void> f = workFutures.get(i);
      try {
        f.get();
      } catch (ExecutionException e) {
        LOG.warn("Could not process regionserver " + item.rsinfo.getHostAndPort(),
            e.getCause());
      }
    }
  }
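
  /**
   * Check consistency of all regions that have been found in previous phases.
   */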
  private void checkAndFixConsistency()
      throws IOException, KeeperException, InterruptedException {
    // Divide the checks in two phases. One for default/primary replicas and another
    // for the non-primary ones. Keeps code cleaner this way.
    List<CheckRegionConsistencyWorkItem> workItems =
        new ArrayList<CheckRegionConsistencyWorkItem>(regionInfoMap.size());
    for (java.util.Map.Entry<String, HbckInfo> e: regionInfoMap.entrySet()) {
      if (e.getValue().getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
        workItems.add(new CheckRegionConsistencyWorkItem(e.getKey(), e.getValue()));
      }
    }
    checkRegionConsistencyConcurrently(workItems);

    boolean prevHdfsCheck = shouldCheckHdfs();
    setCheckHdfs(false);
    // Replicas get the same info as the primary, so skip the HDFS check for them
    // in this second pass.
    List<CheckRegionConsistencyWorkItem> replicaWorkItems =
        new ArrayList<CheckRegionConsistencyWorkItem>(regionInfoMap.size());
    for (java.util.Map.Entry<String, HbckInfo> e: regionInfoMap.entrySet()) {
      if (e.getValue().getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) {
        replicaWorkItems.add(new CheckRegionConsistencyWorkItem(e.getKey(), e.getValue()));
      }
    }
    checkRegionConsistencyConcurrently(replicaWorkItems);
    setCheckHdfs(prevHdfsCheck);

    // If some regions were skipped during checkRegionConsistencyConcurrently(), we might
    // not get an accurate picture of the cluster by continuing. The config here allows
    // users to tune the tolerated number of skipped regions.
    int terminateThreshold = getConf().getInt("hbase.hbck.skipped.regions.limit", 0);
    int numOfSkippedRegions = skippedRegions.size();
    if (numOfSkippedRegions > 0 && numOfSkippedRegions > terminateThreshold) {
      throw new IOException(numOfSkippedRegions
          + " region(s) could not be checked or repaired. See logs for detail.");
    }
  }
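
  /**
   * Check consistency of all regions using multiple threads concurrently.
   */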
  private void checkRegionConsistencyConcurrently(
      final List<CheckRegionConsistencyWorkItem> workItems)
      throws IOException, KeeperException, InterruptedException {
    if (workItems.isEmpty()) {
      return;  // nothing to check
    }

    List<Future<Void>> workFutures = executor.invokeAll(workItems);
    for (Future<Void> f: workFutures) {
      try {
        f.get();
      } catch (ExecutionException e1) {
        LOG.warn("Could not check region consistency ", e1.getCause());
        if (e1.getCause() instanceof IOException) {
          throw (IOException) e1.getCause();
        } else if (e1.getCause() instanceof KeeperException) {
          throw (KeeperException) e1.getCause();
        } else if (e1.getCause() instanceof InterruptedException) {
          throw (InterruptedException) e1.getCause();
        } else {
          throw new IOException(e1.getCause());
        }
      }
    }
  }

  class CheckRegionConsistencyWorkItem implements Callable<Void> {
    private final String key;
    private final HbckInfo hbi;

    CheckRegionConsistencyWorkItem(String key, HbckInfo hbi) {
      this.key = key;
      this.hbi = hbi;
    }

    @Override
    public synchronized Void call() throws Exception {
      try {
        checkRegionConsistency(key, hbi);
      } catch (Exception e) {
        // If the region is a non-META region, skip this region and send a warning/error
        // message; if the region is the META region, we should not continue.
        LOG.warn("Unable to complete check or repair the region '" + hbi.getRegionNameAsString()
            + "'.", e);
        if (hbi.getHdfsHRI().isMetaRegion()) {
          throw e;
        }
        LOG.warn("Skip region '" + hbi.getRegionNameAsString() + "'");
        addSkippedRegion(hbi);
      }
      return null;
    }
  }

  private void addSkippedRegion(final HbckInfo hbi) {
    Set<String> skippedRegionNames = skippedRegions.get(hbi.getTableName());
    if (skippedRegionNames == null) {
      skippedRegionNames = new HashSet<String>();
    }
    skippedRegionNames.add(hbi.getRegionNameAsString());
    skippedRegions.put(hbi.getTableName(), skippedRegionNames);
  }

  private void preCheckPermission() throws IOException, AccessDeniedException {
    if (shouldIgnorePreCheckPermission()) {
      return;
    }

    Path hbaseDir = FSUtils.getRootDir(getConf());
    FileSystem fs = hbaseDir.getFileSystem(getConf());
    UserProvider userProvider = UserProvider.instantiate(getConf());
    UserGroupInformation ugi = userProvider.getCurrent().getUGI();
    FileStatus[] files = fs.listStatus(hbaseDir);
    for (FileStatus file : files) {
      try {
        FSUtils.checkAccess(ugi, file, FsAction.WRITE);
      } catch (AccessDeniedException ace) {
        LOG.warn("Got AccessDeniedException when preCheckPermission ", ace);
        errors.reportError(ERROR_CODE.WRONG_USAGE, "Current user " + ugi.getUserName()
            + " does not have write perms to " + file.getPath()
            + ". Please rerun hbck as hdfs user " + file.getOwner());
        throw ace;
      }
    }
  }
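
  /**
   * Deletes region from meta table
   */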
  private void deleteMetaRegion(HbckInfo hi) throws IOException {
    deleteMetaRegion(hi.metaEntry.getRegionName());
  }
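
  /**
   * Deletes the row with the given key from the meta table
   */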
  private void deleteMetaRegion(byte[] metaKey) throws IOException {
    Delete d = new Delete(metaKey);
    meta.delete(d);
    LOG.info("Deleted " + Bytes.toString(metaKey) + " from META" );
  }
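
  /**
   * Reset the split parent region info in meta table
   */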
1947 private void resetSplitParent(HbckInfo hi) throws IOException {
1948 RowMutations mutations = new RowMutations(hi.metaEntry.getRegionName());
1949 Delete d = new Delete(hi.metaEntry.getRegionName());
1950 d.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.SPLITA_QUALIFIER);
1951 d.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.SPLITB_QUALIFIER);
1952 mutations.add(d);
1953
1954 HRegionInfo hri = new HRegionInfo(hi.metaEntry);
1955 hri.setOffline(false);
1956 hri.setSplit(false);
1957 Put p = MetaTableAccessor.makePutFromRegionInfo(hri);
1958 mutations.add(p);
1959
1960 meta.mutateRow(mutations);
1961 LOG.info("Reset split parent " + hi.metaEntry.getRegionNameAsString() + " in META" );
1962 }
1963
1964
1965
1966
1967
1968
1969
1970
1971
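  /**
   * Permanently offline a region that should not be alive. If the master does not
   * support the offline RPC, silently falls back to unassign; the HMaster should
   * be restarted after such repairs.
   */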
1972 private void offline(byte[] regionName) throws IOException {
1973 String regionString = Bytes.toStringBinary(regionName);
1974 if (!rsSupportsOffline) {
1975 LOG.warn("Using unassign region " + regionString
1976 + " instead of using offline method, you should"
1977 + " restart HMaster after these repairs");
1978 admin.unassign(regionName, true);
1979 return;
1980 }
1981
1982
1983 try {
1984 LOG.info("Offlining region " + regionString);
1985 admin.offline(regionName);
1986 } catch (IOException ioe) {
1987 String notFoundMsg = "java.lang.NoSuchMethodException: " +
1988 "org.apache.hadoop.hbase.master.HMaster.offline([B)";
1989 if (ioe.getMessage().contains(notFoundMsg)) {
1990 LOG.warn("Using unassign region " + regionString
1991 + " instead of using offline method, you should"
1992 + " restart HMaster after these repairs");
1993 rsSupportsOffline = false;
1994 admin.unassign(regionName, true);
1995 return;
1996 }
1997 throw ioe;
1998 }
1999 }
2000
2001 private void undeployRegions(HbckInfo hi) throws IOException, InterruptedException {
2002 undeployRegionsForHbi(hi);
2003
2004 if (hi.getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) {
2005 return;
2006 }
2007 int numReplicas = admin.getTableDescriptor(hi.getTableName()).getRegionReplication();
2008 for (int i = 1; i < numReplicas; i++) {
2009 if (hi.getPrimaryHRIForDeployedReplica() == null) continue;
2010 HRegionInfo hri = RegionReplicaUtil.getRegionInfoForReplica(
2011 hi.getPrimaryHRIForDeployedReplica(), i);
2012 HbckInfo h = regionInfoMap.get(hri.getEncodedName());
2013 if (h != null) {
2014 undeployRegionsForHbi(h);
2015
2016
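        // The replica was undeployed above; no need to check it again.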
2017 h.setSkipChecks(true);
2018 }
2019 }
2020 }
2021
2022 private void undeployRegionsForHbi(HbckInfo hi) throws IOException, InterruptedException {
2023 for (OnlineEntry rse : hi.deployedEntries) {
2024 LOG.debug("Undeploy region " + rse.hri + " from " + rse.hsa);
2025 try {
2026 HBaseFsckRepair.closeRegionSilentlyAndWait(connection, rse.hsa, rse.hri);
2027 offline(rse.hri.getRegionName());
2028 } catch (IOException ioe) {
2029 LOG.warn("Got exception when attempting to offline region "
2030 + Bytes.toString(rse.hri.getRegionName()), ioe);
2031 }
2032 }
2033 }
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
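  /**
   * Attempts to undeploy a region from a region server based on the information
   * in hbase:meta. Operations that modify the file system should first make sure
   * the region is not deployed, to prevent data races.
   */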
2047 private void closeRegion(HbckInfo hi) throws IOException, InterruptedException {
2048 if (hi.metaEntry == null && hi.hdfsEntry == null) {
2049 undeployRegions(hi);
2050 return;
2051 }
2052
2053
2054 Get get = new Get(hi.getRegionName());
2055 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
2056 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
2057 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER);
2058
2059 if (hi.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
2060 int numReplicas = admin.getTableDescriptor(hi.getTableName()).getRegionReplication();
2061 for (int i = 0; i < numReplicas; i++) {
2062 get.addColumn(HConstants.CATALOG_FAMILY, MetaTableAccessor.getServerColumn(i));
2063 get.addColumn(HConstants.CATALOG_FAMILY, MetaTableAccessor.getStartCodeColumn(i));
2064 }
2065 }
2066 Result r = meta.get(get);
2067 RegionLocations rl = MetaTableAccessor.getRegionLocations(r);
2068 if (rl == null) {
2069 LOG.warn("Unable to close region " + hi.getRegionNameAsString() +
2070 " since meta does not have handle to reach it");
2071 return;
2072 }
2073 for (HRegionLocation h : rl.getRegionLocations()) {
2074 ServerName serverName = h.getServerName();
2075 if (serverName == null) {
2076 errors.reportError("Unable to close region "
2077 + hi.getRegionNameAsString() + " because meta does not "
2078 + "have handle to reach it.");
2079 continue;
2080 }
2081 HRegionInfo hri = h.getRegionInfo();
2082 if (hri == null) {
2083 LOG.warn("Unable to close region " + hi.getRegionNameAsString()
2084 + " because hbase:meta had invalid or missing "
2085 + HConstants.CATALOG_FAMILY_STR + ":"
2086 + Bytes.toString(HConstants.REGIONINFO_QUALIFIER)
2087 + " qualifier value.");
2088 continue;
2089 }
2090
2091 HBaseFsckRepair.closeRegionSilentlyAndWait(connection, serverName, hri);
2092 }
2093 }
2094
2095 private void tryAssignmentRepair(HbckInfo hbi, String msg) throws IOException,
2096 KeeperException, InterruptedException {
2097
2098 if (shouldFixAssignments()) {
2099 errors.print(msg);
2100 undeployRegions(hbi);
2101 setShouldRerun();
2102 HRegionInfo hri = hbi.getHdfsHRI();
2103 if (hri == null) {
2104 hri = hbi.metaEntry;
2105 }
2106 HBaseFsckRepair.fixUnassigned(admin, hri);
2107 HBaseFsckRepair.waitUntilAssigned(admin, hri);
2108
2109
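      // Also repair the replicas, but only when invoked on the primary replica.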
2110 if (hbi.getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) return;
2111 int replicationCount = admin.getTableDescriptor(hri.getTable()).getRegionReplication();
2112 for (int i = 1; i < replicationCount; i++) {
2113 hri = RegionReplicaUtil.getRegionInfoForReplica(hri, i);
2114 HbckInfo h = regionInfoMap.get(hri.getEncodedName());
2115 if (h != null) {
2116 undeployRegions(h);
2117
2118
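          // The replica was undeployed above; no need to check it again.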
2119 h.setSkipChecks(true);
2120 }
2121 HBaseFsckRepair.fixUnassigned(admin, hri);
2122 HBaseFsckRepair.waitUntilAssigned(admin, hri);
2123 }
2124
2125 }
2126 }
2127
2128
2129
2130
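  /**
   * Check a single region for consistency between hbase:meta, HDFS and its
   * deployment on region servers, applying the enabled fixes where possible.
   */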
2131 private void checkRegionConsistency(final String key, final HbckInfo hbi)
2132 throws IOException, KeeperException, InterruptedException {
2133
2134 if (hbi.isSkipChecks()) return;
2135 String descriptiveName = hbi.toString();
2136 boolean inMeta = hbi.metaEntry != null;
2137
2138 boolean inHdfs = !shouldCheckHdfs() || hbi.getHdfsRegionDir() != null;
2139 boolean hasMetaAssignment = inMeta && hbi.metaEntry.regionServer != null;
2140 boolean isDeployed = !hbi.deployedOn.isEmpty();
2141 boolean isMultiplyDeployed = hbi.deployedOn.size() > 1;
2142 boolean deploymentMatchesMeta =
2143 hasMetaAssignment && isDeployed && !isMultiplyDeployed &&
2144 hbi.metaEntry.regionServer.equals(hbi.deployedOn.get(0));
2145 boolean splitParent =
        hbi.metaEntry != null && hbi.metaEntry.isSplit() && hbi.metaEntry.isOffline();
2147 boolean shouldBeDeployed = inMeta && !isTableDisabled(hbi.metaEntry);
2148 boolean recentlyModified = inHdfs &&
2149 hbi.getModTime() + timelag > EnvironmentEdgeManager.currentTime();
2150
2151
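    // First the cases that are consistent or need no repair.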
2152 if (hbi.containsOnlyHdfsEdits()) {
2153 return;
2154 }
2155 if (inMeta && inHdfs && isDeployed && deploymentMatchesMeta && shouldBeDeployed) {
2156 return;
2157 } else if (inMeta && inHdfs && !shouldBeDeployed && !isDeployed) {
2158 LOG.info("Region " + descriptiveName + " is in META, and in a disabled " +
2159 "tabled that is not deployed");
2160 return;
2161 } else if (recentlyModified) {
2162 LOG.warn("Region " + descriptiveName + " was recently modified -- skipping");
2163 return;
2164 }
2165
2166 else if (!inMeta && !inHdfs && !isDeployed) {
2167
2168 assert false : "Entry for region with no data";
2169 } else if (!inMeta && !inHdfs && isDeployed) {
2170 errors.reportError(ERROR_CODE.NOT_IN_META_HDFS, "Region "
2171 + descriptiveName + ", key=" + key + ", not on HDFS or in hbase:meta but " +
2172 "deployed on " + Joiner.on(", ").join(hbi.deployedOn));
2173 if (shouldFixAssignments()) {
2174 undeployRegions(hbi);
2175 }
2176
2177 } else if (!inMeta && inHdfs && !isDeployed) {
2178 if (hbi.isMerged()) {
2179
2180
2181 hbi.setSkipChecks(true);
2182 LOG.info("Region " + descriptiveName
2183 + " got merge recently, its file(s) will be cleaned by CatalogJanitor later");
2184 return;
2185 }
2186 errors.reportError(ERROR_CODE.NOT_IN_META_OR_DEPLOYED, "Region "
2187 + descriptiveName + " on HDFS, but not listed in hbase:meta " +
2188 "or deployed on any region server");
2189
2190 if (shouldFixMeta()) {
2191 if (!hbi.isHdfsRegioninfoPresent()) {
2192 LOG.error("Region " + hbi.getHdfsHRI() + " could have been repaired"
2193 + " in table integrity repair phase if -fixHdfsOrphans was" +
2194 " used.");
2195 return;
2196 }
2197
2198 HRegionInfo hri = hbi.getHdfsHRI();
2199 TableInfo tableInfo = tablesInfo.get(hri.getTable());
2200
2201 for (HRegionInfo region : tableInfo.getRegionsFromMeta()) {
2202 if (Bytes.compareTo(region.getStartKey(), hri.getStartKey()) <= 0
2203 && (region.getEndKey().length == 0 || Bytes.compareTo(region.getEndKey(),
2204 hri.getEndKey()) >= 0)
2205 && Bytes.compareTo(region.getStartKey(), hri.getEndKey()) <= 0) {
            if (region.isSplit() || region.isOffline()) continue;
2207 Path regionDir = hbi.getHdfsRegionDir();
2208 FileSystem fs = regionDir.getFileSystem(getConf());
2209 List<Path> familyDirs = FSUtils.getFamilyDirs(fs, regionDir);
2210 for (Path familyDir : familyDirs) {
2211 List<Path> referenceFilePaths = FSUtils.getReferenceFilePaths(fs, familyDir);
2212 for (Path referenceFilePath : referenceFilePaths) {
2213 Path parentRegionDir =
2214 StoreFileInfo.getReferredToFile(referenceFilePath).getParent().getParent();
2215 if (parentRegionDir.toString().endsWith(region.getEncodedName())) {
2216 LOG.warn(hri + " start and stop keys are in the range of " + region
2217 + ". The region might not be cleaned up from hdfs when region " + region
2218 + " split failed. Hence deleting from hdfs.");
2219 HRegionFileSystem.deleteRegionFromFileSystem(getConf(), fs,
2220 regionDir.getParent(), hri);
2221 return;
2222 }
2223 }
2224 }
2225 }
2226 }
2227
2228 LOG.info("Patching hbase:meta with .regioninfo: " + hbi.getHdfsHRI());
2229 int numReplicas = admin.getTableDescriptor(hbi.getTableName()).getRegionReplication();
2230 HBaseFsckRepair.fixMetaHoleOnlineAndAddReplicas(getConf(), hbi.getHdfsHRI(),
2231 admin.getClusterStatus().getServers(), numReplicas);
2232
2233 tryAssignmentRepair(hbi, "Trying to reassign region...");
2234 }
2235
2236 } else if (!inMeta && inHdfs && isDeployed) {
2237 errors.reportError(ERROR_CODE.NOT_IN_META, "Region " + descriptiveName
2238 + " not in META, but deployed on " + Joiner.on(", ").join(hbi.deployedOn));
2239 debugLsr(hbi.getHdfsRegionDir());
2240 if (hbi.getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) {
2241
2242
2243
2244
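        // This is a replica region; meta repair is driven from the primary, so
        // only undeploy the replica here if assignment fixing is enabled.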
2245 if (shouldFixAssignments()) {
2246 undeployRegionsForHbi(hbi);
2247 }
2248 }
2249 if (shouldFixMeta() && hbi.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
2250 if (!hbi.isHdfsRegioninfoPresent()) {
2251 LOG.error("This should have been repaired in table integrity repair phase");
2252 return;
2253 }
2254
2255 LOG.info("Patching hbase:meta with with .regioninfo: " + hbi.getHdfsHRI());
2256 int numReplicas = admin.getTableDescriptor(hbi.getTableName()).getRegionReplication();
2257 HBaseFsckRepair.fixMetaHoleOnlineAndAddReplicas(getConf(), hbi.getHdfsHRI(),
2258 admin.getClusterStatus().getServers(), numReplicas);
2259 tryAssignmentRepair(hbi, "Trying to fix unassigned region...");
2260 }
2261
2262
2263 } else if (inMeta && inHdfs && !isDeployed && splitParent) {
2264
2265
2266 if (hbi.metaEntry.splitA != null && hbi.metaEntry.splitB != null) {
2267
2268 HbckInfo infoA = this.regionInfoMap.get(hbi.metaEntry.splitA.getEncodedName());
2269 HbckInfo infoB = this.regionInfoMap.get(hbi.metaEntry.splitB.getEncodedName());
2270 if (infoA != null && infoB != null) {
2271
2272 hbi.setSkipChecks(true);
2273 return;
2274 }
2275 }
2276 errors.reportError(ERROR_CODE.LINGERING_SPLIT_PARENT, "Region "
2277 + descriptiveName + " is a split parent in META, in HDFS, "
2278 + "and not deployed on any region server. This could be transient.");
2279 if (shouldFixSplitParents()) {
2280 setShouldRerun();
2281 resetSplitParent(hbi);
2282 }
2283 } else if (inMeta && !inHdfs && !isDeployed) {
2284 errors.reportError(ERROR_CODE.NOT_IN_HDFS_OR_DEPLOYED, "Region "
2285 + descriptiveName + " found in META, but not in HDFS "
2286 + "or deployed on any region server.");
2287 if (shouldFixMeta()) {
2288 deleteMetaRegion(hbi);
2289 }
2290 } else if (inMeta && !inHdfs && isDeployed) {
2291 errors.reportError(ERROR_CODE.NOT_IN_HDFS, "Region " + descriptiveName
2292 + " found in META, but not in HDFS, " +
2293 "and deployed on " + Joiner.on(", ").join(hbi.deployedOn));
2294
2295
2296
2297 if (shouldFixAssignments()) {
2298 errors.print("Trying to fix unassigned region...");
2299 undeployRegions(hbi);
2300 }
2301 if (shouldFixMeta()) {
2302
2303 deleteMetaRegion(hbi);
2304 }
2305 } else if (inMeta && inHdfs && !isDeployed && shouldBeDeployed) {
2306 errors.reportError(ERROR_CODE.NOT_DEPLOYED, "Region " + descriptiveName
2307 + " not deployed on any region server.");
2308 tryAssignmentRepair(hbi, "Trying to fix unassigned region...");
2309 } else if (inMeta && inHdfs && isDeployed && !shouldBeDeployed) {
2310 errors.reportError(ERROR_CODE.SHOULD_NOT_BE_DEPLOYED,
2311 "Region " + descriptiveName + " should not be deployed according " +
2312 "to META, but is deployed on " + Joiner.on(", ").join(hbi.deployedOn));
2313 if (shouldFixAssignments()) {
2314 errors.print("Trying to close the region " + descriptiveName);
2315 setShouldRerun();
2316 HBaseFsckRepair.fixMultiAssignment(connection, hbi.metaEntry, hbi.deployedOn);
2317 }
2318 } else if (inMeta && inHdfs && isMultiplyDeployed) {
2319 errors.reportError(ERROR_CODE.MULTI_DEPLOYED, "Region " + descriptiveName
2320 + " is listed in hbase:meta on region server " + hbi.metaEntry.regionServer
2321 + " but is multiply assigned to region servers " +
2322 Joiner.on(", ").join(hbi.deployedOn));
2323
2324 if (shouldFixAssignments()) {
2325 errors.print("Trying to fix assignment error...");
2326 setShouldRerun();
2327 HBaseFsckRepair.fixMultiAssignment(connection, hbi.metaEntry, hbi.deployedOn);
2328 }
2329 } else if (inMeta && inHdfs && isDeployed && !deploymentMatchesMeta) {
2330 errors.reportError(ERROR_CODE.SERVER_DOES_NOT_MATCH_META, "Region "
2331 + descriptiveName + " listed in hbase:meta on region server " +
2332 hbi.metaEntry.regionServer + " but found on region server " +
2333 hbi.deployedOn.get(0));
2334
2335 if (shouldFixAssignments()) {
2336 errors.print("Trying to fix assignment error...");
2337 setShouldRerun();
2338 HBaseFsckRepair.fixMultiAssignment(connection, hbi.metaEntry, hbi.deployedOn);
2339 HBaseFsckRepair.waitUntilAssigned(admin, hbi.getHdfsHRI());
2340 }
2341 } else {
2342 errors.reportError(ERROR_CODE.UNKNOWN, "Region " + descriptiveName +
2343 " is in an unforeseen state:" +
2344 " inMeta=" + inMeta +
2345 " inHdfs=" + inHdfs +
2346 " isDeployed=" + isDeployed +
2347 " isMultiplyDeployed=" + isMultiplyDeployed +
2348 " deploymentMatchesMeta=" + deploymentMatchesMeta +
2349 " shouldBeDeployed=" + shouldBeDeployed);
2350 }
2351 }
2352
2353
2354
2355
2356
2357
2358
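  /**
   * Checks table integrity. Goes over all regions, collects the pieces for each
   * table, and checks whether there are missing, repeated or overlapping ones.
   */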
2359 SortedMap<TableName, TableInfo> checkIntegrity() throws IOException {
    tablesInfo = new TreeMap<TableName, TableInfo>();
2361 LOG.debug("There are " + regionInfoMap.size() + " region info entries");
2362 for (HbckInfo hbi : regionInfoMap.values()) {
2363
2364 if (hbi.metaEntry == null) {
2365
2366 Path p = hbi.getHdfsRegionDir();
2367 if (p == null) {
2368 errors.report("No regioninfo in Meta or HDFS. " + hbi);
2369 }
2370
2371
2372 continue;
2373 }
2374 if (hbi.metaEntry.regionServer == null) {
2375 errors.detail("Skipping region because no region server: " + hbi);
2376 continue;
2377 }
2378 if (hbi.metaEntry.isOffline()) {
2379 errors.detail("Skipping region because it is offline: " + hbi);
2380 continue;
2381 }
2382 if (hbi.containsOnlyHdfsEdits()) {
2383 errors.detail("Skipping region because it only contains edits" + hbi);
2384 continue;
2385 }
2386
2387
2388
2389
2390
2391
2392 if (hbi.deployedOn.size() == 0) continue;
2393
2394
2395 TableName tableName = hbi.metaEntry.getTable();
2396 TableInfo modTInfo = tablesInfo.get(tableName);
2397 if (modTInfo == null) {
2398 modTInfo = new TableInfo(tableName);
2399 }
2400 for (ServerName server : hbi.deployedOn) {
2401 modTInfo.addServer(server);
2402 }
2403
2404 if (!hbi.isSkipChecks()) {
2405 modTInfo.addRegionInfo(hbi);
2406 }
2407
2408 tablesInfo.put(tableName, modTInfo);
2409 }
2410
2411 loadTableInfosForTablesWithNoRegion();
2412
2413 logParallelMerge();
2414 for (TableInfo tInfo : tablesInfo.values()) {
2415 TableIntegrityErrorHandler handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
2416 if (!tInfo.checkRegionChain(handler)) {
2417 errors.report("Found inconsistency in table " + tInfo.getName());
2418 }
2419 }
2420 return tablesInfo;
2421 }
2422
2423
2424
2425
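  /**
   * Loads table infos for tables that have a descriptor in HDFS but no regions
   * reported, so that they are still covered by the integrity checks.
   */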
2426 private void loadTableInfosForTablesWithNoRegion() throws IOException {
2427 Map<String, HTableDescriptor> allTables = new FSTableDescriptors(getConf()).getAll();
2428 for (HTableDescriptor htd : allTables.values()) {
2429 if (checkMetaOnly && !htd.isMetaTable()) {
2430 continue;
2431 }
2432
2433 TableName tableName = htd.getTableName();
2434 if (isTableIncluded(tableName) && !tablesInfo.containsKey(tableName)) {
2435 TableInfo tableInfo = new TableInfo(tableName);
2436 tableInfo.htds.add(htd);
2437 tablesInfo.put(htd.getTableName(), tableInfo);
2438 }
2439 }
2440 }
2441
2442
2443
2444
2445
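  /**
   * Merge hdfs data by moving the contents of the contained region dir into the
   * target region dir, then sidelining what remains of the contained region.
   * @return number of files moved
   */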
2446 public int mergeRegionDirs(Path targetRegionDir, HbckInfo contained) throws IOException {
2447 int fileMoves = 0;
2448 String thread = Thread.currentThread().getName();
2449 LOG.debug("[" + thread + "] Contained region dir after close and pause");
2450 debugLsr(contained.getHdfsRegionDir());
2451
2452
2453 FileSystem fs = targetRegionDir.getFileSystem(getConf());
2454 FileStatus[] dirs = null;
2455 try {
2456 dirs = fs.listStatus(contained.getHdfsRegionDir());
2457 } catch (FileNotFoundException fnfe) {
2458
2459
2460 if (!fs.exists(contained.getHdfsRegionDir())) {
2461 LOG.warn("[" + thread + "] HDFS region dir " + contained.getHdfsRegionDir()
2462 + " is missing. Assuming already sidelined or moved.");
2463 } else {
2464 sidelineRegionDir(fs, contained);
2465 }
2466 return fileMoves;
2467 }
2468
2469 if (dirs == null) {
2470 if (!fs.exists(contained.getHdfsRegionDir())) {
2471 LOG.warn("[" + thread + "] HDFS region dir " + contained.getHdfsRegionDir()
2472 + " already sidelined.");
2473 } else {
2474 sidelineRegionDir(fs, contained);
2475 }
2476 return fileMoves;
2477 }
2478
2479 for (FileStatus cf : dirs) {
2480 Path src = cf.getPath();
2481 Path dst = new Path(targetRegionDir, src.getName());
2482
2483 if (src.getName().equals(HRegionFileSystem.REGION_INFO_FILE)) {
2484
2485 continue;
2486 }
2487
2488 if (src.getName().equals(HConstants.HREGION_OLDLOGDIR_NAME)) {
2489
2490 continue;
2491 }
2492
2493 LOG.info("[" + thread + "] Moving files from " + src + " into containing region " + dst);
2494
2495
2496
2497
2498 for (FileStatus hfile : fs.listStatus(src)) {
2499 boolean success = fs.rename(hfile.getPath(), dst);
2500 if (success) {
2501 fileMoves++;
2502 }
2503 }
2504 LOG.debug("[" + thread + "] Sideline directory contents:");
2505 debugLsr(targetRegionDir);
2506 }
2507
2508
2509 sidelineRegionDir(fs, contained);
2510 LOG.info("[" + thread + "] Sidelined region dir "+ contained.getHdfsRegionDir() + " into " +
2511 getSidelineDir());
2512 debugLsr(contained.getHdfsRegionDir());
2513
2514 return fileMoves;
2515 }
2516
2517
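  /** Callable that hands one overlap group to the configured table integrity handler. */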
2518 static class WorkItemOverlapMerge implements Callable<Void> {
2519 private TableIntegrityErrorHandler handler;
2520 Collection<HbckInfo> overlapgroup;
2521
2522 WorkItemOverlapMerge(Collection<HbckInfo> overlapgroup, TableIntegrityErrorHandler handler) {
2523 this.handler = handler;
2524 this.overlapgroup = overlapgroup;
2525 }
2526
2527 @Override
2528 public Void call() throws Exception {
2529 handler.handleOverlapGroup(overlapgroup);
2530 return null;
2531 }
2532 };
2533
2534
2535
2536
2537
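  /**
   * Maintain information about a particular table.
   */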
2538 public class TableInfo {
2539 TableName tableName;
    TreeSet<ServerName> deployedOn;
2541
2542
2543 final List<HbckInfo> backwards = new ArrayList<HbckInfo>();
2544
2545
2546 final Map<Path, HbckInfo> sidelinedRegions = new HashMap<Path, HbckInfo>();
2547
2548
2549 final RegionSplitCalculator<HbckInfo> sc = new RegionSplitCalculator<HbckInfo>(cmp);
2550
2551
2552 final Set<HTableDescriptor> htds = new HashSet<HTableDescriptor>();
2553
2554
2555 final Multimap<byte[], HbckInfo> overlapGroups =
2556 TreeMultimap.create(RegionSplitCalculator.BYTES_COMPARATOR, cmp);
2557
2558
2559 private ImmutableList<HRegionInfo> regionsFromMeta = null;
2560
2561 TableInfo(TableName name) {
2562 this.tableName = name;
      deployedOn = new TreeSet<ServerName>();
2564 }
2565
2566
2567
2568
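    /**
     * @return the descriptor common to all regions, or null when there are none
     *         or several distinct descriptors
     */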
2569 private HTableDescriptor getHTD() {
2570 if (htds.size() == 1) {
        return htds.iterator().next();
2572 } else {
2573 LOG.error("None/Multiple table descriptors found for table '"
2574 + tableName + "' regions: " + htds);
2575 }
2576 return null;
2577 }
2578
2579 public void addRegionInfo(HbckInfo hir) {
2580 if (Bytes.equals(hir.getEndKey(), HConstants.EMPTY_END_ROW)) {
2581
2582
2583 if (hir.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) sc.add(hir);
2584 return;
2585 }
2586
2587
2588 if (Bytes.compareTo(hir.getStartKey(), hir.getEndKey()) > 0) {
2589 errors.reportError(
2590 ERROR_CODE.REGION_CYCLE,
2591 String.format("The endkey for this region comes before the "
2592 + "startkey, startkey=%s, endkey=%s",
2593 Bytes.toStringBinary(hir.getStartKey()),
2594 Bytes.toStringBinary(hir.getEndKey())), this, hir);
2595 backwards.add(hir);
2596 return;
2597 }
2598
2599
2600
2601 if (hir.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) sc.add(hir);
2602 }
2603
2604 public void addServer(ServerName server) {
2605 this.deployedOn.add(server);
2606 }
2607
2608 public TableName getName() {
2609 return tableName;
2610 }
2611
2612 public int getNumRegions() {
2613 return sc.getStarts().size() + backwards.size();
2614 }
2615
2616 public synchronized ImmutableList<HRegionInfo> getRegionsFromMeta() {
2617
2618 if (regionsFromMeta == null) {
2619 List<HRegionInfo> regions = new ArrayList<HRegionInfo>();
2620 for (HbckInfo h : HBaseFsck.this.regionInfoMap.values()) {
2621 if (tableName.equals(h.getTableName())) {
2622 if (h.metaEntry != null) {
2623 regions.add((HRegionInfo) h.metaEntry);
2624 }
2625 }
2626 }
2627 regionsFromMeta = Ordering.natural().immutableSortedCopy(regions);
2628 }
2629
2630 return regionsFromMeta;
2631 }
2632
2633
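    /** Integrity error handler that only reports problems, without repairing them. */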
2634 private class IntegrityFixSuggester extends TableIntegrityErrorHandlerImpl {
2635 ErrorReporter errors;
2636
2637 IntegrityFixSuggester(TableInfo ti, ErrorReporter errors) {
2638 this.errors = errors;
2639 setTableInfo(ti);
2640 }
2641
2642 @Override
2643 public void handleRegionStartKeyNotEmpty(HbckInfo hi) throws IOException{
2644 errors.reportError(ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY,
2645 "First region should start with an empty key. You need to "
2646 + " create a new region and regioninfo in HDFS to plug the hole.",
2647 getTableInfo(), hi);
2648 }
2649
2650 @Override
2651 public void handleRegionEndKeyNotEmpty(byte[] curEndKey) throws IOException {
2652 errors.reportError(ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY,
2653 "Last region should end with an empty key. You need to "
2654 + "create a new region and regioninfo in HDFS to plug the hole.", getTableInfo());
2655 }
2656
2657 @Override
2658 public void handleDegenerateRegion(HbckInfo hi) throws IOException{
2659 errors.reportError(ERROR_CODE.DEGENERATE_REGION,
2660 "Region has the same start and end key.", getTableInfo(), hi);
2661 }
2662
2663 @Override
2664 public void handleDuplicateStartKeys(HbckInfo r1, HbckInfo r2) throws IOException{
2665 byte[] key = r1.getStartKey();
2666
2667 errors.reportError(ERROR_CODE.DUPE_STARTKEYS,
2668 "Multiple regions have the same startkey: "
2669 + Bytes.toStringBinary(key), getTableInfo(), r1);
2670 errors.reportError(ERROR_CODE.DUPE_STARTKEYS,
2671 "Multiple regions have the same startkey: "
2672 + Bytes.toStringBinary(key), getTableInfo(), r2);
2673 }
2674
2675 @Override
2676 public void handleOverlapInRegionChain(HbckInfo hi1, HbckInfo hi2) throws IOException{
2677 errors.reportError(ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
2678 "There is an overlap in the region chain.",
2679 getTableInfo(), hi1, hi2);
2680 }
2681
2682 @Override
2683 public void handleHoleInRegionChain(byte[] holeStart, byte[] holeStop) throws IOException{
2684 errors.reportError(
2685 ERROR_CODE.HOLE_IN_REGION_CHAIN,
2686 "There is a hole in the region chain between "
2687 + Bytes.toStringBinary(holeStart) + " and "
2688 + Bytes.toStringBinary(holeStop)
2689 + ". You need to create a new .regioninfo and region "
2690 + "dir in hdfs to plug the hole.");
2691 }
2692 };
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
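    /**
     * This handler fixes integrity errors from hdfs information. There are
     * basically three classes of integrity problems: holes, overlaps and invalid
     * regions. Holes are plugged directly here; overlap groups are merged or
     * sidelined by the methods below.
     */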
2706 private class HDFSIntegrityFixer extends IntegrityFixSuggester {
2707 Configuration conf;
2708
2709 boolean fixOverlaps = true;
2710
2711 HDFSIntegrityFixer(TableInfo ti, ErrorReporter errors, Configuration conf,
2712 boolean fixHoles, boolean fixOverlaps) {
2713 super(ti, errors);
2714 this.conf = conf;
2715 this.fixOverlaps = fixOverlaps;
2716
2717 }
2718
2719
2720
2721
2722
2723
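      /**
       * Special-case hole: the first region of the table does not start with the
       * empty key. Plug the hole by creating a new empty-start region in HDFS.
       */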
2724 @Override
2725 public void handleRegionStartKeyNotEmpty(HbckInfo next) throws IOException {
2726 errors.reportError(ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY,
2727 "First region should start with an empty key. Creating a new " +
2728 "region and regioninfo in HDFS to plug the hole.",
2729 getTableInfo(), next);
2730 HTableDescriptor htd = getTableInfo().getHTD();
2731
2732 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(),
2733 HConstants.EMPTY_START_ROW, next.getStartKey());
2734
2735
2736 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2737 LOG.info("Table region start key was not empty. Created new empty region: "
2738 + newRegion + " " +region);
2739 fixes++;
2740 }
2741
2742 @Override
2743 public void handleRegionEndKeyNotEmpty(byte[] curEndKey) throws IOException {
2744 errors.reportError(ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY,
2745 "Last region should end with an empty key. Creating a new "
2746 + "region and regioninfo in HDFS to plug the hole.", getTableInfo());
2747 HTableDescriptor htd = getTableInfo().getHTD();
2748
2749 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(), curEndKey,
2750 HConstants.EMPTY_START_ROW);
2751
2752 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2753 LOG.info("Table region end key was not empty. Created new empty region: " + newRegion
2754 + " " + region);
2755 fixes++;
2756 }
2757
2758
2759
2760
2761
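      /**
       * There is a hole in the hdfs regions that violates the table integrity
       * rules. Create a new empty region that patches the hole.
       */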
2762 @Override
      public void handleHoleInRegionChain(byte[] holeStartKey, byte[] holeStopKey)
          throws IOException {
2764 errors.reportError(
2765 ERROR_CODE.HOLE_IN_REGION_CHAIN,
2766 "There is a hole in the region chain between "
2767 + Bytes.toStringBinary(holeStartKey) + " and "
2768 + Bytes.toStringBinary(holeStopKey)
2769 + ". Creating a new regioninfo and region "
2770 + "dir in hdfs to plug the hole.");
2771 HTableDescriptor htd = getTableInfo().getHTD();
2772 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(), holeStartKey, holeStopKey);
2773 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2774 LOG.info("Plugged hole by creating new empty region: "+ newRegion + " " +region);
2775 fixes++;
2776 }
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
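      /**
       * Takes a group of overlapping regions and merges them into a single
       * containing region. Groups larger than maxMerge are not merged; they are
       * optionally sidelined instead.
       */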
2789 @Override
2790 public void handleOverlapGroup(Collection<HbckInfo> overlap)
2791 throws IOException {
2792 Preconditions.checkNotNull(overlap);
        Preconditions.checkArgument(overlap.size() > 0);
2794
2795 if (!this.fixOverlaps) {
2796 LOG.warn("Not attempting to repair overlaps.");
2797 return;
2798 }
2799
2800 if (overlap.size() > maxMerge) {
2801 LOG.warn("Overlap group has " + overlap.size() + " overlapping " +
2802 "regions which is greater than " + maxMerge + ", the max number of regions to merge");
2803 if (sidelineBigOverlaps) {
2804
2805 sidelineBigOverlaps(overlap);
2806 }
2807 return;
2808 }
2809
2810 mergeOverlaps(overlap);
2811 }
2812
2813 void mergeOverlaps(Collection<HbckInfo> overlap)
2814 throws IOException {
2815 String thread = Thread.currentThread().getName();
2816 LOG.info("== [" + thread + "] Merging regions into one region: "
2817 + Joiner.on(",").join(overlap));
2818
2819 Pair<byte[], byte[]> range = null;
2820 for (HbckInfo hi : overlap) {
2821 if (range == null) {
2822 range = new Pair<byte[], byte[]>(hi.getStartKey(), hi.getEndKey());
2823 } else {
2824 if (RegionSplitCalculator.BYTES_COMPARATOR
2825 .compare(hi.getStartKey(), range.getFirst()) < 0) {
2826 range.setFirst(hi.getStartKey());
2827 }
2828 if (RegionSplitCalculator.BYTES_COMPARATOR
2829 .compare(hi.getEndKey(), range.getSecond()) > 0) {
2830 range.setSecond(hi.getEndKey());
2831 }
2832 }
2833
2834 LOG.debug("[" + thread + "] Closing region before moving data around: " + hi);
2835 LOG.debug("[" + thread + "] Contained region dir before close");
2836 debugLsr(hi.getHdfsRegionDir());
2837 try {
2838 LOG.info("[" + thread + "] Closing region: " + hi);
2839 closeRegion(hi);
2840 } catch (IOException ioe) {
2841 LOG.warn("[" + thread + "] Was unable to close region " + hi
2842 + ". Just continuing... ", ioe);
2843 } catch (InterruptedException e) {
2844 LOG.warn("[" + thread + "] Was unable to close region " + hi
2845 + ". Just continuing... ", e);
2846 }
2847
2848 try {
2849 LOG.info("[" + thread + "] Offlining region: " + hi);
2850 offline(hi.getRegionName());
2851 } catch (IOException ioe) {
2852 LOG.warn("[" + thread + "] Unable to offline region from master: " + hi
2853 + ". Just continuing... ", ioe);
2854 }
2855 }
2856
2857
2858 HTableDescriptor htd = getTableInfo().getHTD();
2859
2860 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(), range.getFirst(),
2861 range.getSecond());
2862 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2863 LOG.info("[" + thread + "] Created new empty container region: " +
2864 newRegion + " to contain regions: " + Joiner.on(",").join(overlap));
2865 debugLsr(region.getRegionFileSystem().getRegionDir());
2866
2867
        boolean didFix = false;
2869 Path target = region.getRegionFileSystem().getRegionDir();
2870 for (HbckInfo contained : overlap) {
2871 LOG.info("[" + thread + "] Merging " + contained + " into " + target );
2872 int merges = mergeRegionDirs(target, contained);
2873 if (merges > 0) {
2874 didFix = true;
2875 }
2876 }
2877 if (didFix) {
2878 fixes++;
2879 }
2880 }
2881
2882
2883
2884
2885
2886
2887
2888
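      /**
       * Sideline some regions in a big overlap group so that the remainder becomes
       * small enough to merge, bounded by maxOverlapsToSideline.
       */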
2889 void sidelineBigOverlaps(
2890 Collection<HbckInfo> bigOverlap) throws IOException {
2891 int overlapsToSideline = bigOverlap.size() - maxMerge;
2892 if (overlapsToSideline > maxOverlapsToSideline) {
2893 overlapsToSideline = maxOverlapsToSideline;
2894 }
2895 List<HbckInfo> regionsToSideline =
2896 RegionSplitCalculator.findBigRanges(bigOverlap, overlapsToSideline);
2897 FileSystem fs = FileSystem.get(conf);
        for (HbckInfo regionToSideline : regionsToSideline) {
2899 try {
2900 LOG.info("Closing region: " + regionToSideline);
2901 closeRegion(regionToSideline);
2902 } catch (IOException ioe) {
2903 LOG.warn("Was unable to close region " + regionToSideline
2904 + ". Just continuing... ", ioe);
2905 } catch (InterruptedException e) {
2906 LOG.warn("Was unable to close region " + regionToSideline
2907 + ". Just continuing... ", e);
2908 }
2909
2910 try {
2911 LOG.info("Offlining region: " + regionToSideline);
2912 offline(regionToSideline.getRegionName());
2913 } catch (IOException ioe) {
2914 LOG.warn("Unable to offline region from master: " + regionToSideline
2915 + ". Just continuing... ", ioe);
2916 }
2917
2918 LOG.info("Before sideline big overlapped region: " + regionToSideline.toString());
2919 Path sidelineRegionDir = sidelineRegionDir(fs, TO_BE_LOADED, regionToSideline);
2920 if (sidelineRegionDir != null) {
2921 sidelinedRegions.put(sidelineRegionDir, regionToSideline);
2922 LOG.info("After sidelined big overlapped region: "
2923 + regionToSideline.getRegionNameAsString()
2924 + " to " + sidelineRegionDir.toString());
2925 fixes++;
2926 }
2927 }
2928 }
2929 }
2930
2931
2932
2933
2934
2935
2936
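    /**
     * Check the region chain (from META) of this table. We are looking for holes,
     * overlaps and cycles.
     * @return false if there are errors
     */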
2937 public boolean checkRegionChain(TableIntegrityErrorHandler handler) throws IOException {
2938
2939
2940
2941 if (disabledTables.contains(this.tableName)) {
2942 return true;
2943 }
2944 int originalErrorsCount = errors.getErrorList().size();
2945 Multimap<byte[], HbckInfo> regions = sc.calcCoverage();
2946 SortedSet<byte[]> splits = sc.getSplits();
2947
2948 byte[] prevKey = null;
2949 byte[] problemKey = null;
2950
2951 if (splits.size() == 0) {
2952
2953 handler.handleHoleInRegionChain(HConstants.EMPTY_START_ROW, HConstants.EMPTY_END_ROW);
2954 }
2955
2956 for (byte[] key : splits) {
2957 Collection<HbckInfo> ranges = regions.get(key);
2958 if (prevKey == null && !Bytes.equals(key, HConstants.EMPTY_BYTE_ARRAY)) {
2959 for (HbckInfo rng : ranges) {
2960 handler.handleRegionStartKeyNotEmpty(rng);
2961 }
2962 }
2963
2964
2965 for (HbckInfo rng : ranges) {
2966
2967 byte[] endKey = rng.getEndKey();
2968 endKey = (endKey.length == 0) ? null : endKey;
          if (Bytes.equals(rng.getStartKey(), endKey)) {
2970 handler.handleDegenerateRegion(rng);
2971 }
2972 }
2973
2974 if (ranges.size() == 1) {
2975
2976 if (problemKey != null) {
2977 LOG.warn("reached end of problem group: " + Bytes.toStringBinary(key));
2978 }
2979 problemKey = null;
2980 } else if (ranges.size() > 1) {
2981
2982
2983 if (problemKey == null) {
2984
2985 LOG.warn("Naming new problem group: " + Bytes.toStringBinary(key));
2986 problemKey = key;
2987 }
2988 overlapGroups.putAll(problemKey, ranges);
2989
2990
2991 ArrayList<HbckInfo> subRange = new ArrayList<HbckInfo>(ranges);
2992
2993 for (HbckInfo r1 : ranges) {
2994 if (r1.getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) continue;
2995 subRange.remove(r1);
2996 for (HbckInfo r2 : subRange) {
2997 if (r2.getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) continue;
              if (Bytes.compareTo(r1.getStartKey(), r2.getStartKey()) == 0) {
                handler.handleDuplicateStartKeys(r1, r2);
3000 } else {
3001
3002 handler.handleOverlapInRegionChain(r1, r2);
3003 }
3004 }
3005 }
3006
3007 } else if (ranges.size() == 0) {
3008 if (problemKey != null) {
3009 LOG.warn("reached end of problem group: " + Bytes.toStringBinary(key));
3010 }
3011 problemKey = null;
3012
3013 byte[] holeStopKey = sc.getSplits().higher(key);
3014
3015 if (holeStopKey != null) {
3016
3017 handler.handleHoleInRegionChain(key, holeStopKey);
3018 }
3019 }
3020 prevKey = key;
3021 }
3022
3023
3024
3025 if (prevKey != null) {
3026 handler.handleRegionEndKeyNotEmpty(prevKey);
3027 }
3028
3029
3030 if (getConf().getBoolean("hbasefsck.overlap.merge.parallel", true)) {
3031 boolean ok = handleOverlapsParallel(handler, prevKey);
3032 if (!ok) {
3033 return false;
3034 }
3035 } else {
3036 for (Collection<HbckInfo> overlap : overlapGroups.asMap().values()) {
3037 handler.handleOverlapGroup(overlap);
3038 }
3039 }
3040
3041 if (details) {
3042
3043 errors.print("---- Table '" + this.tableName
3044 + "': region split map");
3045 dump(splits, regions);
3046 errors.print("---- Table '" + this.tableName
3047 + "': overlap groups");
3048 dumpOverlapProblems(overlapGroups);
3049 errors.print("There are " + overlapGroups.keySet().size()
3050 + " overlap groups with " + overlapGroups.size()
3051 + " overlapping regions");
3052 }
3053 if (!sidelinedRegions.isEmpty()) {
3054 LOG.warn("Sidelined big overlapped regions, please bulk load them!");
3055 errors.print("---- Table '" + this.tableName
3056 + "': sidelined big overlapped regions");
3057 dumpSidelinedRegions(sidelinedRegions);
3058 }
3059 return errors.getErrorList().size() == originalErrorsCount;
3060 }
3061
3062 private boolean handleOverlapsParallel(TableIntegrityErrorHandler handler, byte[] prevKey)
3063 throws IOException {
3064
3065
3066 List<WorkItemOverlapMerge> merges = new ArrayList<WorkItemOverlapMerge>(overlapGroups.size());
3067 List<Future<Void>> rets;
3068 for (Collection<HbckInfo> overlap : overlapGroups.asMap().values()) {
3069
3070 merges.add(new WorkItemOverlapMerge(overlap, handler));
3071 }
3072 try {
3073 rets = executor.invokeAll(merges);
3074 } catch (InterruptedException e) {
3075 LOG.error("Overlap merges were interrupted", e);
3076 return false;
3077 }
      for (int i = 0; i < merges.size(); i++) {
3079 WorkItemOverlapMerge work = merges.get(i);
3080 Future<Void> f = rets.get(i);
3081 try {
3082 f.get();
        } catch (ExecutionException e) {
          LOG.warn("Failed to merge overlap group " + work, e.getCause());
3085 } catch (InterruptedException e) {
3086 LOG.error("Waiting for overlap merges was interrupted", e);
3087 return false;
3088 }
3089 }
3090 return true;
3091 }
3092
3093
3094
3095
3096
3097
3098
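    /**
     * Dumps the split points and their regions in a visually reasonable way for
     * debugging.
     */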
3099 void dump(SortedSet<byte[]> splits, Multimap<byte[], HbckInfo> regions) {
3100
3101 StringBuilder sb = new StringBuilder();
3102 for (byte[] k : splits) {
3103 sb.setLength(0);
3104 sb.append(Bytes.toStringBinary(k) + ":\t");
3105 for (HbckInfo r : regions.get(k)) {
3106 sb.append("[ "+ r.toString() + ", "
3107 + Bytes.toStringBinary(r.getEndKey())+ "]\t");
3108 }
3109 errors.print(sb.toString());
3110 }
3111 }
3112 }
3113
3114 public void dumpOverlapProblems(Multimap<byte[], HbckInfo> regions) {
3115
3116
3117 for (byte[] k : regions.keySet()) {
3118 errors.print(Bytes.toStringBinary(k) + ":");
3119 for (HbckInfo r : regions.get(k)) {
3120 errors.print("[ " + r.toString() + ", "
3121 + Bytes.toStringBinary(r.getEndKey()) + "]");
3122 }
3123 errors.print("----");
3124 }
3125 }
3126
3127 public void dumpSidelinedRegions(Map<Path, HbckInfo> regions) {
3128 for (Map.Entry<Path, HbckInfo> entry: regions.entrySet()) {
3129 TableName tableName = entry.getValue().getTableName();
3130 Path path = entry.getKey();
3131 errors.print("This sidelined region dir should be bulk loaded: "
3132 + path.toString());
3133 errors.print("Bulk load command looks like: "
3134 + "hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles "
          + path.toUri().getPath() + " " + tableName);
3136 }
3137 }
3138
3139 public Multimap<byte[], HbckInfo> getOverlapGroups(
3140 TableName table) {
3141 TableInfo ti = tablesInfo.get(table);
3142 return ti.overlapGroups;
3143 }
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
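  /**
   * Return a list of user-space table names whose metadata have not been modified
   * in the last few milliseconds specified by timelag.
   * @param numSkipped incremented for each table skipped because it was modified recently
   */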
3154 HTableDescriptor[] getTables(AtomicInteger numSkipped) {
3155 List<TableName> tableNames = new ArrayList<TableName>();
3156 long now = EnvironmentEdgeManager.currentTime();
3157
3158 for (HbckInfo hbi : regionInfoMap.values()) {
3159 MetaEntry info = hbi.metaEntry;
3160
3161
3162
3163 if (info != null && info.getStartKey().length == 0 && !info.isMetaRegion()) {
3164 if (info.modTime + timelag < now) {
3165 tableNames.add(info.getTable());
3166 } else {
3167 numSkipped.incrementAndGet();
3168 }
3169 }
3170 }
3171 return getHTableDescriptors(tableNames);
3172 }
3173
3174 HTableDescriptor[] getHTableDescriptors(List<TableName> tableNames) {
3175 HTableDescriptor[] htd = new HTableDescriptor[0];
3176 Admin admin = null;
3177 try {
3178 LOG.info("getHTableDescriptors == tableNames => " + tableNames);
3179 admin = new HBaseAdmin(getConf());
3180 htd = admin.getTableDescriptorsByTableName(tableNames);
3181 } catch (IOException e) {
3182 LOG.debug("Exception getting table descriptors", e);
3183 } finally {
3184 if (admin != null) {
3185 try {
3186 admin.close();
3187 } catch (IOException e) {
3188 LOG.debug("Exception closing HBaseAdmin", e);
3189 }
3190 }
3191 }
3192 return htd;
3193 }
3194
3195
3196
3197
3198
3199
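  /**
   * Gets the entry in regionInfoMap corresponding to the given encoded region
   * name. If the region has not been seen yet, a new entry is added and returned.
   */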
3200 private synchronized HbckInfo getOrCreateInfo(String name) {
3201 HbckInfo hbi = regionInfoMap.get(name);
3202 if (hbi == null) {
3203 hbi = new HbckInfo(null);
3204 regionInfoMap.put(name, hbi);
3205 }
3206 return hbi;
3207 }
3208
3209 private void checkAndFixTableLocks() throws IOException {
3210 ZooKeeperWatcher zkw = createZooKeeperWatcher();
3211
3212 try {
3213 TableLockChecker checker = new TableLockChecker(zkw, errors);
3214 checker.checkTableLocks();
3215
3216 if (this.fixTableLocks) {
3217 checker.fixExpiredTableLocks();
3218 }
3219 } finally {
3220 zkw.close();
3221 }
3222 }
3223
3224
3225
3226
3227
3228
3229
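  /**
   * Check whether orphaned table ZNodes exist (ENABLING tables with no entry in
   * hbase:meta) and, if requested, mark them DISABLED so they can be dropped.
   */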
3230 private void checkAndFixOrphanedTableZNodes()
3231 throws IOException, KeeperException, InterruptedException {
3232 ZooKeeperWatcher zkw = createZooKeeperWatcher();
3233
3234 try {
3235 Set<TableName> enablingTables = ZKTableStateClientSideReader.getEnablingTables(zkw);
3236 String msg;
3237 TableInfo tableInfo;
3238
3239 for (TableName tableName : enablingTables) {
3240
3241 tableInfo = tablesInfo.get(tableName);
3242 if (tableInfo != null) {
3243
3244 continue;
3245 }
3246
3247 msg = "Table " + tableName + " not found in hbase:meta. Orphaned table ZNode found.";
3248 LOG.warn(msg);
3249 orphanedTableZNodes.add(tableName);
3250 errors.reportError(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY, msg);
3251 }
3252
3253 if (orphanedTableZNodes.size() > 0 && this.fixTableZNodes) {
3254 ZKTableStateManager zkTableStateMgr = new ZKTableStateManager(zkw);
3255
3256 for (TableName tableName : orphanedTableZNodes) {
3257 try {
3258
3259
3260
3261
3262 zkTableStateMgr.setTableState(tableName, ZooKeeperProtos.Table.State.DISABLED);
3263 } catch (CoordinatedStateException e) {
3264
3265 LOG.error(
3266 "Got a CoordinatedStateException while fixing the ENABLING table znode " + tableName,
3267 e);
3268 }
3269 }
3270 }
3271 } finally {
3272 zkw.close();
3273 }
3274 }
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
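  /**
   * Check values in regionInfo for hbase:meta: verify that exactly one region
   * server holds each hbase:meta replica, and try to fix any inconsistency found.
   * @return true if no problem was found
   */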
3285 boolean checkMetaRegion() throws IOException, KeeperException, InterruptedException {
3286 Map<Integer, HbckInfo> metaRegions = new HashMap<Integer, HbckInfo>();
3287 for (HbckInfo value : regionInfoMap.values()) {
3288 if (value.metaEntry != null && value.metaEntry.isMetaRegion()) {
3289 metaRegions.put(value.getReplicaId(), value);
3290 }
3291 }
3292 int metaReplication = admin.getTableDescriptor(TableName.META_TABLE_NAME)
3293 .getRegionReplication();
3294 boolean noProblem = true;
3295
3296
3297 for (int i = 0; i < metaReplication; i++) {
3298 HbckInfo metaHbckInfo = metaRegions.remove(i);
3299 List<ServerName> servers = new ArrayList<ServerName>();
3300 if (metaHbckInfo != null) {
3301 servers = metaHbckInfo.deployedOn;
3302 }
3303 if (servers.size() != 1) {
3304 noProblem = false;
3305 if (servers.size() == 0) {
3306 assignMetaReplica(i);
3307 } else if (servers.size() > 1) {
          errors.reportError(ERROR_CODE.MULTI_META_REGION, "hbase:meta, replicaId " +
              metaHbckInfo.getReplicaId() + " is found on more than one region server.");
3311 if (shouldFixAssignments()) {
3312 errors.print("Trying to fix a problem with hbase:meta, replicaId " +
3313 metaHbckInfo.getReplicaId() +"..");
3314 setShouldRerun();
3315
3316 HBaseFsckRepair.fixMultiAssignment(connection, metaHbckInfo.metaEntry, servers);
3317 }
3318 }
3319 }
3320 }
3321
3322 for (Map.Entry<Integer, HbckInfo> entry : metaRegions.entrySet()) {
3323 noProblem = false;
3324 errors.reportError(ERROR_CODE.SHOULD_NOT_BE_DEPLOYED,
3325 "hbase:meta replicas are deployed in excess. Configured " + metaReplication +
3326 ", deployed " + metaRegions.size());
3327 if (shouldFixAssignments()) {
3328 errors.print("Trying to undeploy excess replica, replicaId: " + entry.getKey() +
3329 " of hbase:meta..");
3330 setShouldRerun();
3331 unassignMetaReplica(entry.getValue());
3332 }
3333 }
3334
3335
3336 return noProblem;
3337 }
3338
3339 private void unassignMetaReplica(HbckInfo hi) throws IOException, InterruptedException,
3340 KeeperException {
3341 undeployRegions(hi);
3342 ZooKeeperWatcher zkw = createZooKeeperWatcher();
3343 try {
3344 ZKUtil.deleteNode(zkw, zkw.getZNodeForReplica(hi.metaEntry.getReplicaId()));
3345 } finally {
3346 zkw.close();
3347 }
3348 }
3349
3350 private void assignMetaReplica(int replicaId)
3351 throws IOException, KeeperException, InterruptedException {
    errors.reportError(ERROR_CODE.NO_META_REGION, "hbase:meta, replicaId " +
        replicaId + " is not found on any region server.");
3354 if (shouldFixAssignments()) {
3355 errors.print("Trying to fix a problem with hbase:meta..");
3356 setShouldRerun();
3357
3358 HRegionInfo h = RegionReplicaUtil.getRegionInfoForReplica(
3359 HRegionInfo.FIRST_META_REGIONINFO, replicaId);
3360 HBaseFsckRepair.fixUnassigned(admin, h);
3361 HBaseFsckRepair.waitUntilAssigned(admin, h);
3362 }
3363 }
3364
3365
3366
3367
3368
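  /**
   * Scan hbase:meta, adding all regions found to the regionInfo map.
   * @throws IOException if an error is encountered
   */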
3369 boolean loadMetaEntries() throws IOException {
3370 MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
3371 int countRecord = 1;
3372
3373
3374 final Comparator<Cell> comp = new Comparator<Cell>() {
3375 @Override
3376 public int compare(Cell k1, Cell k2) {
3377 return Long.compare(k1.getTimestamp(), k2.getTimestamp());
3378 }
3379 };
3380
3381 @Override
3382 public boolean processRow(Result result) throws IOException {
3383 try {
3384
3385
3386 long ts = Collections.max(result.listCells(), comp).getTimestamp();
3387 RegionLocations rl = MetaTableAccessor.getRegionLocations(result);
3388 if (rl == null) {
3389 emptyRegionInfoQualifiers.add(result);
3390 errors.reportError(ERROR_CODE.EMPTY_META_CELL,
3391 "Empty REGIONINFO_QUALIFIER found in hbase:meta");
3392 return true;
3393 }
3394 ServerName sn = null;
3395 if (rl.getRegionLocation(HRegionInfo.DEFAULT_REPLICA_ID) == null ||
3396 rl.getRegionLocation(HRegionInfo.DEFAULT_REPLICA_ID).getRegionInfo() == null) {
3397 emptyRegionInfoQualifiers.add(result);
3398 errors.reportError(ERROR_CODE.EMPTY_META_CELL,
3399 "Empty REGIONINFO_QUALIFIER found in hbase:meta");
3400 return true;
3401 }
3402 HRegionInfo hri = rl.getRegionLocation(HRegionInfo.DEFAULT_REPLICA_ID).getRegionInfo();
3403 if (!(isTableIncluded(hri.getTable())
3404 || hri.isMetaRegion())) {
3405 return true;
3406 }
3407 PairOfSameType<HRegionInfo> daughters = HRegionInfo.getDaughterRegions(result);
3408 for (HRegionLocation h : rl.getRegionLocations()) {
3409 if (h == null || h.getRegionInfo() == null) {
3410 continue;
3411 }
3412 sn = h.getServerName();
3413 hri = h.getRegionInfo();
3414
3415 MetaEntry m = null;
3416 if (hri.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
3417 m = new MetaEntry(hri, sn, ts, daughters.getFirst(), daughters.getSecond());
3418 } else {
3419 m = new MetaEntry(hri, sn, ts, null, null);
3420 }
3421 HbckInfo previous = regionInfoMap.get(hri.getEncodedName());
3422 if (previous == null) {
3423 regionInfoMap.put(hri.getEncodedName(), new HbckInfo(m));
3424 } else if (previous.metaEntry == null) {
3425 previous.metaEntry = m;
3426 } else {
3427 throw new IOException("Two entries in hbase:meta are same " + previous);
3428 }
3429 }
3430 PairOfSameType<HRegionInfo> mergeRegions = HRegionInfo.getMergeRegions(result);
3431 for (HRegionInfo mergeRegion : new HRegionInfo[] {
3432 mergeRegions.getFirst(), mergeRegions.getSecond() }) {
3433 if (mergeRegion != null) {
3434
3435 HbckInfo hbInfo = getOrCreateInfo(mergeRegion.getEncodedName());
3436 hbInfo.setMerged(true);
3437 }
3438 }
3439
3440
3441 if (countRecord % 100 == 0) {
3442 errors.progress();
3443 }
3444 countRecord++;
3445 return true;
3446 } catch (RuntimeException e) {
3447 LOG.error("Result=" + result);
3448 throw e;
3449 }
3450 }
3451 };
3452 if (!checkMetaOnly) {
3453
3454 MetaScanner.metaScan(connection, visitor);
3455 }
3456
3457 errors.print("");
3458 return true;
3459 }
3460
3461
3462
3463
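  /**
   * Stores the regioninfo entries scanned from hbase:meta.
   */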
3464 static class MetaEntry extends HRegionInfo {
3465 ServerName regionServer;
3466 long modTime;
3467 HRegionInfo splitA, splitB;
3468
3469 public MetaEntry(HRegionInfo rinfo, ServerName regionServer, long modTime) {
3470 this(rinfo, regionServer, modTime, null, null);
3471 }
3472
3473 public MetaEntry(HRegionInfo rinfo, ServerName regionServer, long modTime,
3474 HRegionInfo splitA, HRegionInfo splitB) {
3475 super(rinfo);
3476 this.regionServer = regionServer;
3477 this.modTime = modTime;
3478 this.splitA = splitA;
3479 this.splitB = splitB;
3480 }
3481
3482 @Override
3483 public boolean equals(Object o) {
3484 boolean superEq = super.equals(o);
3485 if (!superEq) {
3486 return superEq;
3487 }
3488
3489 MetaEntry me = (MetaEntry) o;
3490 if (!regionServer.equals(me.regionServer)) {
3491 return false;
3492 }
3493 return (modTime == me.modTime);
3494 }
3495
3496 @Override
3497 public int hashCode() {
3498 int hash = Arrays.hashCode(getRegionName());
3499 hash ^= getRegionId();
3500 hash ^= Arrays.hashCode(getStartKey());
3501 hash ^= Arrays.hashCode(getEndKey());
3502 hash ^= Boolean.valueOf(isOffline()).hashCode();
3503 hash ^= getTable().hashCode();
3504 if (regionServer != null) {
3505 hash ^= regionServer.hashCode();
3506 }
3507 hash ^= modTime;
3508 return hash;
3509 }
3510 }
3511
3512
3513
3514
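  /**
   * Stores the regioninfo entries read from HDFS.
   */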
3515 static class HdfsEntry {
3516 HRegionInfo hri;
3517 Path hdfsRegionDir = null;
3518 long hdfsRegionDirModTime = 0;
3519 boolean hdfsRegioninfoFilePresent = false;
3520 boolean hdfsOnlyEdits = false;
3521 }
3522
3523
3524
3525
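  /**
   * Stores the regioninfo retrieved from online region servers.
   */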
3526 static class OnlineEntry {
3527 HRegionInfo hri;
3528 ServerName hsa;
3529
3530 @Override
3531 public String toString() {
3532 return hsa.toString() + ";" + hri.getRegionNameAsString();
3533 }
3534 }
3535
3536
3537
3538
3539
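  /**
   * Maintain information about a particular region. It gathers information from
   * three places -- HDFS, META, and region servers.
   */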
3540 public static class HbckInfo implements KeyRange {
3541 private MetaEntry metaEntry = null;
3542 private HdfsEntry hdfsEntry = null;
3543 private List<OnlineEntry> deployedEntries = Lists.newArrayList();
3544 private List<ServerName> deployedOn = Lists.newArrayList();
3545 private boolean skipChecks = false;
3546 private boolean isMerged = false;
3547 private int deployedReplicaId = HRegionInfo.DEFAULT_REPLICA_ID;
3548 private HRegionInfo primaryHRIForDeployedReplica = null;
3549
3550 HbckInfo(MetaEntry metaEntry) {
3551 this.metaEntry = metaEntry;
3552 }
3553
3554 public int getReplicaId() {
3555 if (metaEntry != null) return metaEntry.getReplicaId();
3556 return deployedReplicaId;
3557 }
3558
3559 public synchronized void addServer(HRegionInfo hri, ServerName server) {
      OnlineEntry rse = new OnlineEntry();
3561 rse.hri = hri;
3562 rse.hsa = server;
3563 this.deployedEntries.add(rse);
3564 this.deployedOn.add(server);
3565
3566 this.deployedReplicaId = hri.getReplicaId();
3567 this.primaryHRIForDeployedReplica =
3568 RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
3569 }
3570
3571 @Override
3572 public synchronized String toString() {
3573 StringBuilder sb = new StringBuilder();
3574 sb.append("{ meta => ");
      sb.append((metaEntry != null) ? metaEntry.getRegionNameAsString() : "null");
      sb.append(", hdfs => " + getHdfsRegionDir());
      sb.append(", deployed => " + Joiner.on(", ").join(deployedEntries));
      sb.append(", replicaId => " + getReplicaId());
3579 sb.append(" }");
3580 return sb.toString();
3581 }
3582
3583 @Override
3584 public byte[] getStartKey() {
3585 if (this.metaEntry != null) {
3586 return this.metaEntry.getStartKey();
3587 } else if (this.hdfsEntry != null) {
3588 return this.hdfsEntry.hri.getStartKey();
3589 } else {
3590 LOG.error("Entry " + this + " has no meta or hdfs region start key.");
3591 return null;
3592 }
3593 }
3594
3595 @Override
3596 public byte[] getEndKey() {
3597 if (this.metaEntry != null) {
3598 return this.metaEntry.getEndKey();
3599 } else if (this.hdfsEntry != null) {
3600 return this.hdfsEntry.hri.getEndKey();
3601 } else {
3602 LOG.error("Entry " + this + " has no meta or hdfs region start key.");
3603 return null;
3604 }
3605 }
3606
3607 public TableName getTableName() {
3608 if (this.metaEntry != null) {
3609 return this.metaEntry.getTable();
3610 } else if (this.hdfsEntry != null) {
3611
3612
3613 Path tableDir = this.hdfsEntry.hdfsRegionDir.getParent();
3614 return FSUtils.getTableName(tableDir);
3615 } else {
3616
3617 for (OnlineEntry e : deployedEntries) {
3618 return e.hri.getTable();
3619 }
3620 return null;
3621 }
3622 }
3623
3624 public String getRegionNameAsString() {
3625 if (metaEntry != null) {
3626 return metaEntry.getRegionNameAsString();
3627 } else if (hdfsEntry != null) {
3628 if (hdfsEntry.hri != null) {
3629 return hdfsEntry.hri.getRegionNameAsString();
3630 }
3631 } else {
3632
3633 for (OnlineEntry e : deployedEntries) {
3634 return e.hri.getRegionNameAsString();
3635 }
3636 }
3637 return null;
3638 }
3639
3640 public byte[] getRegionName() {
3641 if (metaEntry != null) {
3642 return metaEntry.getRegionName();
3643 } else if (hdfsEntry != null) {
3644 return hdfsEntry.hri.getRegionName();
3645 } else {
3646
3647 for (OnlineEntry e : deployedEntries) {
3648 return e.hri.getRegionName();
3649 }
3650 return null;
3651 }
3652 }
3653
3654 public HRegionInfo getPrimaryHRIForDeployedReplica() {
3655 return primaryHRIForDeployedReplica;
3656 }
3657
3658 Path getHdfsRegionDir() {
3659 if (hdfsEntry == null) {
3660 return null;
3661 }
3662 return hdfsEntry.hdfsRegionDir;
3663 }
3664
3665 boolean containsOnlyHdfsEdits() {
3666 if (hdfsEntry == null) {
3667 return false;
3668 }
3669 return hdfsEntry.hdfsOnlyEdits;
3670 }
3671
3672 boolean isHdfsRegioninfoPresent() {
3673 if (hdfsEntry == null) {
3674 return false;
3675 }
3676 return hdfsEntry.hdfsRegioninfoFilePresent;
3677 }
3678
3679 long getModTime() {
3680 if (hdfsEntry == null) {
3681 return 0;
3682 }
3683 return hdfsEntry.hdfsRegionDirModTime;
3684 }
3685
3686 HRegionInfo getHdfsHRI() {
3687 if (hdfsEntry == null) {
3688 return null;
3689 }
3690 return hdfsEntry.hri;
3691 }
3692
3693 public void setSkipChecks(boolean skipChecks) {
3694 this.skipChecks = skipChecks;
3695 }
3696
3697 public boolean isSkipChecks() {
3698 return skipChecks;
3699 }
3700
3701 public void setMerged(boolean isMerged) {
3702 this.isMerged = isMerged;
3703 }
3704
3705 public boolean isMerged() {
3706 return this.isMerged;
3707 }
3708 }
3709
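  /**
   * Orders HbckInfos by table name, then start key, then end key, tiebreaking on
   * the presence of an hdfsEntry and finally on the region id.
   */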
3710 final static Comparator<HbckInfo> cmp = new Comparator<HbckInfo>() {
3711 @Override
3712 public int compare(HbckInfo l, HbckInfo r) {
3713 if (l == r) {
3714
3715 return 0;
3716 }
3717
3718 int tableCompare = l.getTableName().compareTo(r.getTableName());
3719 if (tableCompare != 0) {
3720 return tableCompare;
3721 }
3722
3723 int startComparison = RegionSplitCalculator.BYTES_COMPARATOR.compare(
3724 l.getStartKey(), r.getStartKey());
3725 if (startComparison != 0) {
3726 return startComparison;
3727 }
3728
3729
3730 byte[] endKey = r.getEndKey();
3731 endKey = (endKey.length == 0) ? null : endKey;
3732 byte[] endKey2 = l.getEndKey();
3733 endKey2 = (endKey2.length == 0) ? null : endKey2;
3734 int endComparison = RegionSplitCalculator.BYTES_COMPARATOR.compare(
3735 endKey2, endKey);
3736
3737 if (endComparison != 0) {
3738 return endComparison;
3739 }
3740
3741
3742
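      // Tiebreak on the hdfsEntry: a missing hdfsEntry sorts after a present one.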
3743 if (l.hdfsEntry == null && r.hdfsEntry == null) {
3744 return 0;
3745 }
3746 if (l.hdfsEntry == null && r.hdfsEntry != null) {
3747 return 1;
3748 }
3749
3750 if (r.hdfsEntry == null) {
3751 return -1;
3752 }
3753
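      // Long.compare avoids overflow when casting the regionId difference to int.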
      return Long.compare(l.hdfsEntry.hri.getRegionId(), r.hdfsEntry.hri.getRegionId());
3755 }
3756 };
3757
3758
3759
3760
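  /**
   * Prints a summary of all tables found on the system, including skipped regions
   * and the servers each table is deployed on.
   */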
3761 private void printTableSummary(SortedMap<TableName, TableInfo> tablesInfo) {
3762 StringBuilder sb = new StringBuilder();
3763 int numOfSkippedRegions;
3764 errors.print("Summary:");
3765 for (TableInfo tInfo : tablesInfo.values()) {
3766 numOfSkippedRegions = (skippedRegions.containsKey(tInfo.getName())) ?
3767 skippedRegions.get(tInfo.getName()).size() : 0;
3768
3769 if (errors.tableHasErrors(tInfo)) {
3770 errors.print("Table " + tInfo.getName() + " is inconsistent.");
3771 } else if (numOfSkippedRegions > 0){
3772 errors.print("Table " + tInfo.getName() + " is okay (with "
3773 + numOfSkippedRegions + " skipped regions).");
3774 }
3775 else {
3776 errors.print("Table " + tInfo.getName() + " is okay.");
3777 }
3778 errors.print(" Number of regions: " + tInfo.getNumRegions());
3779 if (numOfSkippedRegions > 0) {
3780 Set<String> skippedRegionStrings = skippedRegions.get(tInfo.getName());
3781 System.out.println(" Number of skipped regions: " + numOfSkippedRegions);
3782 System.out.println(" List of skipped regions:");
3783 for(String sr : skippedRegionStrings) {
3784 System.out.println(" " + sr);
3785 }
3786 }
3787 sb.setLength(0);
3788 sb.append(" Deployed on: ");
3789 for (ServerName server : tInfo.deployedOn) {
3790 sb.append(" " + server.toString());
3791 }
3792 errors.print(sb.toString());
3793 }
3794 }
3795
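  /** Instantiates the ErrorReporter implementation named by "hbasefsck.errorreporter". */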
3796 static ErrorReporter getErrorReporter(
3797 final Configuration conf) throws ClassNotFoundException {
    Class<? extends ErrorReporter> reporter =
        conf.getClass("hbasefsck.errorreporter", PrintingErrorReporter.class, ErrorReporter.class);
3799 return ReflectionUtils.newInstance(reporter, conf);
3800 }
3801
3802 public interface ErrorReporter {
3803 enum ERROR_CODE {
3804 UNKNOWN, NO_META_REGION, NULL_META_REGION, NO_VERSION_FILE, NOT_IN_META_HDFS, NOT_IN_META,
3805 NOT_IN_META_OR_DEPLOYED, NOT_IN_HDFS_OR_DEPLOYED, NOT_IN_HDFS, SERVER_DOES_NOT_MATCH_META, NOT_DEPLOYED,
3806 MULTI_DEPLOYED, SHOULD_NOT_BE_DEPLOYED, MULTI_META_REGION, RS_CONNECT_FAILURE,
3807 FIRST_REGION_STARTKEY_NOT_EMPTY, LAST_REGION_ENDKEY_NOT_EMPTY, DUPE_STARTKEYS,
3808 HOLE_IN_REGION_CHAIN, OVERLAP_IN_REGION_CHAIN, REGION_CYCLE, DEGENERATE_REGION,
3809 ORPHAN_HDFS_REGION, LINGERING_SPLIT_PARENT, NO_TABLEINFO_FILE, LINGERING_REFERENCE_HFILE,
3810 WRONG_USAGE, EMPTY_META_CELL, EXPIRED_TABLE_LOCK, ORPHANED_ZK_TABLE_ENTRY, BOUNDARIES_ERROR
3811 }
3812 void clear();
3813 void report(String message);
3814 void reportError(String message);
3815 void reportError(ERROR_CODE errorCode, String message);
3816 void reportError(ERROR_CODE errorCode, String message, TableInfo table);
3817 void reportError(ERROR_CODE errorCode, String message, TableInfo table, HbckInfo info);
3818 void reportError(
3819 ERROR_CODE errorCode,
3820 String message,
3821 TableInfo table,
3822 HbckInfo info1,
3823 HbckInfo info2
3824 );
3825 int summarize();
3826 void detail(String details);
3827 ArrayList<ERROR_CODE> getErrorList();
3828 void progress();
3829 void print(String message);
3830 void resetErrors();
3831 boolean tableHasErrors(TableInfo table);
3832 }
3833
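  /**
   * Default ErrorReporter implementation: prints errors to stdout (or stderr
   * for usage errors) and records the error codes seen.
   */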
3834 static class PrintingErrorReporter implements ErrorReporter {
3835 public int errorCount = 0;
3836 private int showProgress;
3837
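    // Print a progress dot roughly once per this many progress() calls.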
3838 private static final int progressThreshold = 100;
3839
3840 Set<TableInfo> errorTables = new HashSet<TableInfo>();

    // For use by unit tests to verify which errors were discovered.
    private ArrayList<ERROR_CODE> errorList = new ArrayList<ERROR_CODE>();
3844
3845 @Override
3846 public void clear() {
3847 errorTables.clear();
3848 errorList.clear();
3849 errorCount = 0;
3850 }
3851
3852 @Override
3853 public synchronized void reportError(ERROR_CODE errorCode, String message) {
3854 if (errorCode == ERROR_CODE.WRONG_USAGE) {
3855 System.err.println(message);
3856 return;
3857 }
3858
3859 errorList.add(errorCode);
3860 if (!summary) {
3861 System.out.println("ERROR: " + message);
3862 }
3863 errorCount++;
3864 showProgress = 0;
3865 }
3866
3867 @Override
3868 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
3869 errorTables.add(table);
3870 reportError(errorCode, message);
3871 }
3872
3873 @Override
3874 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table,
3875 HbckInfo info) {
3876 errorTables.add(table);
3877 String reference = "(region " + info.getRegionNameAsString() + ")";
3878 reportError(errorCode, reference + " " + message);
3879 }
3880
3881 @Override
3882 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table,
3883 HbckInfo info1, HbckInfo info2) {
3884 errorTables.add(table);
3885 String reference = "(regions " + info1.getRegionNameAsString()
3886 + " and " + info2.getRegionNameAsString() + ")";
3887 reportError(errorCode, reference + " " + message);
3888 }
3889
3890 @Override
3891 public synchronized void reportError(String message) {
3892 reportError(ERROR_CODE.UNKNOWN, message);
3893 }

    /**
     * Report error information, but do not increment the error count.
     * Intended for cases where the error has already been counted.
     */
    @Override
    public synchronized void report(String message) {
      if (!summary) {
        System.out.println("ERROR: " + message);
      }
      showProgress = 0;
    }
3907
3908 @Override
3909 public synchronized int summarize() {
3910 System.out.println(Integer.toString(errorCount) +
3911 " inconsistencies detected.");
3912 if (errorCount == 0) {
3913 System.out.println("Status: OK");
3914 return 0;
3915 } else {
3916 System.out.println("Status: INCONSISTENT");
3917 return -1;
3918 }
3919 }
3920
3921 @Override
3922 public ArrayList<ERROR_CODE> getErrorList() {
3923 return errorList;
3924 }
3925
3926 @Override
3927 public synchronized void print(String message) {
3928 if (!summary) {
3929 System.out.println(message);
3930 }
3931 }
3932
3933 @Override
3934 public boolean tableHasErrors(TableInfo table) {
3935 return errorTables.contains(table);
3936 }
3937
3938 @Override
3939 public void resetErrors() {
3940 errorCount = 0;
3941 }
3942
3943 @Override
3944 public synchronized void detail(String message) {
3945 if (details) {
3946 System.out.println(message);
3947 }
3948 showProgress = 0;
3949 }
3950
3951 @Override
3952 public synchronized void progress() {
3953 if (showProgress++ == progressThreshold) {
3954 if (!summary) {
3955 System.out.print(".");
3956 }
3957 showProgress = 0;
3958 }
3959 }
3960 }
3961
  /**
   * Contact a region server and get all information from it.
   */
3965 static class WorkItemRegion implements Callable<Void> {
3966 private HBaseFsck hbck;
3967 private ServerName rsinfo;
3968 private ErrorReporter errors;
3969 private HConnection connection;
3970
3971 WorkItemRegion(HBaseFsck hbck, ServerName info,
3972 ErrorReporter errors, HConnection connection) {
3973 this.hbck = hbck;
3974 this.rsinfo = info;
3975 this.errors = errors;
3976 this.connection = connection;
3977 }
3978
3979 @Override
3980 public synchronized Void call() throws IOException {
3981 errors.progress();
3982 try {
3983 BlockingInterface server = connection.getAdmin(rsinfo);

        // List all online regions from this region server.
        List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
3987 regions = filterRegions(regions);
3988
3989 if (details) {
3990 errors.detail("RegionServer: " + rsinfo.getServerName() +
3991 " number of regions: " + regions.size());
3992 for (HRegionInfo rinfo: regions) {
3993 errors.detail(" " + rinfo.getRegionNameAsString() +
3994 " id: " + rinfo.getRegionId() +
3995 " encoded_name: " + rinfo.getEncodedName() +
3996 " start: " + Bytes.toStringBinary(rinfo.getStartKey()) +
3997 " end: " + Bytes.toStringBinary(rinfo.getEndKey()));
3998 }
3999 }

        // Check to see if the existence of this region matches the region in META.
        for (HRegionInfo r : regions) {
4003 HbckInfo hbi = hbck.getOrCreateInfo(r.getEncodedName());
4004 hbi.addServer(r, rsinfo);
4005 }
4006 } catch (IOException e) {
4007 errors.reportError(ERROR_CODE.RS_CONNECT_FAILURE, "RegionServer: " + rsinfo.getServerName() +
4008 " Unable to fetch region information. " + e);
4009 throw e;
4010 }
4011 return null;
4012 }
4013
4014 private List<HRegionInfo> filterRegions(List<HRegionInfo> regions) {
4015 List<HRegionInfo> ret = Lists.newArrayList();
4016 for (HRegionInfo hri : regions) {
4017 if (hri.isMetaTable() || (!hbck.checkMetaOnly
4018 && hbck.isTableIncluded(hri.getTable()))) {
4019 ret.add(hri);
4020 }
4021 }
4022 return ret;
4023 }
4024 }
4025
  /**
   * Contact hdfs and get all information about the specified table directory
   * into the per-region HbckInfo entries.
   */
4030 static class WorkItemHdfsDir implements Callable<Void> {
4031 private HBaseFsck hbck;
4032 private FileStatus tableDir;
4033 private ErrorReporter errors;
4034 private FileSystem fs;
4035
4036 WorkItemHdfsDir(HBaseFsck hbck, FileSystem fs, ErrorReporter errors,
4037 FileStatus status) {
4038 this.hbck = hbck;
4039 this.fs = fs;
4040 this.tableDir = status;
4041 this.errors = errors;
4042 }
4043
4044 @Override
4045 public synchronized Void call() throws IOException {
      try {
        // Level 2: <HBASE.dir>/<table>/* -- one entry per region directory.
        FileStatus[] regionDirs = fs.listStatus(tableDir.getPath());
        for (FileStatus regionDir : regionDirs) {
          errors.progress();
          String encodedName = regionDir.getPath().getName();
          // Ignore directories whose names are not hex-encoded region names.
          if (!encodedName.toLowerCase().matches("[0-9a-f]+")) {
            continue;
          }

          LOG.debug("Loading region info from hdfs:" + regionDir.getPath());
          HbckInfo hbi = hbck.getOrCreateInfo(encodedName);
          HdfsEntry he = new HdfsEntry();
          synchronized (hbi) {
            if (hbi.getHdfsRegionDir() != null) {
              errors.print("Directory " + encodedName + " duplicate??" +
                  hbi.getHdfsRegionDir());
            }

            he.hdfsRegionDir = regionDir.getPath();
            he.hdfsRegionDirModTime = regionDir.getModificationTime();
            Path regioninfoFile = new Path(he.hdfsRegionDir, HRegionFileSystem.REGION_INFO_FILE);
            he.hdfsRegioninfoFilePresent = fs.exists(regioninfoFile);
            // The region is added to the orphan list when the attempt to read
            // its .regioninfo fails later; see WorkItemHdfsRegionInfo.

            // Flag regions that contain only recovered edits; such a directory
            // can be left behind by a split.
            he.hdfsOnlyEdits = true;
            FileStatus[] subDirs = fs.listStatus(regionDir.getPath());
            Path ePath = WALSplitter.getRegionDirRecoveredEditsDir(regionDir.getPath());
            for (FileStatus subDir : subDirs) {
              String sdName = subDir.getPath().getName();
              if (!sdName.startsWith(".") && !sdName.equals(ePath.getName())) {
                he.hdfsOnlyEdits = false;
                break;
              }
            }
            hbi.hdfsEntry = he;
          }
        }
      } catch (IOException e) {
        // Unable to list the contents of the table directory.
        errors.reportError(ERROR_CODE.RS_CONNECT_FAILURE, "Table Directory: "
            + tableDir.getPath().getName()
            + " Unable to fetch region information. " + e);
        throw e;
      }
      return null;
    }
  }

  /**
   * Loads the HRegionInfo stored in the .regioninfo file of one region from
   * HDFS, if it has not already been loaded.
   */
4103 static class WorkItemHdfsRegionInfo implements Callable<Void> {
4104 private HbckInfo hbi;
4105 private HBaseFsck hbck;
4106 private ErrorReporter errors;
4107
4108 WorkItemHdfsRegionInfo(HbckInfo hbi, HBaseFsck hbck, ErrorReporter errors) {
4109 this.hbi = hbi;
4110 this.hbck = hbck;
4111 this.errors = errors;
4112 }
4113
4114 @Override
4115 public synchronized Void call() throws IOException {
      // Only load entries that have not already been loaded.
4117 if (hbi.getHdfsHRI() == null) {
4118 try {
4119 errors.progress();
4120 hbck.loadHdfsRegioninfo(hbi);
4121 } catch (IOException ioe) {
4122 String msg = "Orphan region in HDFS: Unable to load .regioninfo from table "
4123 + hbi.getTableName() + " in hdfs dir "
4124 + hbi.getHdfsRegionDir()
4125 + "! It may be an invalid format or version file. Treating as "
4126 + "an orphaned regiondir.";
4127 errors.reportError(ERROR_CODE.ORPHAN_HDFS_REGION, msg);
4128 try {
4129 hbck.debugLsr(hbi.getHdfsRegionDir());
4130 } catch (IOException ioe2) {
4131 LOG.error("Unable to read directory " + hbi.getHdfsRegionDir(), ioe2);
4132 throw ioe2;
4133 }
4134 hbck.orphanHdfsDirs.add(hbi);
4135 throw ioe;
4136 }
4137 }
4138 return null;
4139 }
  }

  /**
   * Display the full report from fsck. This displays all live and dead region
   * servers, and all known regions.
   */
4146 public static void setDisplayFullReport() {
4147 details = true;
4148 }

  /**
   * Set summary mode: print only a summary of the tables and their status.
   */
4154 void setSummary() {
4155 summary = true;
4156 }

  /**
   * Check only the state of the hbase:meta table.
   */
4162 void setCheckMetaOnly() {
4163 checkMetaOnly = true;
4164 }

  /**
   * Check region boundaries between hbase:meta and the store files.
   */
4169 void setRegionBoundariesCheck() {
4170 checkRegionBoundaries = true;
4171 }

  /**
   * Set table locks fix mode: delete table locks held longer than the
   * expiration time (hbase.table.lock.expire.ms).
   */
4177 public void setFixTableLocks(boolean shouldFix) {
4178 fixTableLocks = shouldFix;
4179 fixAny |= shouldFix;
4180 }

  /**
   * Set orphaned table ZNodes fix mode: set the table state to disabled in the
   * ZNode when the table does not exist.
   */
4186 public void setFixTableZNodes(boolean shouldFix) {
4187 fixTableZNodes = shouldFix;
4188 fixAny |= shouldFix;
4189 }

  /**
   * Mark that fsck changed cluster state and should be rerun (after a sleep) to
   * verify that the fixes took; checked via {@link #shouldRerun()}.
   */
4197 void setShouldRerun() {
4198 rerun = true;
4199 }
4200
4201 boolean shouldRerun() {
4202 return rerun;
4203 }

  /**
   * Fix inconsistencies found by fsck. This should try to fix errors (if any)
   * found by the fsck utility.
   */
4209 public void setFixAssignments(boolean shouldFix) {
4210 fixAssignments = shouldFix;
4211 fixAny |= shouldFix;
4212 }
4213
4214 boolean shouldFixAssignments() {
4215 return fixAssignments;
4216 }
4217
4218 public void setFixMeta(boolean shouldFix) {
4219 fixMeta = shouldFix;
4220 fixAny |= shouldFix;
4221 }
4222
4223 boolean shouldFixMeta() {
4224 return fixMeta;
4225 }
4226
4227 public void setFixEmptyMetaCells(boolean shouldFix) {
4228 fixEmptyMetaCells = shouldFix;
4229 fixAny |= shouldFix;
4230 }
4231
4232 boolean shouldFixEmptyMetaCells() {
4233 return fixEmptyMetaCells;
4234 }
4235
4236 public void setCheckHdfs(boolean checking) {
4237 checkHdfs = checking;
4238 }
4239
4240 boolean shouldCheckHdfs() {
4241 return checkHdfs;
4242 }
4243
4244 public void setFixHdfsHoles(boolean shouldFix) {
4245 fixHdfsHoles = shouldFix;
4246 fixAny |= shouldFix;
4247 }
4248
4249 boolean shouldFixHdfsHoles() {
4250 return fixHdfsHoles;
4251 }
4252
4253 public void setFixTableOrphans(boolean shouldFix) {
4254 fixTableOrphans = shouldFix;
4255 fixAny |= shouldFix;
4256 }
4257
4258 boolean shouldFixTableOrphans() {
4259 return fixTableOrphans;
4260 }
4261
4262 public void setFixHdfsOverlaps(boolean shouldFix) {
4263 fixHdfsOverlaps = shouldFix;
4264 fixAny |= shouldFix;
4265 }
4266
4267 boolean shouldFixHdfsOverlaps() {
4268 return fixHdfsOverlaps;
4269 }
4270
4271 public void setFixHdfsOrphans(boolean shouldFix) {
4272 fixHdfsOrphans = shouldFix;
4273 fixAny |= shouldFix;
4274 }
4275
4276 boolean shouldFixHdfsOrphans() {
4277 return fixHdfsOrphans;
4278 }
4279
4280 public void setFixVersionFile(boolean shouldFix) {
4281 fixVersionFile = shouldFix;
4282 fixAny |= shouldFix;
4283 }
4284
4285 public boolean shouldFixVersionFile() {
4286 return fixVersionFile;
4287 }
4288
4289 public void setSidelineBigOverlaps(boolean sbo) {
4290 this.sidelineBigOverlaps = sbo;
4291 }
4292
4293 public boolean shouldSidelineBigOverlaps() {
4294 return sidelineBigOverlaps;
4295 }
4296
4297 public void setFixSplitParents(boolean shouldFix) {
4298 fixSplitParents = shouldFix;
4299 fixAny |= shouldFix;
4300 }
4301
4302 boolean shouldFixSplitParents() {
4303 return fixSplitParents;
4304 }
4305
4306 public void setFixReferenceFiles(boolean shouldFix) {
4307 fixReferenceFiles = shouldFix;
4308 fixAny |= shouldFix;
4309 }
4310
4311 boolean shouldFixReferenceFiles() {
4312 return fixReferenceFiles;
4313 }
4314
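  /**
   * The filesystem permission pre-check only matters when some fix option is
   * enabled; a purely read-only run never writes to HDFS.
   */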
4315 public boolean shouldIgnorePreCheckPermission() {
4316 return !fixAny || ignorePreCheckPermission;
4317 }
4318
4319 public void setIgnorePreCheckPermission(boolean ignorePreCheckPermission) {
4320 this.ignorePreCheckPermission = ignorePreCheckPermission;
4321 }

  /**
   * @param mm maximum number of regions to merge into one when fixing overlaps
   */
4326 public void setMaxMerge(int mm) {
4327 this.maxMerge = mm;
4328 }
4329
4330 public int getMaxMerge() {
4331 return maxMerge;
4332 }
4333
4334 public void setMaxOverlapsToSideline(int mo) {
4335 this.maxOverlapsToSideline = mo;
4336 }
4337
4338 public int getMaxOverlapsToSideline() {
4339 return maxOverlapsToSideline;
4340 }

  /**
   * Only check/fix tables specified by the list; an empty list means all
   * tables are included.
   */
4346 boolean isTableIncluded(TableName table) {
4347 return (tablesIncluded.size() == 0) || tablesIncluded.contains(table);
4348 }
4349
4350 public void includeTable(TableName table) {
4351 tablesIncluded.add(table);
4352 }
4353
4354 Set<TableName> getIncludedTables() {
4355 return new HashSet<TableName>(tablesIncluded);
4356 }

  /**
   * We are interested only in those tables that have not changed their state in
   * hbase:meta during the last few seconds specified by hbase.admin.fsck.timelag.
   * @param seconds the time lag in seconds
   */
  public void setTimeLag(long seconds) {
    timelag = seconds * 1000; // convert to milliseconds
  }

  /**
   * @param sidelineDir HDFS path under which sidelined data is written
   */
4371 public void setSidelineDir(String sidelineDir) {
4372 this.sidelineDir = new Path(sidelineDir);
4373 }
4374
  protected HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles)
      throws IOException {
    return new HFileCorruptionChecker(getConf(), executor, sidelineCorruptHFiles);
  }
4378
4379 public HFileCorruptionChecker getHFilecorruptionChecker() {
4380 return hfcc;
4381 }
4382
4383 public void setHFileCorruptionChecker(HFileCorruptionChecker hfcc) {
4384 this.hfcc = hfcc;
4385 }
4386
4387 public void setRetCode(int code) {
4388 this.retcode = code;
4389 }
4390
4391 public int getRetCode() {
4392 return retcode;
4393 }
4394
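  /**
   * Prints the usage message to the error reporter and sets the return code
   * to -2.
   */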
4395 protected HBaseFsck printUsageAndExit() {
4396 StringWriter sw = new StringWriter(2048);
4397 PrintWriter out = new PrintWriter(sw);
4398 out.println("Usage: fsck [opts] {only tables}");
4399 out.println(" where [opts] are:");
4400 out.println(" -help Display help options (this)");
4401 out.println(" -details Display full report of all regions.");
    out.println(" -timelag <timeInSeconds> Process only regions that " +
        "have not experienced any metadata updates in the last " +
        "<timeInSeconds> seconds.");
4405 out.println(" -sleepBeforeRerun <timeInSeconds> Sleep this many seconds" +
4406 " before checking if the fix worked if run with -fix");
4407 out.println(" -summary Print only summary of the tables and status.");
4408 out.println(" -metaonly Only check the state of the hbase:meta table.");
4409 out.println(" -sidelineDir <hdfs://> HDFS path to backup existing meta.");
    out.println(" -boundaries Verify that region boundaries are the same between META and store files.");
4411
4412 out.println("");
4413 out.println(" Metadata Repair options: (expert features, use with caution!)");
    out.println(" -fix Try to fix region assignments. This is for backwards compatibility");
4415 out.println(" -fixAssignments Try to fix region assignments. Replaces the old -fix");
4416 out.println(" -fixMeta Try to fix meta problems. This assumes HDFS region info is good.");
4417 out.println(" -noHdfsChecking Don't load/check region info from HDFS."
4418 + " Assumes hbase:meta region info is good. Won't check/fix any HDFS issue, e.g. hole, orphan, or overlap");
4419 out.println(" -fixHdfsHoles Try to fix region holes in hdfs.");
4420 out.println(" -fixHdfsOrphans Try to fix region dirs with no .regioninfo file in hdfs");
4421 out.println(" -fixTableOrphans Try to fix table dirs with no .tableinfo file in hdfs (online mode only)");
4422 out.println(" -fixHdfsOverlaps Try to fix region overlaps in hdfs.");
4423 out.println(" -fixVersionFile Try to fix missing hbase.version file in hdfs.");
4424 out.println(" -maxMerge <n> When fixing region overlaps, allow at most <n> regions to merge. (n=" + DEFAULT_MAX_MERGE +" by default)");
4425 out.println(" -sidelineBigOverlaps When fixing region overlaps, allow to sideline big overlaps");
4426 out.println(" -maxOverlapsToSideline <n> When fixing region overlaps, allow at most <n> regions to sideline per group. (n=" + DEFAULT_OVERLAPS_TO_SIDELINE +" by default)");
4427 out.println(" -fixSplitParents Try to force offline split parents to be online.");
4428 out.println(" -ignorePreCheckPermission ignore filesystem permission pre-check");
4429 out.println(" -fixReferenceFiles Try to offline lingering reference store files");
4430 out.println(" -fixEmptyMetaCells Try to fix hbase:meta entries not referencing any region"
4431 + " (empty REGIONINFO_QUALIFIER rows)");
4432
4433 out.println("");
4434 out.println(" Datafile Repair options: (expert features, use with caution!)");
    out.println(" -checkCorruptHFiles Check all HFiles by opening them to make sure they are valid");
    out.println(" -sidelineCorruptHFiles Quarantine corrupted HFiles. Implies -checkCorruptHFiles");
4437
4438 out.println("");
4439 out.println(" Metadata Repair shortcuts");
4440 out.println(" -repair Shortcut for -fixAssignments -fixMeta -fixHdfsHoles " +
4441 "-fixHdfsOrphans -fixHdfsOverlaps -fixVersionFile -sidelineBigOverlaps " +
4442 "-fixReferenceFiles -fixTableLocks -fixOrphanedTableZnodes");
4443 out.println(" -repairHoles Shortcut for -fixAssignments -fixMeta -fixHdfsHoles");
4444
4445 out.println("");
4446 out.println(" Table lock options");
4447 out.println(" -fixTableLocks Deletes table locks held for a long time (hbase.table.lock.expire.ms, 10min by default)");
4448
4449 out.println("");
4450 out.println(" Table Znode options");
    out.println(" -fixOrphanedTableZnodes Set table state in ZNode to disabled if table does not exist");
4452
4453 out.flush();
4454 errors.reportError(ERROR_CODE.WRONG_USAGE, sw.toString());
4455
4456 setRetCode(-2);
4457 return this;
4458 }

  /**
   * Main program. Normally invoked via the hbase script, e.g.
   * {@code hbase hbck -details}.
   *
   * @param args the command-line arguments
   * @throws Exception if hbck exits abnormally
   */
4466 public static void main(String[] args) throws Exception {
4467
4468 Configuration conf = HBaseConfiguration.create();
4469 Path hbasedir = FSUtils.getRootDir(conf);
4470 URI defaultFs = hbasedir.getFileSystem(conf).getUri();
4471 FSUtils.setFsDefault(conf, new Path(defaultFs));
4472 int ret = ToolRunner.run(new HBaseFsckTool(conf), args);
4473 System.exit(ret);
4474 }

  /**
   * This is a Tool wrapper that gathers -Dxxx=yyy configuration settings from
   * the command line.
   */
4479 static class HBaseFsckTool extends Configured implements Tool {
4480 HBaseFsckTool(Configuration conf) { super(conf); }
4481 @Override
4482 public int run(String[] args) throws Exception {
4483 HBaseFsck hbck = new HBaseFsck(getConf());
4484 hbck.exec(hbck.executor, args);
4485 hbck.close();
4486 return hbck.getRetCode();
4487 }
  }
4489
4490
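  /**
   * Parses the command-line arguments, runs the checks and requested fixes,
   * and re-runs a verification pass if any fix was applied. A minimal sketch
   * of programmatic use, mirroring {@link HBaseFsckTool#run}:
   * <pre>
   * HBaseFsck fsck = new HBaseFsck(conf);
   * fsck.exec(fsck.executor, new String[] { "-details" });
   * fsck.close();
   * int ret = fsck.getRetCode();
   * </pre>
   */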
4491 public HBaseFsck exec(ExecutorService exec, String[] args) throws KeeperException, IOException,
4492 ServiceException, InterruptedException {
4493 long sleepBeforeRerun = DEFAULT_SLEEP_BEFORE_RERUN;
4494
4495 boolean checkCorruptHFiles = false;
4496 boolean sidelineCorruptHFiles = false;

    // Process command-line args.
    for (int i = 0; i < args.length; i++) {
4500 String cmd = args[i];
4501 if (cmd.equals("-help") || cmd.equals("-h")) {
4502 return printUsageAndExit();
4503 } else if (cmd.equals("-details")) {
4504 setDisplayFullReport();
4505 } else if (cmd.equals("-timelag")) {
4506 if (i == args.length - 1) {
4507 errors.reportError(ERROR_CODE.WRONG_USAGE, "HBaseFsck: -timelag needs a value.");
4508 return printUsageAndExit();
4509 }
4510 try {
4511 long timelag = Long.parseLong(args[i+1]);
4512 setTimeLag(timelag);
4513 } catch (NumberFormatException e) {
4514 errors.reportError(ERROR_CODE.WRONG_USAGE, "-timelag needs a numeric value.");
4515 return printUsageAndExit();
4516 }
4517 i++;
4518 } else if (cmd.equals("-sleepBeforeRerun")) {
4519 if (i == args.length - 1) {
4520 errors.reportError(ERROR_CODE.WRONG_USAGE,
4521 "HBaseFsck: -sleepBeforeRerun needs a value.");
4522 return printUsageAndExit();
4523 }
4524 try {
4525 sleepBeforeRerun = Long.parseLong(args[i+1]);
4526 } catch (NumberFormatException e) {
4527 errors.reportError(ERROR_CODE.WRONG_USAGE, "-sleepBeforeRerun needs a numeric value.");
4528 return printUsageAndExit();
4529 }
4530 i++;
4531 } else if (cmd.equals("-sidelineDir")) {
4532 if (i == args.length - 1) {
4533 errors.reportError(ERROR_CODE.WRONG_USAGE, "HBaseFsck: -sidelineDir needs a value.");
4534 return printUsageAndExit();
4535 }
4536 i++;
4537 setSidelineDir(args[i]);
4538 } else if (cmd.equals("-fix")) {
4539 errors.reportError(ERROR_CODE.WRONG_USAGE,
4540 "This option is deprecated, please use -fixAssignments instead.");
4541 setFixAssignments(true);
4542 } else if (cmd.equals("-fixAssignments")) {
4543 setFixAssignments(true);
4544 } else if (cmd.equals("-fixMeta")) {
4545 setFixMeta(true);
4546 } else if (cmd.equals("-noHdfsChecking")) {
4547 setCheckHdfs(false);
4548 } else if (cmd.equals("-fixHdfsHoles")) {
4549 setFixHdfsHoles(true);
4550 } else if (cmd.equals("-fixHdfsOrphans")) {
4551 setFixHdfsOrphans(true);
4552 } else if (cmd.equals("-fixTableOrphans")) {
4553 setFixTableOrphans(true);
4554 } else if (cmd.equals("-fixHdfsOverlaps")) {
4555 setFixHdfsOverlaps(true);
4556 } else if (cmd.equals("-fixVersionFile")) {
4557 setFixVersionFile(true);
4558 } else if (cmd.equals("-sidelineBigOverlaps")) {
4559 setSidelineBigOverlaps(true);
4560 } else if (cmd.equals("-fixSplitParents")) {
4561 setFixSplitParents(true);
4562 } else if (cmd.equals("-ignorePreCheckPermission")) {
4563 setIgnorePreCheckPermission(true);
4564 } else if (cmd.equals("-checkCorruptHFiles")) {
4565 checkCorruptHFiles = true;
4566 } else if (cmd.equals("-sidelineCorruptHFiles")) {
4567 sidelineCorruptHFiles = true;
4568 } else if (cmd.equals("-fixReferenceFiles")) {
4569 setFixReferenceFiles(true);
4570 } else if (cmd.equals("-fixEmptyMetaCells")) {
4571 setFixEmptyMetaCells(true);
4572 } else if (cmd.equals("-repair")) {
        // This attempts to merge overlapping hdfs regions; needs testing under load.
4575 setFixHdfsHoles(true);
4576 setFixHdfsOrphans(true);
4577 setFixMeta(true);
4578 setFixAssignments(true);
4579 setFixHdfsOverlaps(true);
4580 setFixVersionFile(true);
4581 setSidelineBigOverlaps(true);
4582 setFixSplitParents(false);
4583 setCheckHdfs(true);
4584 setFixReferenceFiles(true);
4585 setFixTableLocks(true);
4586 setFixTableZNodes(true);
4587 } else if (cmd.equals("-repairHoles")) {
        // This will make all missing hdfs regions available, but may lose data.
4589 setFixHdfsHoles(true);
4590 setFixHdfsOrphans(false);
4591 setFixMeta(true);
4592 setFixAssignments(true);
4593 setFixHdfsOverlaps(false);
4594 setSidelineBigOverlaps(false);
4595 setFixSplitParents(false);
4596 setCheckHdfs(true);
4597 } else if (cmd.equals("-maxOverlapsToSideline")) {
4598 if (i == args.length - 1) {
4599 errors.reportError(ERROR_CODE.WRONG_USAGE,
4600 "-maxOverlapsToSideline needs a numeric value argument.");
4601 return printUsageAndExit();
4602 }
4603 try {
4604 int maxOverlapsToSideline = Integer.parseInt(args[i+1]);
4605 setMaxOverlapsToSideline(maxOverlapsToSideline);
4606 } catch (NumberFormatException e) {
4607 errors.reportError(ERROR_CODE.WRONG_USAGE,
4608 "-maxOverlapsToSideline needs a numeric value argument.");
4609 return printUsageAndExit();
4610 }
4611 i++;
4612 } else if (cmd.equals("-maxMerge")) {
4613 if (i == args.length - 1) {
4614 errors.reportError(ERROR_CODE.WRONG_USAGE,
4615 "-maxMerge needs a numeric value argument.");
4616 return printUsageAndExit();
4617 }
4618 try {
4619 int maxMerge = Integer.parseInt(args[i+1]);
4620 setMaxMerge(maxMerge);
4621 } catch (NumberFormatException e) {
4622 errors.reportError(ERROR_CODE.WRONG_USAGE,
4623 "-maxMerge needs a numeric value argument.");
4624 return printUsageAndExit();
4625 }
4626 i++;
4627 } else if (cmd.equals("-summary")) {
4628 setSummary();
4629 } else if (cmd.equals("-metaonly")) {
4630 setCheckMetaOnly();
4631 } else if (cmd.equals("-boundaries")) {
4632 setRegionBoundariesCheck();
4633 } else if (cmd.equals("-fixTableLocks")) {
4634 setFixTableLocks(true);
4635 } else if (cmd.equals("-fixOrphanedTableZnodes")) {
4636 setFixTableZNodes(true);
4637 } else if (cmd.startsWith("-")) {
        errors.reportError(ERROR_CODE.WRONG_USAGE, "Unrecognized option: " + cmd);
4639 return printUsageAndExit();
4640 } else {
4641 includeTable(TableName.valueOf(cmd));
4642 errors.print("Allow checking/fixes for table: " + cmd);
4643 }
4644 }
4645
4646 errors.print("HBaseFsck command line options: " + StringUtils.join(args, " "));

    // Pre-check that the current user has FS write permission before doing repairs.
    try {
      preCheckPermission();
    } catch (AccessDeniedException ace) {
      LOG.error("Insufficient permissions to run hbck", ace);
      Runtime.getRuntime().exit(-1);
    } catch (IOException ioe) {
      LOG.error("Failed filesystem permission pre-check", ioe);
      Runtime.getRuntime().exit(-1);
    }

    // Do the real work of hbck.
    connect();
4659
4660 try {
      // If corrupt file mode is on, first fix them since they may be opened later.
4662 if (checkCorruptHFiles || sidelineCorruptHFiles) {
4663 LOG.info("Checking all hfiles for corruption");
4664 HFileCorruptionChecker hfcc = createHFileCorruptionChecker(sidelineCorruptHFiles);
4665 setHFileCorruptionChecker(hfcc);
4666 Collection<TableName> tables = getIncludedTables();
4667 Collection<Path> tableDirs = new ArrayList<Path>();
4668 Path rootdir = FSUtils.getRootDir(getConf());
4669 if (tables.size() > 0) {
4670 for (TableName t : tables) {
4671 tableDirs.add(FSUtils.getTableDir(rootdir, t));
4672 }
4673 } else {
4674 tableDirs = FSUtils.getTableDirs(FSUtils.getCurrentFileSystem(getConf()), rootdir);
4675 }
4676 hfcc.checkTables(tableDirs);
4677 hfcc.report(errors);
4678 }

      // Check and fix table integrity and region consistency.
      int code = onlineHbck();
4682 setRetCode(code);

      // If we have changed the HBase state, it is better to run hbck again to
      // see if we have not left some low-hanging fruit.
      if (shouldRerun()) {
4688 try {
4689 LOG.info("Sleeping " + sleepBeforeRerun + "ms before re-checking after fix...");
4690 Thread.sleep(sleepBeforeRerun);
4691 } catch (InterruptedException ie) {
4692 LOG.warn("Interrupted while sleeping");
4693 return this;
4694 }

        // Just report this time; do not re-apply fixes on the verification run.
        setFixAssignments(false);
4697 setFixMeta(false);
4698 setFixHdfsHoles(false);
4699 setFixHdfsOverlaps(false);
4700 setFixVersionFile(false);
4701 setFixTableOrphans(false);
4702 errors.resetErrors();
4703 code = onlineHbck();
4704 setRetCode(code);
4705 }
4706 } finally {
4707 IOUtils.cleanup(null, this);
4708 }
4709 return this;
4710 }

  /**
   * ls -r for debugging purposes.
   */
  void debugLsr(Path p) throws IOException {
    debugLsr(getConf(), p, errors);
  }

  /**
   * ls -r for debugging purposes.
   */
  public static void debugLsr(Configuration conf,
      Path p) throws IOException {
    debugLsr(conf, p, new PrintingErrorReporter());
  }

  /**
   * ls -r for debugging purposes.
   */
  public static void debugLsr(Configuration conf,
      Path p, ErrorReporter errors) throws IOException {
    if (!LOG.isDebugEnabled() || p == null) {
      return;
    }
    FileSystem fs = p.getFileSystem(conf);

    if (!fs.exists(p)) {
      // nothing
      return;
    }
    errors.print(p.toString());

    if (fs.isFile(p)) {
      return;
    }

    if (fs.getFileStatus(p).isDirectory()) {
      FileStatus[] fss = fs.listStatus(p);
      for (FileStatus status : fss) {
        debugLsr(conf, status.getPath(), errors);
      }
    }
  }
4754 }