001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master.snapshot; 019 020import java.io.FileNotFoundException; 021import java.io.IOException; 022import java.util.ArrayList; 023import java.util.Collections; 024import java.util.HashMap; 025import java.util.HashSet; 026import java.util.Iterator; 027import java.util.List; 028import java.util.Map; 029import java.util.Set; 030import java.util.concurrent.ConcurrentHashMap; 031import java.util.concurrent.Executors; 032import java.util.concurrent.ScheduledExecutorService; 033import java.util.concurrent.ScheduledFuture; 034import java.util.concurrent.ThreadPoolExecutor; 035import java.util.concurrent.TimeUnit; 036import java.util.concurrent.locks.ReadWriteLock; 037import java.util.concurrent.locks.ReentrantReadWriteLock; 038import org.apache.hadoop.conf.Configuration; 039import org.apache.hadoop.fs.FSDataInputStream; 040import org.apache.hadoop.fs.FileStatus; 041import org.apache.hadoop.fs.FileSystem; 042import org.apache.hadoop.fs.Path; 043import org.apache.hadoop.hbase.HBaseInterfaceAudience; 044import org.apache.hadoop.hbase.HConstants; 045import org.apache.hadoop.hbase.Stoppable; 046import 
org.apache.hadoop.hbase.TableName; 047import org.apache.hadoop.hbase.client.TableDescriptor; 048import org.apache.hadoop.hbase.client.TableDescriptorBuilder; 049import org.apache.hadoop.hbase.client.TableState; 050import org.apache.hadoop.hbase.errorhandling.ForeignException; 051import org.apache.hadoop.hbase.executor.ExecutorService; 052import org.apache.hadoop.hbase.ipc.RpcServer; 053import org.apache.hadoop.hbase.master.MasterCoprocessorHost; 054import org.apache.hadoop.hbase.master.MasterFileSystem; 055import org.apache.hadoop.hbase.master.MasterServices; 056import org.apache.hadoop.hbase.master.MetricsMaster; 057import org.apache.hadoop.hbase.master.SnapshotSentinel; 058import org.apache.hadoop.hbase.master.cleaner.HFileCleaner; 059import org.apache.hadoop.hbase.master.cleaner.HFileLinkCleaner; 060import org.apache.hadoop.hbase.master.procedure.CloneSnapshotProcedure; 061import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; 062import org.apache.hadoop.hbase.master.procedure.RestoreSnapshotProcedure; 063import org.apache.hadoop.hbase.procedure.MasterProcedureManager; 064import org.apache.hadoop.hbase.procedure.Procedure; 065import org.apache.hadoop.hbase.procedure.ProcedureCoordinator; 066import org.apache.hadoop.hbase.procedure.ProcedureCoordinatorRpcs; 067import org.apache.hadoop.hbase.procedure.ZKProcedureCoordinator; 068import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; 069import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTrackerValidationUtils; 070import org.apache.hadoop.hbase.security.AccessDeniedException; 071import org.apache.hadoop.hbase.security.User; 072import org.apache.hadoop.hbase.security.access.AccessChecker; 073import org.apache.hadoop.hbase.security.access.SnapshotScannerHDFSAclCleaner; 074import org.apache.hadoop.hbase.security.access.SnapshotScannerHDFSAclHelper; 075import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils; 076import 
org.apache.hadoop.hbase.snapshot.HBaseSnapshotException; 077import org.apache.hadoop.hbase.snapshot.RestoreSnapshotException; 078import org.apache.hadoop.hbase.snapshot.SnapshotCreationException; 079import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils; 080import org.apache.hadoop.hbase.snapshot.SnapshotDoesNotExistException; 081import org.apache.hadoop.hbase.snapshot.SnapshotExistsException; 082import org.apache.hadoop.hbase.snapshot.SnapshotManifest; 083import org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil; 084import org.apache.hadoop.hbase.snapshot.TablePartiallyOpenException; 085import org.apache.hadoop.hbase.snapshot.UnknownSnapshotException; 086import org.apache.hadoop.hbase.util.CommonFSUtils; 087import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; 088import org.apache.hadoop.hbase.util.NonceKey; 089import org.apache.hadoop.hbase.util.TableDescriptorChecker; 090import org.apache.yetus.audience.InterfaceAudience; 091import org.apache.yetus.audience.InterfaceStability; 092import org.apache.zookeeper.KeeperException; 093import org.slf4j.Logger; 094import org.slf4j.LoggerFactory; 095 096import org.apache.hbase.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder; 097 098import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; 099import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos.NameStringPair; 100import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos.ProcedureDescription; 101import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription; 102import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription.Type; 103 104/** 105 * This class manages the procedure of taking and restoring snapshots. There is only one 106 * SnapshotManager for the master. 107 * <p> 108 * The class provides methods for monitoring in-progress snapshot actions. 
* <p>
 * Note: Currently there can only be one snapshot being taken at a time over the cluster. This is a
 * simplification in the current implementation.
 */
@InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.CONFIG)
@InterfaceStability.Unstable
public class SnapshotManager extends MasterProcedureManager implements Stoppable {
  private static final Logger LOG = LoggerFactory.getLogger(SnapshotManager.class);

  /** By default, check to see if the snapshot is complete every WAKE MILLIS (ms) */
  private static final int SNAPSHOT_WAKE_MILLIS_DEFAULT = 500;

  /**
   * Wait time before removing a finished sentinel from the in-progress map.
   * <p>
   * NOTE: This is used as a safety auto cleanup. The snapshot and restore handlers map entries are
   * removed when a user asks if a snapshot or restore is completed. This operation is part of the
   * HBaseAdmin snapshot/restore API flow. In case something fails on the client side and the
   * snapshot/restore state is not reclaimed after a default timeout, the entry is removed from the
   * in-progress map. At this point, if the user asks for the snapshot/restore status, the result
   * will be "snapshot done" if it exists or "failed" if it doesn't exist.
   */
  public static final String HBASE_SNAPSHOT_SENTINELS_CLEANUP_TIMEOUT_MILLIS =
    "hbase.snapshot.sentinels.cleanup.timeoutMillis";
  /** Default cleanup timeout: 60 seconds, expressed in milliseconds. */
  public static final long SNAPSHOT_SENTINELS_CLEANUP_TIMEOUT_MILLS_DEFAULT = 60 * 1000L;

  /** Enable or disable snapshot support */
  public static final String HBASE_SNAPSHOT_ENABLED = "hbase.snapshot.enabled";

  /**
   * Conf key for # of ms elapsed between checks for snapshot errors while waiting for completion.
   */
  private static final String SNAPSHOT_WAKE_MILLIS_KEY = "hbase.snapshot.master.wakeMillis";

  /** Name of the operation to use in the controller */
  public static final String ONLINE_SNAPSHOT_CONTROLLER_DESCRIPTION = "online-snapshot";

  /** Conf key for # of threads used by the SnapshotManager thread pool */
  public static final String SNAPSHOT_POOL_THREADS_KEY = "hbase.snapshot.master.threads";

  /**
   * Default number of concurrent snapshot operations on the master. Presumably the default for
   * {@link #SNAPSHOT_POOL_THREADS_KEY}; the usage is not in this part of the file - TODO confirm.
   */
  public static final int SNAPSHOT_POOL_THREADS_DEFAULT = 1;

  /** Conf key for preserving original max file size configs */
  public static final String SNAPSHOT_MAX_FILE_SIZE_PRESERVE =
    "hbase.snapshot.max.filesize.preserve";

  private boolean stopped;
  private MasterServices master; // Needed by TableEventHandlers
  private ProcedureCoordinator coordinator;

  // Is snapshot feature enabled?
  private boolean isSnapshotSupported = false;

  // Snapshot handlers map, with table name as key.
  // snapshotTable() will insert a handler in the map.
  // isSnapshotDone() will remove the handler requested if the operation is finished.
  // NOTE(review): the map is a ConcurrentHashMap. The writers in snapshotTable() and
  // setSnapshotHandlerForTesting() are synchronized, but isTakingSnapshot(TableName) reads it
  // without taking the object lock, so "always under the object lock" does not hold for reads.
  private final Map<TableName, SnapshotSentinel> snapshotHandlers = new ConcurrentHashMap<>();
  // Single daemon thread that runs the periodic sentinel cleanup scheduled by the constructor.
  private final ScheduledExecutorService scheduleThreadPool =
    Executors.newScheduledThreadPool(1, new ThreadFactoryBuilder()
      .setNameFormat("SnapshotHandlerChoreCleaner").setDaemon(true).build());
  // Handle of the scheduled cleanup task; assigned by the four-argument constructor.
  private ScheduledFuture<?> snapshotHandlerChoreCleanerTask;

  // Restore map, with table name as key, procedure ID as value.
  // The map is always accessed and modified under the object lock using synchronized.
  // restoreSnapshot()/cloneSnapshot() will insert a procedure ID in the map.
  //
  // TODO: just as the Apache HBase 1.x implementation, this map would not survive master
  // restart/failover. This is just a stopgap implementation until implementation of taking
  // snapshot using Procedure-V2.
  private Map<TableName, Long> restoreTableToProcIdMap = new HashMap<>();

  private Path rootDir;
  private ExecutorService executorService;

  /**
   * Read write lock between taking snapshot and snapshot HFile cleaner. The cleaner should skip to
   * check the HFiles if any snapshot is in progress, otherwise it may clean a HFile which would
   * belongs to the newly creating snapshot. So we should grab the write lock first when cleaner
   * start to work. (See HBASE-21387)
   */
  private ReentrantReadWriteLock takingSnapshotLock = new ReentrantReadWriteLock(true);

  /** Creates an uninitialized manager; none of the instance fields are populated here. */
  public SnapshotManager() {
  }

  /**
   * Fully specify all necessary components of a snapshot manager. Exposed for testing.
   * @param master                services for the master where the manager is running
   * @param coordinator           procedure coordinator instance. exposed for testing.
   * @param pool                  HBase ExecutorService instance, exposed for testing.
   * @param sentinelCleanInterval initial delay and period, in seconds, of the scheduled sentinel
   *                              cleanup task
   * @throws IOException                   if the working snapshot directory cannot be reset
   * @throws UnsupportedOperationException presumably raised by the snapshot support check when
   *                                       the configuration is inconsistent - TODO confirm
   */
  @InterfaceAudience.Private
  SnapshotManager(final MasterServices master, ProcedureCoordinator coordinator,
    ExecutorService pool, int sentinelCleanInterval)
    throws IOException, UnsupportedOperationException {
    this.master = master;

    this.rootDir = master.getMasterFileSystem().getRootDir();
    Configuration conf = master.getConfiguration();
    checkSnapshotSupport(conf, master.getMasterFileSystem());

    this.coordinator = coordinator;
    this.executorService = pool;
    // Drop leftovers of snapshot attempts that were in flight when a previous master stopped.
    resetTempDir();
    // The same value is used as initial delay and as period, both in seconds.
    snapshotHandlerChoreCleanerTask = this.scheduleThreadPool.scheduleAtFixedRate(
      this::cleanupSentinels, sentinelCleanInterval, sentinelCleanInterval, TimeUnit.SECONDS);
  }

  /**
   * Gets the list of all completed snapshots.
220 * @return list of SnapshotDescriptions 221 * @throws IOException File system exception 222 */ 223 public List<SnapshotDescription> getCompletedSnapshots() throws IOException { 224 return getCompletedSnapshots(SnapshotDescriptionUtils.getSnapshotsDir(rootDir), true); 225 } 226 227 /** 228 * Gets the list of all completed snapshots. 229 * @param snapshotDir snapshot directory 230 * @param withCpCall Whether to call CP hooks 231 * @return list of SnapshotDescriptions 232 * @throws IOException File system exception 233 */ 234 private List<SnapshotDescription> getCompletedSnapshots(Path snapshotDir, boolean withCpCall) 235 throws IOException { 236 List<SnapshotDescription> snapshotDescs = new ArrayList<>(); 237 // first create the snapshot root path and check to see if it exists 238 FileSystem fs = master.getMasterFileSystem().getFileSystem(); 239 if (snapshotDir == null) snapshotDir = SnapshotDescriptionUtils.getSnapshotsDir(rootDir); 240 241 // if there are no snapshots, return an empty list 242 if (!fs.exists(snapshotDir)) { 243 return snapshotDescs; 244 } 245 246 // ignore all the snapshots in progress 247 FileStatus[] snapshots = fs.listStatus(snapshotDir, 248 new SnapshotDescriptionUtils.CompletedSnaphotDirectoriesFilter(fs)); 249 MasterCoprocessorHost cpHost = master.getMasterCoprocessorHost(); 250 withCpCall = withCpCall && cpHost != null; 251 // loop through all the completed snapshots 252 for (FileStatus snapshot : snapshots) { 253 Path info = new Path(snapshot.getPath(), SnapshotDescriptionUtils.SNAPSHOTINFO_FILE); 254 // if the snapshot is bad 255 if (!fs.exists(info)) { 256 LOG.error("Snapshot information for " + snapshot.getPath() + " doesn't exist"); 257 continue; 258 } 259 FSDataInputStream in = null; 260 try { 261 in = fs.open(info); 262 SnapshotDescription desc = SnapshotDescription.parseFrom(in); 263 org.apache.hadoop.hbase.client.SnapshotDescription descPOJO = 264 (withCpCall) ? 
ProtobufUtil.createSnapshotDesc(desc) : null; 265 if (withCpCall) { 266 try { 267 cpHost.preListSnapshot(descPOJO); 268 } catch (AccessDeniedException e) { 269 LOG.warn("Current user does not have access to " + desc.getName() + " snapshot. " 270 + "Either you should be owner of this snapshot or admin user."); 271 // Skip this and try for next snapshot 272 continue; 273 } 274 } 275 snapshotDescs.add(desc); 276 277 // call coproc post hook 278 if (withCpCall) { 279 cpHost.postListSnapshot(descPOJO); 280 } 281 } catch (IOException e) { 282 LOG.warn("Found a corrupted snapshot " + snapshot.getPath(), e); 283 } finally { 284 if (in != null) { 285 in.close(); 286 } 287 } 288 } 289 return snapshotDescs; 290 } 291 292 /** 293 * Cleans up any snapshots in the snapshot/.tmp directory that were left from failed snapshot 294 * attempts. 295 * @throws IOException if we can't reach the filesystem 296 */ 297 private void resetTempDir() throws IOException { 298 // cleanup any existing snapshots. 299 Path tmpdir = 300 SnapshotDescriptionUtils.getWorkingSnapshotDir(rootDir, master.getConfiguration()); 301 FileSystem tmpFs = tmpdir.getFileSystem(master.getConfiguration()); 302 if (!tmpFs.delete(tmpdir, true)) { 303 LOG.warn("Couldn't delete working snapshot directory: " + tmpdir); 304 } 305 } 306 307 /** 308 * Delete the specified snapshot n * @throws SnapshotDoesNotExistException If the specified 309 * snapshot does not exist. 
310 * @throws IOException For filesystem IOExceptions 311 */ 312 public void deleteSnapshot(SnapshotDescription snapshot) throws IOException { 313 // check to see if it is completed 314 if (!isSnapshotCompleted(snapshot)) { 315 throw new SnapshotDoesNotExistException(ProtobufUtil.createSnapshotDesc(snapshot)); 316 } 317 318 String snapshotName = snapshot.getName(); 319 // first create the snapshot description and check to see if it exists 320 FileSystem fs = master.getMasterFileSystem().getFileSystem(); 321 Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshotName, rootDir); 322 // Get snapshot info from file system. The one passed as parameter is a "fake" snapshotInfo with 323 // just the "name" and it does not contains the "real" snapshot information 324 snapshot = SnapshotDescriptionUtils.readSnapshotInfo(fs, snapshotDir); 325 326 // call coproc pre hook 327 MasterCoprocessorHost cpHost = master.getMasterCoprocessorHost(); 328 org.apache.hadoop.hbase.client.SnapshotDescription snapshotPOJO = null; 329 if (cpHost != null) { 330 snapshotPOJO = ProtobufUtil.createSnapshotDesc(snapshot); 331 cpHost.preDeleteSnapshot(snapshotPOJO); 332 } 333 334 LOG.debug("Deleting snapshot: " + snapshotName); 335 // delete the existing snapshot 336 if (!fs.delete(snapshotDir, true)) { 337 throw new HBaseSnapshotException("Failed to delete snapshot directory: " + snapshotDir); 338 } 339 340 // call coproc post hook 341 if (cpHost != null) { 342 cpHost.postDeleteSnapshot(snapshotPOJO); 343 } 344 345 } 346 347 /** 348 * Check if the specified snapshot is done n * @return true if snapshot is ready to be restored, 349 * false if it is still being taken. 350 * @throws IOException IOException if error from HDFS or RPC 351 * @throws UnknownSnapshotException if snapshot is invalid or does not exist. 
352 */ 353 public boolean isSnapshotDone(SnapshotDescription expected) throws IOException { 354 // check the request to make sure it has a snapshot 355 if (expected == null) { 356 throw new UnknownSnapshotException( 357 "No snapshot name passed in request, can't figure out which snapshot you want to check."); 358 } 359 360 String ssString = ClientSnapshotDescriptionUtils.toString(expected); 361 362 // check to see if the sentinel exists, 363 // and if the task is complete removes it from the in-progress snapshots map. 364 SnapshotSentinel handler = removeSentinelIfFinished(this.snapshotHandlers, expected); 365 366 // stop tracking "abandoned" handlers 367 cleanupSentinels(); 368 369 if (handler == null) { 370 // If there's no handler in the in-progress map, it means one of the following: 371 // - someone has already requested the snapshot state 372 // - the requested snapshot was completed long time ago (cleanupSentinels() timeout) 373 // - the snapshot was never requested 374 // In those cases returns to the user the "done state" if the snapshots exists on disk, 375 // otherwise raise an exception saying that the snapshot is not running and doesn't exist. 376 if (!isSnapshotCompleted(expected)) { 377 throw new UnknownSnapshotException("Snapshot " + ssString 378 + " is not currently running or one of the known completed snapshots."); 379 } 380 // was done, return true; 381 return true; 382 } 383 384 // pass on any failure we find in the sentinel 385 try { 386 handler.rethrowExceptionIfFailed(); 387 } catch (ForeignException e) { 388 // Give some procedure info on an exception. 389 String status; 390 Procedure p = coordinator.getProcedure(expected.getName()); 391 if (p != null) { 392 status = p.getStatus(); 393 } else { 394 status = expected.getName() + " not found in proclist " + coordinator.getProcedureNames(); 395 } 396 throw new HBaseSnapshotException("Snapshot " + ssString + " had an error. 
" + status, e, 397 ProtobufUtil.createSnapshotDesc(expected)); 398 } 399 400 // check to see if we are done 401 if (handler.isFinished()) { 402 LOG.debug("Snapshot '" + ssString + "' has completed, notifying client."); 403 return true; 404 } else if (LOG.isDebugEnabled()) { 405 LOG.debug("Snapshoting '" + ssString + "' is still in progress!"); 406 } 407 return false; 408 } 409 410 /** 411 * Check to see if there is a snapshot in progress with the same name or on the same table. 412 * Currently we have a limitation only allowing a single snapshot per table at a time. Also we 413 * don't allow snapshot with the same name. 414 * @param snapshot description of the snapshot being checked. 415 * @return <tt>true</tt> if there is a snapshot in progress with the same name or on the same 416 * table. 417 */ 418 synchronized boolean isTakingSnapshot(final SnapshotDescription snapshot) { 419 TableName snapshotTable = TableName.valueOf(snapshot.getTable()); 420 if (isTakingSnapshot(snapshotTable)) { 421 return true; 422 } 423 Iterator<Map.Entry<TableName, SnapshotSentinel>> it = 424 this.snapshotHandlers.entrySet().iterator(); 425 while (it.hasNext()) { 426 Map.Entry<TableName, SnapshotSentinel> entry = it.next(); 427 SnapshotSentinel sentinel = entry.getValue(); 428 if (snapshot.getName().equals(sentinel.getSnapshot().getName()) && !sentinel.isFinished()) { 429 return true; 430 } 431 } 432 return false; 433 } 434 435 /** 436 * Check to see if the specified table has a snapshot in progress. Currently we have a limitation 437 * only allowing a single snapshot per table at a time. 438 * @param tableName name of the table being snapshotted. 439 * @return <tt>true</tt> if there is a snapshot in progress on the specified table. 
440 */ 441 public boolean isTakingSnapshot(final TableName tableName) { 442 SnapshotSentinel handler = this.snapshotHandlers.get(tableName); 443 return handler != null && !handler.isFinished(); 444 } 445 446 /** 447 * Check to make sure that we are OK to run the passed snapshot. Checks to make sure that we 448 * aren't already running a snapshot or restore on the requested table. 449 * @param snapshot description of the snapshot we want to start 450 * @throws HBaseSnapshotException if the filesystem could not be prepared to start the snapshot 451 */ 452 private synchronized void prepareToTakeSnapshot(SnapshotDescription snapshot) 453 throws HBaseSnapshotException { 454 Path workingDir = 455 SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir, master.getConfiguration()); 456 TableName snapshotTable = TableName.valueOf(snapshot.getTable()); 457 458 // make sure we aren't already running a snapshot 459 if (isTakingSnapshot(snapshot)) { 460 SnapshotSentinel handler = this.snapshotHandlers.get(snapshotTable); 461 throw new SnapshotCreationException("Rejected taking " 462 + ClientSnapshotDescriptionUtils.toString(snapshot) 463 + " because we are already running another snapshot " 464 + (handler != null 465 ? ("on the same table " + ClientSnapshotDescriptionUtils.toString(handler.getSnapshot())) 466 : "with the same name"), 467 ProtobufUtil.createSnapshotDesc(snapshot)); 468 } 469 470 // make sure we aren't running a restore on the same table 471 if (isRestoringTable(snapshotTable)) { 472 throw new SnapshotCreationException( 473 "Rejected taking " + ClientSnapshotDescriptionUtils.toString(snapshot) 474 + " because we are already have a restore in progress on the same snapshot."); 475 } 476 477 try { 478 FileSystem workingDirFS = workingDir.getFileSystem(master.getConfiguration()); 479 // delete the working directory, since we aren't running the snapshot. Likely leftovers 480 // from a failed attempt. 
481 workingDirFS.delete(workingDir, true); 482 483 // recreate the working directory for the snapshot 484 if (!workingDirFS.mkdirs(workingDir)) { 485 throw new SnapshotCreationException( 486 "Couldn't create working directory (" + workingDir + ") for snapshot", 487 ProtobufUtil.createSnapshotDesc(snapshot)); 488 } 489 } catch (HBaseSnapshotException e) { 490 throw e; 491 } catch (IOException e) { 492 throw new SnapshotCreationException( 493 "Exception while checking to see if snapshot could be started.", e, 494 ProtobufUtil.createSnapshotDesc(snapshot)); 495 } 496 } 497 498 /** 499 * Take a snapshot of a disabled table. 500 * @param snapshot description of the snapshot to take. Modified to be {@link Type#DISABLED}. 501 * @throws IOException if the snapshot could not be started or filesystem for snapshot temporary 502 * directory could not be determined 503 */ 504 private synchronized void snapshotDisabledTable(SnapshotDescription snapshot) throws IOException { 505 // setup the snapshot 506 prepareToTakeSnapshot(snapshot); 507 508 // set the snapshot to be a disabled snapshot, since the client doesn't know about that 509 snapshot = snapshot.toBuilder().setType(Type.DISABLED).build(); 510 511 // Take the snapshot of the disabled table 512 DisabledTableSnapshotHandler handler = new DisabledTableSnapshotHandler(snapshot, master, this); 513 snapshotTable(snapshot, handler); 514 } 515 516 /** 517 * Take a snapshot of an enabled table. 518 * @param snapshot description of the snapshot to take. 
519 * @throws IOException if the snapshot could not be started or filesystem for snapshot temporary 520 * directory could not be determined 521 */ 522 private synchronized void snapshotEnabledTable(SnapshotDescription snapshot) throws IOException { 523 // setup the snapshot 524 prepareToTakeSnapshot(snapshot); 525 526 // Take the snapshot of the enabled table 527 EnabledTableSnapshotHandler handler = new EnabledTableSnapshotHandler(snapshot, master, this); 528 snapshotTable(snapshot, handler); 529 } 530 531 /** 532 * Take a snapshot using the specified handler. On failure the snapshot temporary working 533 * directory is removed. NOTE: prepareToTakeSnapshot() called before this one takes care of the 534 * rejecting the snapshot request if the table is busy with another snapshot/restore operation. 535 * @param snapshot the snapshot description 536 * @param handler the snapshot handler 537 */ 538 private synchronized void snapshotTable(SnapshotDescription snapshot, 539 final TakeSnapshotHandler handler) throws IOException { 540 try { 541 handler.prepare(); 542 this.executorService.submit(handler); 543 this.snapshotHandlers.put(TableName.valueOf(snapshot.getTable()), handler); 544 } catch (Exception e) { 545 // cleanup the working directory by trying to delete it from the fs. 
546 Path workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir, 547 master.getConfiguration()); 548 FileSystem workingDirFs = workingDir.getFileSystem(master.getConfiguration()); 549 try { 550 if (!workingDirFs.delete(workingDir, true)) { 551 LOG.error("Couldn't delete working directory (" + workingDir + " for snapshot:" 552 + ClientSnapshotDescriptionUtils.toString(snapshot)); 553 } 554 } catch (IOException e1) { 555 LOG.error("Couldn't delete working directory (" + workingDir + " for snapshot:" 556 + ClientSnapshotDescriptionUtils.toString(snapshot)); 557 } 558 // fail the snapshot 559 throw new SnapshotCreationException("Could not build snapshot handler", e, 560 ProtobufUtil.createSnapshotDesc(snapshot)); 561 } 562 } 563 564 public ReadWriteLock getTakingSnapshotLock() { 565 return this.takingSnapshotLock; 566 } 567 568 /** 569 * The snapshot operation processing as following: <br> 570 * 1. Create a Snapshot Handler, and do some initialization; <br> 571 * 2. Put the handler into snapshotHandlers <br> 572 * So when we consider if any snapshot is taking, we should consider both the takingSnapshotLock 573 * and snapshotHandlers; 574 * @return true to indicate that there're some running snapshots. 575 */ 576 public synchronized boolean isTakingAnySnapshot() { 577 return this.takingSnapshotLock.getReadHoldCount() > 0 || this.snapshotHandlers.size() > 0; 578 } 579 580 /** 581 * Take a snapshot based on the enabled/disabled state of the table. n * @throws 582 * HBaseSnapshotException when a snapshot specific exception occurs. 583 * @throws IOException when some sort of generic IO exception occurs. 
584 */ 585 public void takeSnapshot(SnapshotDescription snapshot) throws IOException { 586 this.takingSnapshotLock.readLock().lock(); 587 try { 588 takeSnapshotInternal(snapshot); 589 } finally { 590 this.takingSnapshotLock.readLock().unlock(); 591 } 592 } 593 594 private void takeSnapshotInternal(SnapshotDescription snapshot) throws IOException { 595 // check to see if we already completed the snapshot 596 if (isSnapshotCompleted(snapshot)) { 597 throw new SnapshotExistsException( 598 "Snapshot '" + snapshot.getName() + "' already stored on the filesystem.", 599 ProtobufUtil.createSnapshotDesc(snapshot)); 600 } 601 602 LOG.debug("No existing snapshot, attempting snapshot..."); 603 604 // stop tracking "abandoned" handlers 605 cleanupSentinels(); 606 607 // check to see if the table exists 608 TableDescriptor desc = null; 609 try { 610 desc = master.getTableDescriptors().get(TableName.valueOf(snapshot.getTable())); 611 } catch (FileNotFoundException e) { 612 String msg = "Table:" + snapshot.getTable() + " info doesn't exist!"; 613 LOG.error(msg); 614 throw new SnapshotCreationException(msg, e, ProtobufUtil.createSnapshotDesc(snapshot)); 615 } catch (IOException e) { 616 throw new SnapshotCreationException( 617 "Error while geting table description for table " + snapshot.getTable(), e, 618 ProtobufUtil.createSnapshotDesc(snapshot)); 619 } 620 if (desc == null) { 621 throw new SnapshotCreationException( 622 "Table '" + snapshot.getTable() + "' doesn't exist, can't take snapshot.", 623 ProtobufUtil.createSnapshotDesc(snapshot)); 624 } 625 SnapshotDescription.Builder builder = snapshot.toBuilder(); 626 // if not specified, set the snapshot format 627 if (!snapshot.hasVersion()) { 628 builder.setVersion(SnapshotDescriptionUtils.SNAPSHOT_LAYOUT_VERSION); 629 } 630 RpcServer.getRequestUser().ifPresent(user -> { 631 if (AccessChecker.isAuthorizationSupported(master.getConfiguration())) { 632 builder.setOwner(user.getShortName()); 633 } 634 }); 635 snapshot = 
builder.build(); 636 637 // call pre coproc hook 638 MasterCoprocessorHost cpHost = master.getMasterCoprocessorHost(); 639 org.apache.hadoop.hbase.client.SnapshotDescription snapshotPOJO = null; 640 if (cpHost != null) { 641 snapshotPOJO = ProtobufUtil.createSnapshotDesc(snapshot); 642 cpHost.preSnapshot(snapshotPOJO, desc); 643 } 644 645 // if the table is enabled, then have the RS run actually the snapshot work 646 TableName snapshotTable = TableName.valueOf(snapshot.getTable()); 647 if (master.getTableStateManager().isTableState(snapshotTable, TableState.State.ENABLED)) { 648 if (LOG.isDebugEnabled()) { 649 LOG.debug("Table enabled, starting distributed snapshots for {}", 650 ClientSnapshotDescriptionUtils.toString(snapshot)); 651 } 652 snapshotEnabledTable(snapshot); 653 if (LOG.isDebugEnabled()) { 654 LOG.debug("Started snapshot: {}", ClientSnapshotDescriptionUtils.toString(snapshot)); 655 } 656 } 657 // For disabled table, snapshot is created by the master 658 else if (master.getTableStateManager().isTableState(snapshotTable, TableState.State.DISABLED)) { 659 if (LOG.isDebugEnabled()) { 660 LOG.debug("Table is disabled, running snapshot entirely on master for {}", 661 ClientSnapshotDescriptionUtils.toString(snapshot)); 662 } 663 snapshotDisabledTable(snapshot); 664 if (LOG.isDebugEnabled()) { 665 LOG.debug("Started snapshot: {}", ClientSnapshotDescriptionUtils.toString(snapshot)); 666 } 667 } else { 668 LOG.error("Can't snapshot table '" + snapshot.getTable() 669 + "', isn't open or closed, we don't know what to do!"); 670 TablePartiallyOpenException tpoe = 671 new TablePartiallyOpenException(snapshot.getTable() + " isn't fully open."); 672 throw new SnapshotCreationException("Table is not entirely open or closed", tpoe, 673 ProtobufUtil.createSnapshotDesc(snapshot)); 674 } 675 676 // call post coproc hook 677 if (cpHost != null) { 678 cpHost.postSnapshot(snapshotPOJO, desc); 679 } 680 } 681 682 /** 683 * Set the handler for the current snapshot 684 * <p> 685 
* Exposed for TESTING n * @param handler handler the master should use TODO get rid of this if 686 * possible, repackaging, modify tests. 687 */ 688 public synchronized void setSnapshotHandlerForTesting(final TableName tableName, 689 final SnapshotSentinel handler) { 690 if (handler != null) { 691 this.snapshotHandlers.put(tableName, handler); 692 } else { 693 this.snapshotHandlers.remove(tableName); 694 } 695 } 696 697 /** Returns distributed commit coordinator for all running snapshots */ 698 ProcedureCoordinator getCoordinator() { 699 return coordinator; 700 } 701 702 /** 703 * Check to see if the snapshot is one of the currently completed snapshots Returns true if the 704 * snapshot exists in the "completed snapshots folder". 705 * @param snapshot expected snapshot to check 706 * @return <tt>true</tt> if the snapshot is stored on the {@link FileSystem}, <tt>false</tt> if is 707 * not stored 708 * @throws IOException if the filesystem throws an unexpected exception, 709 * @throws IllegalArgumentException if snapshot name is invalid. 710 */ 711 private boolean isSnapshotCompleted(SnapshotDescription snapshot) throws IOException { 712 try { 713 final Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshot, rootDir); 714 FileSystem fs = master.getMasterFileSystem().getFileSystem(); 715 // check to see if the snapshot already exists 716 return fs.exists(snapshotDir); 717 } catch (IllegalArgumentException iae) { 718 throw new UnknownSnapshotException("Unexpected exception thrown", iae); 719 } 720 } 721 722 /** 723 * Clone the specified snapshot. The clone will fail if the destination table has a snapshot or 724 * restore in progress. 
725 * @param reqSnapshot Snapshot Descriptor from request 726 * @param tableName table to clone 727 * @param snapshot Snapshot Descriptor 728 * @param snapshotTableDesc Table Descriptor 729 * @param nonceKey unique identifier to prevent duplicated RPC 730 * @return procId the ID of the clone snapshot procedure n 731 */ 732 private long cloneSnapshot(final SnapshotDescription reqSnapshot, final TableName tableName, 733 final SnapshotDescription snapshot, final TableDescriptor snapshotTableDesc, 734 final NonceKey nonceKey, final boolean restoreAcl, final String customSFT) throws IOException { 735 MasterCoprocessorHost cpHost = master.getMasterCoprocessorHost(); 736 TableDescriptor htd = TableDescriptorBuilder.copy(tableName, snapshotTableDesc); 737 org.apache.hadoop.hbase.client.SnapshotDescription snapshotPOJO = null; 738 if (cpHost != null) { 739 snapshotPOJO = ProtobufUtil.createSnapshotDesc(snapshot); 740 cpHost.preCloneSnapshot(snapshotPOJO, htd); 741 } 742 long procId; 743 try { 744 procId = cloneSnapshot(snapshot, htd, nonceKey, restoreAcl, customSFT); 745 } catch (IOException e) { 746 LOG.error("Exception occurred while cloning the snapshot " + snapshot.getName() + " as table " 747 + tableName.getNameAsString(), e); 748 throw e; 749 } 750 LOG.info("Clone snapshot=" + snapshot.getName() + " as table=" + tableName); 751 752 if (cpHost != null) { 753 cpHost.postCloneSnapshot(snapshotPOJO, htd); 754 } 755 return procId; 756 } 757 758 /** 759 * Clone the specified snapshot into a new table. The operation will fail if the destination table 760 * has a snapshot or restore in progress. 
761 * @param snapshot Snapshot Descriptor 762 * @param tableDescriptor Table Descriptor of the table to create 763 * @param nonceKey unique identifier to prevent duplicated RPC 764 * @return procId the ID of the clone snapshot procedure 765 */ 766 synchronized long cloneSnapshot(final SnapshotDescription snapshot, 767 final TableDescriptor tableDescriptor, final NonceKey nonceKey, final boolean restoreAcl, 768 final String customSFT) throws HBaseSnapshotException { 769 TableName tableName = tableDescriptor.getTableName(); 770 771 // make sure we aren't running a snapshot on the same table 772 if (isTakingSnapshot(tableName)) { 773 throw new RestoreSnapshotException("Snapshot in progress on the restore table=" + tableName); 774 } 775 776 // make sure we aren't running a restore on the same table 777 if (isRestoringTable(tableName)) { 778 throw new RestoreSnapshotException("Restore already in progress on the table=" + tableName); 779 } 780 781 try { 782 long procId = master.getMasterProcedureExecutor().submitProcedure( 783 new CloneSnapshotProcedure(master.getMasterProcedureExecutor().getEnvironment(), 784 tableDescriptor, snapshot, restoreAcl, customSFT), 785 nonceKey); 786 this.restoreTableToProcIdMap.put(tableName, procId); 787 return procId; 788 } catch (Exception e) { 789 String msg = "Couldn't clone the snapshot=" 790 + ClientSnapshotDescriptionUtils.toString(snapshot) + " on table=" + tableName; 791 LOG.error(msg, e); 792 throw new RestoreSnapshotException(msg, e); 793 } 794 } 795 796 /** 797 * Restore or Clone the specified snapshot n * @param nonceKey unique identifier to prevent 798 * duplicated RPC n 799 */ 800 public long restoreOrCloneSnapshot(final SnapshotDescription reqSnapshot, final NonceKey nonceKey, 801 final boolean restoreAcl, String customSFT) throws IOException { 802 FileSystem fs = master.getMasterFileSystem().getFileSystem(); 803 Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(reqSnapshot, rootDir); 804 805 // check if 
the snapshot exists 806 if (!fs.exists(snapshotDir)) { 807 LOG.error("A Snapshot named '" + reqSnapshot.getName() + "' does not exist."); 808 throw new SnapshotDoesNotExistException(ProtobufUtil.createSnapshotDesc(reqSnapshot)); 809 } 810 811 // Get snapshot info from file system. The reqSnapshot is a "fake" snapshotInfo with 812 // just the snapshot "name" and table name to restore. It does not contains the "real" snapshot 813 // information. 814 SnapshotDescription snapshot = SnapshotDescriptionUtils.readSnapshotInfo(fs, snapshotDir); 815 SnapshotManifest manifest = 816 SnapshotManifest.open(master.getConfiguration(), fs, snapshotDir, snapshot); 817 TableDescriptor snapshotTableDesc = manifest.getTableDescriptor(); 818 TableName tableName = TableName.valueOf(reqSnapshot.getTable()); 819 820 // sanity check the new table descriptor 821 TableDescriptorChecker.sanityCheck(master.getConfiguration(), snapshotTableDesc); 822 823 // stop tracking "abandoned" handlers 824 cleanupSentinels(); 825 826 // Verify snapshot validity 827 SnapshotReferenceUtil.verifySnapshot(master.getConfiguration(), fs, manifest); 828 829 // Execute the restore/clone operation 830 long procId; 831 if (master.getTableDescriptors().exists(tableName)) { 832 procId = 833 restoreSnapshot(reqSnapshot, tableName, snapshot, snapshotTableDesc, nonceKey, restoreAcl); 834 } else { 835 procId = cloneSnapshot(reqSnapshot, tableName, snapshot, snapshotTableDesc, nonceKey, 836 restoreAcl, customSFT); 837 } 838 return procId; 839 } 840 841 /** 842 * Restore the specified snapshot. The restore will fail if the destination table has a snapshot 843 * or restore in progress. 
844 * @param reqSnapshot Snapshot Descriptor from request 845 * @param tableName table to restore 846 * @param snapshot Snapshot Descriptor 847 * @param snapshotTableDesc Table Descriptor 848 * @param nonceKey unique identifier to prevent duplicated RPC 849 * @param restoreAcl true to restore acl of snapshot 850 * @return procId the ID of the restore snapshot procedure n 851 */ 852 private long restoreSnapshot(final SnapshotDescription reqSnapshot, final TableName tableName, 853 final SnapshotDescription snapshot, final TableDescriptor snapshotTableDesc, 854 final NonceKey nonceKey, final boolean restoreAcl) throws IOException { 855 MasterCoprocessorHost cpHost = master.getMasterCoprocessorHost(); 856 857 // have to check first if restoring the snapshot would break current SFT setup 858 StoreFileTrackerValidationUtils.validatePreRestoreSnapshot( 859 master.getTableDescriptors().get(tableName), snapshotTableDesc, master.getConfiguration()); 860 861 if ( 862 master.getTableStateManager().isTableState(TableName.valueOf(snapshot.getTable()), 863 TableState.State.ENABLED) 864 ) { 865 throw new UnsupportedOperationException("Table '" + TableName.valueOf(snapshot.getTable()) 866 + "' must be disabled in order to " + "perform a restore operation."); 867 } 868 869 // call Coprocessor pre hook 870 org.apache.hadoop.hbase.client.SnapshotDescription snapshotPOJO = null; 871 if (cpHost != null) { 872 snapshotPOJO = ProtobufUtil.createSnapshotDesc(snapshot); 873 cpHost.preRestoreSnapshot(snapshotPOJO, snapshotTableDesc); 874 } 875 876 long procId; 877 try { 878 procId = restoreSnapshot(snapshot, snapshotTableDesc, nonceKey, restoreAcl); 879 } catch (IOException e) { 880 LOG.error("Exception occurred while restoring the snapshot " + snapshot.getName() 881 + " as table " + tableName.getNameAsString(), e); 882 throw e; 883 } 884 LOG.info("Restore snapshot=" + snapshot.getName() + " as table=" + tableName); 885 886 if (cpHost != null) { 887 cpHost.postRestoreSnapshot(snapshotPOJO, 
snapshotTableDesc); 888 } 889 890 return procId; 891 } 892 893 /** 894 * Restore the specified snapshot. The restore will fail if the destination table has a snapshot 895 * or restore in progress. 896 * @param snapshot Snapshot Descriptor 897 * @param tableDescriptor Table Descriptor 898 * @param nonceKey unique identifier to prevent duplicated RPC 899 * @param restoreAcl true to restore acl of snapshot 900 * @return procId the ID of the restore snapshot procedure 901 */ 902 private synchronized long restoreSnapshot(final SnapshotDescription snapshot, 903 final TableDescriptor tableDescriptor, final NonceKey nonceKey, final boolean restoreAcl) 904 throws HBaseSnapshotException { 905 final TableName tableName = tableDescriptor.getTableName(); 906 907 // make sure we aren't running a snapshot on the same table 908 if (isTakingSnapshot(tableName)) { 909 throw new RestoreSnapshotException("Snapshot in progress on the restore table=" + tableName); 910 } 911 912 // make sure we aren't running a restore on the same table 913 if (isRestoringTable(tableName)) { 914 throw new RestoreSnapshotException("Restore already in progress on the table=" + tableName); 915 } 916 917 try { 918 long procId = master.getMasterProcedureExecutor().submitProcedure( 919 new RestoreSnapshotProcedure(master.getMasterProcedureExecutor().getEnvironment(), 920 tableDescriptor, snapshot, restoreAcl), 921 nonceKey); 922 this.restoreTableToProcIdMap.put(tableName, procId); 923 return procId; 924 } catch (Exception e) { 925 String msg = "Couldn't restore the snapshot=" 926 + ClientSnapshotDescriptionUtils.toString(snapshot) + " on table=" + tableName; 927 LOG.error(msg, e); 928 throw new RestoreSnapshotException(msg, e); 929 } 930 } 931 932 /** 933 * Verify if the restore of the specified table is in progress. 934 * @param tableName table under restore 935 * @return <tt>true</tt> if there is a restore in progress of the specified table. 
936 */ 937 private synchronized boolean isRestoringTable(final TableName tableName) { 938 Long procId = this.restoreTableToProcIdMap.get(tableName); 939 if (procId == null) { 940 return false; 941 } 942 ProcedureExecutor<MasterProcedureEnv> procExec = master.getMasterProcedureExecutor(); 943 if (procExec.isRunning() && !procExec.isFinished(procId)) { 944 return true; 945 } else { 946 this.restoreTableToProcIdMap.remove(tableName); 947 return false; 948 } 949 } 950 951 /** 952 * Return the handler if it is currently live and has the same snapshot target name. The handler 953 * is removed from the sentinels map if completed. 954 * @param sentinels live handlers 955 * @param snapshot snapshot description 956 * @return null if doesn't match, else a live handler. 957 */ 958 private synchronized SnapshotSentinel removeSentinelIfFinished( 959 final Map<TableName, SnapshotSentinel> sentinels, final SnapshotDescription snapshot) { 960 if (!snapshot.hasTable()) { 961 return null; 962 } 963 964 TableName snapshotTable = TableName.valueOf(snapshot.getTable()); 965 SnapshotSentinel h = sentinels.get(snapshotTable); 966 if (h == null) { 967 return null; 968 } 969 970 if (!h.getSnapshot().getName().equals(snapshot.getName())) { 971 // specified snapshot is to the one currently running 972 return null; 973 } 974 975 // Remove from the "in-progress" list once completed 976 if (h.isFinished()) { 977 sentinels.remove(snapshotTable); 978 } 979 980 return h; 981 } 982 983 /** 984 * Removes "abandoned" snapshot/restore requests. As part of the HBaseAdmin snapshot/restore API 985 * the operation status is checked until completed, and the in-progress maps are cleaned up when 986 * the status of a completed task is requested. To avoid having sentinels staying around for long 987 * time if something client side is failed, each operation tries to clean up the in-progress maps 988 * sentinels finished from a long time. 
989 */ 990 private void cleanupSentinels() { 991 cleanupSentinels(this.snapshotHandlers); 992 cleanupCompletedRestoreInMap(); 993 } 994 995 /** 996 * Remove the sentinels that are marked as finished and the completion time has exceeded the 997 * removal timeout. 998 * @param sentinels map of sentinels to clean 999 */ 1000 private synchronized void cleanupSentinels(final Map<TableName, SnapshotSentinel> sentinels) { 1001 long currentTime = EnvironmentEdgeManager.currentTime(); 1002 long sentinelsCleanupTimeoutMillis = 1003 master.getConfiguration().getLong(HBASE_SNAPSHOT_SENTINELS_CLEANUP_TIMEOUT_MILLIS, 1004 SNAPSHOT_SENTINELS_CLEANUP_TIMEOUT_MILLS_DEFAULT); 1005 Iterator<Map.Entry<TableName, SnapshotSentinel>> it = sentinels.entrySet().iterator(); 1006 while (it.hasNext()) { 1007 Map.Entry<TableName, SnapshotSentinel> entry = it.next(); 1008 SnapshotSentinel sentinel = entry.getValue(); 1009 if ( 1010 sentinel.isFinished() 1011 && (currentTime - sentinel.getCompletionTimestamp()) > sentinelsCleanupTimeoutMillis 1012 ) { 1013 it.remove(); 1014 } 1015 } 1016 } 1017 1018 /** 1019 * Remove the procedures that are marked as finished 1020 */ 1021 private synchronized void cleanupCompletedRestoreInMap() { 1022 ProcedureExecutor<MasterProcedureEnv> procExec = master.getMasterProcedureExecutor(); 1023 Iterator<Map.Entry<TableName, Long>> it = restoreTableToProcIdMap.entrySet().iterator(); 1024 while (it.hasNext()) { 1025 Map.Entry<TableName, Long> entry = it.next(); 1026 Long procId = entry.getValue(); 1027 if (procExec.isRunning() && procExec.isFinished(procId)) { 1028 it.remove(); 1029 } 1030 } 1031 } 1032 1033 // 1034 // Implementing Stoppable interface 1035 // 1036 1037 @Override 1038 public void stop(String why) { 1039 // short circuit 1040 if (this.stopped) return; 1041 // make sure we get stop 1042 this.stopped = true; 1043 // pass the stop onto take snapshot handlers 1044 for (SnapshotSentinel snapshotHandler : this.snapshotHandlers.values()) { 1045 
snapshotHandler.cancel(why); 1046 } 1047 if (snapshotHandlerChoreCleanerTask != null) { 1048 snapshotHandlerChoreCleanerTask.cancel(true); 1049 } 1050 try { 1051 if (coordinator != null) { 1052 coordinator.close(); 1053 } 1054 } catch (IOException e) { 1055 LOG.error("stop ProcedureCoordinator error", e); 1056 } 1057 } 1058 1059 @Override 1060 public boolean isStopped() { 1061 return this.stopped; 1062 } 1063 1064 /** 1065 * Throws an exception if snapshot operations (take a snapshot, restore, clone) are not supported. 1066 * Called at the beginning of snapshot() and restoreSnapshot() methods. 1067 * @throws UnsupportedOperationException if snapshot are not supported 1068 */ 1069 public void checkSnapshotSupport() throws UnsupportedOperationException { 1070 if (!this.isSnapshotSupported) { 1071 throw new UnsupportedOperationException( 1072 "To use snapshots, You must add to the hbase-site.xml of the HBase Master: '" 1073 + HBASE_SNAPSHOT_ENABLED + "' property with value 'true'."); 1074 } 1075 } 1076 1077 /** 1078 * Called at startup, to verify if snapshot operation is supported, and to avoid starting the 1079 * master if there're snapshots present but the cleaners needed are missing. Otherwise we can end 1080 * up with snapshot data loss. 
1081 * @param conf The {@link Configuration} object to use 1082 * @param mfs The MasterFileSystem to use 1083 * @throws IOException in case of file-system operation failure 1084 * @throws UnsupportedOperationException in case cleaners are missing and there're snapshot in the 1085 * system 1086 */ 1087 private void checkSnapshotSupport(final Configuration conf, final MasterFileSystem mfs) 1088 throws IOException, UnsupportedOperationException { 1089 // Verify if snapshot is disabled by the user 1090 String enabled = conf.get(HBASE_SNAPSHOT_ENABLED); 1091 boolean snapshotEnabled = conf.getBoolean(HBASE_SNAPSHOT_ENABLED, false); 1092 boolean userDisabled = (enabled != null && enabled.trim().length() > 0 && !snapshotEnabled); 1093 1094 // Extract cleaners from conf 1095 Set<String> hfileCleaners = new HashSet<>(); 1096 String[] cleaners = conf.getStrings(HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS); 1097 if (cleaners != null) Collections.addAll(hfileCleaners, cleaners); 1098 1099 Set<String> logCleaners = new HashSet<>(); 1100 cleaners = conf.getStrings(HConstants.HBASE_MASTER_LOGCLEANER_PLUGINS); 1101 if (cleaners != null) Collections.addAll(logCleaners, cleaners); 1102 1103 // check if an older version of snapshot directory was present 1104 Path oldSnapshotDir = new Path(mfs.getRootDir(), HConstants.OLD_SNAPSHOT_DIR_NAME); 1105 FileSystem fs = mfs.getFileSystem(); 1106 List<SnapshotDescription> ss = getCompletedSnapshots(new Path(rootDir, oldSnapshotDir), false); 1107 if (ss != null && !ss.isEmpty()) { 1108 LOG.error("Snapshots from an earlier release were found under: " + oldSnapshotDir); 1109 LOG.error("Please rename the directory as " + HConstants.SNAPSHOT_DIR_NAME); 1110 } 1111 1112 // If the user has enabled the snapshot, we force the cleaners to be present 1113 // otherwise we still need to check if cleaners are enabled or not and verify 1114 // that there're no snapshot in the .snapshot folder. 
1115 if (snapshotEnabled) { 1116 // Inject snapshot cleaners, if snapshot.enable is true 1117 hfileCleaners.add(SnapshotHFileCleaner.class.getName()); 1118 hfileCleaners.add(HFileLinkCleaner.class.getName()); 1119 // If sync acl to HDFS feature is enabled, then inject the cleaner 1120 if (SnapshotScannerHDFSAclHelper.isAclSyncToHdfsEnabled(conf)) { 1121 hfileCleaners.add(SnapshotScannerHDFSAclCleaner.class.getName()); 1122 } 1123 1124 // Set cleaners conf 1125 conf.setStrings(HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS, 1126 hfileCleaners.toArray(new String[hfileCleaners.size()])); 1127 conf.setStrings(HConstants.HBASE_MASTER_LOGCLEANER_PLUGINS, 1128 logCleaners.toArray(new String[logCleaners.size()])); 1129 } else { 1130 // There may be restore tables if snapshot is enabled and then disabled, so add 1131 // HFileLinkCleaner, see HBASE-26670 for more details. 1132 hfileCleaners.add(HFileLinkCleaner.class.getName()); 1133 conf.setStrings(HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS, 1134 hfileCleaners.toArray(new String[hfileCleaners.size()])); 1135 // Verify if SnapshotHFileCleaner are present 1136 snapshotEnabled = hfileCleaners.contains(SnapshotHFileCleaner.class.getName()); 1137 1138 // Warn if the cleaners are enabled but the snapshot.enabled property is false/not set. 1139 if (snapshotEnabled) { 1140 LOG.warn("Snapshot log and hfile cleaners are present in the configuration, " + "but the '" 1141 + HBASE_SNAPSHOT_ENABLED + "' property " 1142 + (userDisabled ? "is set to 'false'." : "is not set.")); 1143 } 1144 } 1145 1146 // Mark snapshot feature as enabled if cleaners are present and user has not disabled it. 1147 this.isSnapshotSupported = snapshotEnabled && !userDisabled; 1148 1149 // If cleaners are not enabled, verify that there're no snapshot in the .snapshot folder 1150 // otherwise we end up with snapshot data loss. 
1151 if (!snapshotEnabled) { 1152 LOG.info("Snapshot feature is not enabled, missing log and hfile cleaners."); 1153 Path snapshotDir = SnapshotDescriptionUtils.getSnapshotsDir(mfs.getRootDir()); 1154 if (fs.exists(snapshotDir)) { 1155 FileStatus[] snapshots = CommonFSUtils.listStatus(fs, snapshotDir, 1156 new SnapshotDescriptionUtils.CompletedSnaphotDirectoriesFilter(fs)); 1157 if (snapshots != null) { 1158 LOG.error("Snapshots are present, but cleaners are not enabled."); 1159 checkSnapshotSupport(); 1160 } 1161 } 1162 } 1163 } 1164 1165 @Override 1166 public void initialize(MasterServices master, MetricsMaster metricsMaster) 1167 throws KeeperException, IOException, UnsupportedOperationException { 1168 this.master = master; 1169 1170 this.rootDir = master.getMasterFileSystem().getRootDir(); 1171 checkSnapshotSupport(master.getConfiguration(), master.getMasterFileSystem()); 1172 1173 // get the configuration for the coordinator 1174 Configuration conf = master.getConfiguration(); 1175 long wakeFrequency = conf.getInt(SNAPSHOT_WAKE_MILLIS_KEY, SNAPSHOT_WAKE_MILLIS_DEFAULT); 1176 long timeoutMillis = Math.max( 1177 conf.getLong(SnapshotDescriptionUtils.SNAPSHOT_TIMEOUT_MILLIS_KEY, 1178 SnapshotDescriptionUtils.SNAPSHOT_TIMEOUT_MILLIS_DEFAULT), 1179 conf.getLong(SnapshotDescriptionUtils.MASTER_SNAPSHOT_TIMEOUT_MILLIS, 1180 SnapshotDescriptionUtils.DEFAULT_MAX_WAIT_TIME)); 1181 int opThreads = conf.getInt(SNAPSHOT_POOL_THREADS_KEY, SNAPSHOT_POOL_THREADS_DEFAULT); 1182 1183 // setup the default procedure coordinator 1184 String name = master.getServerName().toString(); 1185 ThreadPoolExecutor tpool = ProcedureCoordinator.defaultPool(name, opThreads); 1186 ProcedureCoordinatorRpcs comms = new ZKProcedureCoordinator(master.getZooKeeper(), 1187 SnapshotManager.ONLINE_SNAPSHOT_CONTROLLER_DESCRIPTION, name); 1188 1189 this.coordinator = new ProcedureCoordinator(comms, tpool, timeoutMillis, wakeFrequency); 1190 this.executorService = master.getExecutorService(); 1191 
resetTempDir(); 1192 snapshotHandlerChoreCleanerTask = 1193 scheduleThreadPool.scheduleAtFixedRate(this::cleanupSentinels, 10, 10, TimeUnit.SECONDS); 1194 } 1195 1196 @Override 1197 public String getProcedureSignature() { 1198 return ONLINE_SNAPSHOT_CONTROLLER_DESCRIPTION; 1199 } 1200 1201 @Override 1202 public void execProcedure(ProcedureDescription desc) throws IOException { 1203 takeSnapshot(toSnapshotDescription(desc)); 1204 } 1205 1206 @Override 1207 public void checkPermissions(ProcedureDescription desc, AccessChecker accessChecker, User user) 1208 throws IOException { 1209 // Done by AccessController as part of preSnapshot coprocessor hook (legacy code path). 1210 // In future, when we AC is removed for good, that check should be moved here. 1211 } 1212 1213 @Override 1214 public boolean isProcedureDone(ProcedureDescription desc) throws IOException { 1215 return isSnapshotDone(toSnapshotDescription(desc)); 1216 } 1217 1218 private SnapshotDescription toSnapshotDescription(ProcedureDescription desc) throws IOException { 1219 SnapshotDescription.Builder builder = SnapshotDescription.newBuilder(); 1220 if (!desc.hasInstance()) { 1221 throw new IOException("Snapshot name is not defined: " + desc.toString()); 1222 } 1223 String snapshotName = desc.getInstance(); 1224 List<NameStringPair> props = desc.getConfigurationList(); 1225 String table = null; 1226 for (NameStringPair prop : props) { 1227 if ("table".equalsIgnoreCase(prop.getName())) { 1228 table = prop.getValue(); 1229 } 1230 } 1231 if (table == null) { 1232 throw new IOException("Snapshot table is not defined: " + desc.toString()); 1233 } 1234 TableName tableName = TableName.valueOf(table); 1235 builder.setTable(tableName.getNameAsString()); 1236 builder.setName(snapshotName); 1237 builder.setType(SnapshotDescription.Type.FLUSH); 1238 return builder.build(); 1239 } 1240}