View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master.snapshot;
19  
20  import java.io.FileNotFoundException;
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.Collections;
24  import java.util.HashMap;
25  import java.util.HashSet;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.Set;
30  import java.util.concurrent.ThreadPoolExecutor;
31  
32  import org.apache.commons.logging.Log;
33  import org.apache.commons.logging.LogFactory;
34  import org.apache.hadoop.classification.InterfaceAudience;
35  import org.apache.hadoop.classification.InterfaceStability;
36  import org.apache.hadoop.conf.Configuration;
37  import org.apache.hadoop.fs.FSDataInputStream;
38  import org.apache.hadoop.fs.FileStatus;
39  import org.apache.hadoop.fs.FileSystem;
40  import org.apache.hadoop.fs.Path;
41  import org.apache.hadoop.hbase.HConstants;
42  import org.apache.hadoop.hbase.HTableDescriptor;
43  import org.apache.hadoop.hbase.Stoppable;
44  import org.apache.hadoop.hbase.catalog.MetaReader;
45  import org.apache.hadoop.hbase.errorhandling.ForeignException;
46  import org.apache.hadoop.hbase.executor.ExecutorService;
47  import org.apache.hadoop.hbase.master.AssignmentManager;
48  import org.apache.hadoop.hbase.master.MasterCoprocessorHost;
49  import org.apache.hadoop.hbase.master.MasterFileSystem;
50  import org.apache.hadoop.hbase.master.MasterServices;
51  import org.apache.hadoop.hbase.master.SnapshotSentinel;
52  import org.apache.hadoop.hbase.master.cleaner.HFileCleaner;
53  import org.apache.hadoop.hbase.master.cleaner.HFileLinkCleaner;
54  import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
55  import org.apache.hadoop.hbase.procedure.Procedure;
56  import org.apache.hadoop.hbase.procedure.ProcedureCoordinator;
57  import org.apache.hadoop.hbase.procedure.ProcedureCoordinatorRpcs;
58  import org.apache.hadoop.hbase.procedure.ZKProcedureCoordinatorRpcs;
59  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
60  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription.Type;
61  import org.apache.hadoop.hbase.snapshot.HBaseSnapshotException;
62  import org.apache.hadoop.hbase.snapshot.RestoreSnapshotException;
63  import org.apache.hadoop.hbase.snapshot.RestoreSnapshotHelper;
64  import org.apache.hadoop.hbase.snapshot.SnapshotCreationException;
65  import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
66  import org.apache.hadoop.hbase.snapshot.SnapshotDoesNotExistException;
67  import org.apache.hadoop.hbase.snapshot.SnapshotExistsException;
68  import org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil;
69  import org.apache.hadoop.hbase.snapshot.TablePartiallyOpenException;
70  import org.apache.hadoop.hbase.snapshot.UnknownSnapshotException;
71  import org.apache.hadoop.hbase.util.Bytes;
72  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
73  import org.apache.hadoop.hbase.util.FSTableDescriptors;
74  import org.apache.hadoop.hbase.util.FSUtils;
75  import org.apache.zookeeper.KeeperException;
76  
77  /**
78   * This class manages the procedure of taking and restoring snapshots. There is only one
79   * SnapshotManager for the master.
80   * <p>
81   * The class provides methods for monitoring in-progress snapshot actions.
82   * <p>
83   * Note: Currently there can only be one snapshot being taken at a time over the cluster. This is a
84   * simplification in the current implementation.
85   */
86  @InterfaceAudience.Private
87  @InterfaceStability.Unstable
88  public class SnapshotManager implements Stoppable {
  private static final Log LOG = LogFactory.getLog(SnapshotManager.class);

  /** By default, check to see if the snapshot is complete every WAKE MILLIS (ms) */
  private static final int SNAPSHOT_WAKE_MILLIS_DEFAULT = 500;

  /**
   * Wait time (ms) before removing a finished sentinel from the in-progress map.
   *
   * NOTE: This is used as a safety auto cleanup.
   * The snapshot and restore handlers map entries are removed when a user asks if a snapshot or
   * restore is completed. This operation is part of the HBaseAdmin snapshot/restore API flow.
   * In case something fails on the client side and the snapshot/restore state is not reclaimed
   * after a default timeout, the entry is removed from the in-progress map.
   * At this point, if the user asks for the snapshot/restore status, the result will be
   * "snapshot done" if the snapshot exists, or "failed" if it doesn't exist.
   */
  private static final int SNAPSHOT_SENTINELS_CLEANUP_TIMEOUT = 60 * 1000;

  /** Enable or disable snapshot support */
  public static final String HBASE_SNAPSHOT_ENABLED = "hbase.snapshot.enabled";

  /**
   * Conf key for # of ms elapsed between checks for snapshot errors while waiting for
   * completion.
   */
  private static final String SNAPSHOT_WAKE_MILLIS_KEY = "hbase.snapshot.master.wakeMillis";

  /**
   * Default # of ms elapsed before injecting a snapshot timeout error when waiting for
   * completion; used when {@link #SNAPSHOT_TIMEOUT_MILLIS_KEY} is not set.
   */
  private static final int SNAPSHOT_TIMEOUT_MILLIS_DEFAULT = 60000;

  /**
   * Conf key for # of ms elapsed before injecting a snapshot timeout error when waiting for
   * completion.
   */
  private static final String SNAPSHOT_TIMEOUT_MILLIS_KEY = "hbase.snapshot.master.timeoutMillis";

  /** Name of the operation to use in the controller */
  public static final String ONLINE_SNAPSHOT_CONTROLLER_DESCRIPTION = "online-snapshot";

  /** Conf key for # of threads used by the SnapshotManager thread pool */
  private static final String SNAPSHOT_POOL_THREADS_KEY = "hbase.snapshot.master.threads";

  /**
   * Default # of threads used by the SnapshotManager thread pool; used when
   * {@link #SNAPSHOT_POOL_THREADS_KEY} is not set.
   */
  private static final int SNAPSHOT_POOL_THREADS_DEFAULT = 1;

  private boolean stopped;
  private final MasterServices master;  // Needed by TableEventHandlers
  // Master metrics; passed to the snapshot/clone handlers created by this manager.
  private final MasterMetrics metricsMaster;
  // Procedure coordinator; queried for procedure status when reporting snapshot errors.
  private final ProcedureCoordinator coordinator;

  // Is snapshot feature enabled?
  private boolean isSnapshotSupported = false;

  // Snapshot handlers map, with table name as key.
  // The map is always accessed and modified under the object lock using synchronized.
  // snapshotTable() will insert a handler in the map.
  // isSnapshotDone() will remove the handler requested if the operation is finished.
  private Map<String, SnapshotSentinel> snapshotHandlers = new HashMap<String, SnapshotSentinel>();

  // Restore Sentinels map, with table name as key.
  // The map is always accessed and modified under the object lock using synchronized.
  // restoreSnapshot()/cloneSnapshot() will insert a handler in the map.
  // isRestoreDone() will remove the handler requested if the operation is finished.
  private Map<String, SnapshotSentinel> restoreHandlers = new HashMap<String, SnapshotSentinel>();

  // Root directory of the master filesystem; all snapshot directories are resolved against it.
  private final Path rootDir;
  // Executor the snapshot/clone handlers are submitted to.
  private final ExecutorService executorService;
156 
157   /**
158    * Construct a snapshot manager.
159    * @param master
160    */
161   public SnapshotManager(final MasterServices master, final MasterMetrics metricsMaster)
162       throws KeeperException, IOException, UnsupportedOperationException {
163     this.master = master;
164     this.metricsMaster = metricsMaster;
165 
166     this.rootDir = master.getMasterFileSystem().getRootDir();
167     checkSnapshotSupport(master.getConfiguration(), master.getMasterFileSystem());
168 
169     // get the configuration for the coordinator
170     Configuration conf = master.getConfiguration();
171     long wakeFrequency = conf.getInt(SNAPSHOT_WAKE_MILLIS_KEY, SNAPSHOT_WAKE_MILLIS_DEFAULT);
172     long timeoutMillis = conf.getLong(SNAPSHOT_TIMEOUT_MILLIS_KEY, SNAPSHOT_TIMEOUT_MILLIS_DEFAULT);
173     int opThreads = conf.getInt(SNAPSHOT_POOL_THREADS_KEY, SNAPSHOT_POOL_THREADS_DEFAULT);
174 
175     // setup the default procedure coordinator
176     String name = master.getServerName().toString();
177     ThreadPoolExecutor tpool = ProcedureCoordinator.defaultPool(name, opThreads);
178     ProcedureCoordinatorRpcs comms = new ZKProcedureCoordinatorRpcs(
179         master.getZooKeeper(), SnapshotManager.ONLINE_SNAPSHOT_CONTROLLER_DESCRIPTION, name);
180     this.coordinator = new ProcedureCoordinator(comms, tpool, timeoutMillis, wakeFrequency);
181     this.executorService = master.getExecutorService();
182     resetTempDir();
183   }
184 
185   /**
186    * Fully specify all necessary components of a snapshot manager. Exposed for testing.
187    * @param master services for the master where the manager is running
188    * @param coordinator procedure coordinator instance.  exposed for testing.
189    * @param pool HBase ExecutorServcie instance, exposed for testing.
190    */
191   public SnapshotManager(final MasterServices master, final MasterMetrics metricsMaster,
192       ProcedureCoordinator coordinator, ExecutorService pool)
193       throws IOException, UnsupportedOperationException {
194     this.master = master;
195     this.metricsMaster = metricsMaster;
196 
197     this.rootDir = master.getMasterFileSystem().getRootDir();
198     checkSnapshotSupport(master.getConfiguration(), master.getMasterFileSystem());
199 
200     this.coordinator = coordinator;
201     this.executorService = pool;
202     resetTempDir();
203   }
204 
205   /**
206    * Gets the list of all completed snapshots.
207    * @return list of SnapshotDescriptions
208    * @throws IOException File system exception
209    */
210   public List<SnapshotDescription> getCompletedSnapshots() throws IOException {
211     return getCompletedSnapshots(SnapshotDescriptionUtils.getSnapshotsDir(rootDir));
212   }
213 
214   /**
215    * Gets the list of all completed snapshots.
216    * @param snapshotDir snapshot directory
217    * @return list of SnapshotDescriptions
218    * @throws IOException File system exception
219    */
220   private List<SnapshotDescription> getCompletedSnapshots(Path snapshotDir) throws IOException {
221     List<SnapshotDescription> snapshotDescs = new ArrayList<SnapshotDescription>();
222     // first create the snapshot root path and check to see if it exists
223     FileSystem fs = master.getMasterFileSystem().getFileSystem();
224     if (snapshotDir == null) snapshotDir = SnapshotDescriptionUtils.getSnapshotsDir(rootDir);
225 
226     // if there are no snapshots, return an empty list
227     if (!fs.exists(snapshotDir)) {
228       return snapshotDescs;
229     }
230 
231     // ignore all the snapshots in progress
232     FileStatus[] snapshots = fs.listStatus(snapshotDir,
233       new SnapshotDescriptionUtils.CompletedSnaphotDirectoriesFilter(fs));
234     // loop through all the completed snapshots
235     for (FileStatus snapshot : snapshots) {
236       Path info = new Path(snapshot.getPath(), SnapshotDescriptionUtils.SNAPSHOTINFO_FILE);
237       // if the snapshot is bad
238       if (!fs.exists(info)) {
239         LOG.error("Snapshot information for " + snapshot.getPath() + " doesn't exist");
240         continue;
241       }
242       FSDataInputStream in = null;
243       try {
244         in = fs.open(info);
245         SnapshotDescription desc = SnapshotDescription.parseFrom(in);
246         snapshotDescs.add(desc);
247       } catch (IOException e) {
248         LOG.warn("Found a corrupted snapshot " + snapshot.getPath(), e);
249       } finally {
250         if (in != null) {
251           in.close();
252         }
253       }
254     }
255     return snapshotDescs;
256   }
257 
258   /**
259    * Cleans up any snapshots in the snapshot/.tmp directory that were left from failed
260    * snapshot attempts.
261    *
262    * @throws IOException if we can't reach the filesystem
263    */
264   void resetTempDir() throws IOException {
265     // cleanup any existing snapshots.
266     Path tmpdir = SnapshotDescriptionUtils.getWorkingSnapshotDir(rootDir);
267     if (master.getMasterFileSystem().getFileSystem().exists(tmpdir)) {
268       if (!master.getMasterFileSystem().getFileSystem().delete(tmpdir, true)) {
269         LOG.warn("Couldn't delete working snapshot directory: " + tmpdir);
270       }
271     }
272   }
273 
274   /**
275    * Delete the specified snapshot
276    * @param snapshot
277    * @throws SnapshotDoesNotExistException If the specified snapshot does not exist.
278    * @throws IOException For filesystem IOExceptions
279    */
280   public void deleteSnapshot(SnapshotDescription snapshot) throws SnapshotDoesNotExistException, IOException {
281 
282     // call coproc pre hook
283     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
284     if (cpHost != null) {
285       cpHost.preDeleteSnapshot(snapshot);
286     }
287 
288     // check to see if it is completed
289     if (!isSnapshotCompleted(snapshot)) {
290       throw new SnapshotDoesNotExistException(snapshot);
291     }
292 
293     String snapshotName = snapshot.getName();
294     LOG.debug("Deleting snapshot: " + snapshotName);
295     // first create the snapshot description and check to see if it exists
296     MasterFileSystem fs = master.getMasterFileSystem();
297     Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshotName, rootDir);
298 
299     // delete the existing snapshot
300     if (!fs.getFileSystem().delete(snapshotDir, true)) {
301       throw new HBaseSnapshotException("Failed to delete snapshot directory: " + snapshotDir);
302     }
303 
304     // call coproc post hook
305     if (cpHost != null) {
306       cpHost.postDeleteSnapshot(snapshot);
307     }
308 
309   }
310 
311   /**
312    * Check if the specified snapshot is done
313    *
314    * @param expected
315    * @return true if snapshot is ready to be restored, false if it is still being taken.
316    * @throws IOException IOException if error from HDFS or RPC
317    * @throws UnknownSnapshotException if snapshot is invalid or does not exist.
318    */
319   public boolean isSnapshotDone(SnapshotDescription expected) throws IOException {
320     // check the request to make sure it has a snapshot
321     if (expected == null) {
322       throw new UnknownSnapshotException(
323          "No snapshot name passed in request, can't figure out which snapshot you want to check.");
324     }
325 
326     String ssString = SnapshotDescriptionUtils.toString(expected);
327 
328     // check to see if the sentinel exists,
329     // and if the task is complete removes it from the in-progress snapshots map.
330     SnapshotSentinel handler = removeSentinelIfFinished(this.snapshotHandlers, expected);
331 
332     // stop tracking "abandoned" handlers
333     cleanupSentinels();
334 
335     if (handler == null) {
336       // If there's no handler in the in-progress map, it means one of the following:
337       //   - someone has already requested the snapshot state
338       //   - the requested snapshot was completed long time ago (cleanupSentinels() timeout)
339       //   - the snapshot was never requested
340       // In those cases returns to the user the "done state" if the snapshots exists on disk,
341       // otherwise raise an exception saying that the snapshot is not running and doesn't exist.
342       if (!isSnapshotCompleted(expected)) {
343         throw new UnknownSnapshotException("Snapshot " + ssString
344             + " is not currently running or one of the known completed snapshots.");
345       }
346       // was done, return true;
347       return true;
348     }
349 
350     // pass on any failure we find in the sentinel
351     try {
352       handler.rethrowExceptionIfFailed();
353     } catch (ForeignException e) {
354       // Give some procedure info on an exception.
355       String status;
356       Procedure p = coordinator.getProcedure(expected.getName());
357       if (p != null) {
358         status = p.getStatus();
359       } else {
360         status = expected.getName() + " not found in proclist " + coordinator.getProcedureNames();
361       }
362       throw new HBaseSnapshotException("Snapshot " + ssString +  " had an error.  " + status, e,
363           expected);
364     }
365 
366     // check to see if we are done
367     if (handler.isFinished()) {
368       LOG.debug("Snapshot '" + ssString + "' has completed, notifying client.");
369       return true;
370     } else if (LOG.isDebugEnabled()) {
371       LOG.debug("Snapshoting '" + ssString + "' is still in progress!");
372     }
373     return false;
374   }
375 
376   /**
377    * Check to see if there is a snapshot in progress with the same name or on the same table.
378    * Currently we have a limitation only allowing a single snapshot per table at a time. Also we
379    * don't allow snapshot with the same name.
380    * @param snapshot description of the snapshot being checked.
381    * @return <tt>true</tt> if there is a snapshot in progress with the same name or on the same
382    *         table.
383    */
384   synchronized boolean isTakingSnapshot(final SnapshotDescription snapshot) {
385     if (isTakingSnapshot(snapshot.getTable())) {
386       return true;
387     }
388     Iterator<Map.Entry<String, SnapshotSentinel>> it = this.snapshotHandlers.entrySet().iterator();
389     while (it.hasNext()) {
390       Map.Entry<String, SnapshotSentinel> entry = it.next();
391       SnapshotSentinel sentinel = entry.getValue();
392       if (snapshot.getName().equals(sentinel.getSnapshot().getName()) && !sentinel.isFinished()) {
393         return true;
394       }
395     }
396     return false;
397   }
398 
399   /**
400    * Check to see if the specified table has a snapshot in progress.  Currently we have a
401    * limitation only allowing a single snapshot per table at a time.
402    * @param tableName name of the table being snapshotted.
403    * @return <tt>true</tt> if there is a snapshot in progress on the specified table.
404    */
405   synchronized boolean isTakingSnapshot(final String tableName) {
406     SnapshotSentinel handler = this.snapshotHandlers.get(tableName);
407     return handler != null && !handler.isFinished();
408   }
409 
410   /**
411    * Check to make sure that we are OK to run the passed snapshot. Checks to make sure that we
412    * aren't already running a snapshot or restore on the requested table.
413    * @param snapshot description of the snapshot we want to start
414    * @throws HBaseSnapshotException if the filesystem could not be prepared to start the snapshot
415    */
416   private synchronized void prepareToTakeSnapshot(SnapshotDescription snapshot)
417       throws HBaseSnapshotException {
418     FileSystem fs = master.getMasterFileSystem().getFileSystem();
419     Path workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir);
420     // make sure we aren't already running a snapshot
421     if (isTakingSnapshot(snapshot)) {
422       SnapshotSentinel handler = this.snapshotHandlers.get(snapshot.getTable());
423       throw new SnapshotCreationException("Rejected taking "
424           + SnapshotDescriptionUtils.toString(snapshot)
425           + " because we are already running another snapshot "
426           + (handler != null ? ("on the same table " +
427               SnapshotDescriptionUtils.toString(handler.getSnapshot()))
428               : "with the same name"), snapshot);
429     }
430 
431     // make sure we aren't running a restore on the same table
432     if (isRestoringTable(snapshot.getTable())) {
433       SnapshotSentinel handler = restoreHandlers.get(snapshot.getTable());
434       throw new SnapshotCreationException("Rejected taking "
435           + SnapshotDescriptionUtils.toString(snapshot)
436           + " because we are already have a restore in progress on the same snapshot "
437           + SnapshotDescriptionUtils.toString(handler.getSnapshot()), snapshot);
438     }
439 
440     try {
441       // delete the working directory, since we aren't running the snapshot. Likely leftovers
442       // from a failed attempt.
443       fs.delete(workingDir, true);
444 
445       // recreate the working directory for the snapshot
446       if (!fs.mkdirs(workingDir)) {
447         throw new SnapshotCreationException("Couldn't create working directory (" + workingDir
448             + ") for snapshot" , snapshot);
449       }
450     } catch (HBaseSnapshotException e) {
451       throw e;
452     } catch (IOException e) {
453       throw new SnapshotCreationException(
454           "Exception while checking to see if snapshot could be started.", e, snapshot);
455     }
456   }
457 
458   /**
459    * Take a snapshot of a disabled table.
460    * @param snapshot description of the snapshot to take. Modified to be {@link Type#DISABLED}.
461    * @throws HBaseSnapshotException if the snapshot could not be started
462    */
463   private synchronized void snapshotDisabledTable(SnapshotDescription snapshot)
464       throws HBaseSnapshotException {
465     // setup the snapshot
466     prepareToTakeSnapshot(snapshot);
467 
468     // set the snapshot to be a disabled snapshot, since the client doesn't know about that
469     snapshot = snapshot.toBuilder().setType(Type.DISABLED).build();
470 
471     // Take the snapshot of the disabled table
472     DisabledTableSnapshotHandler handler =
473         new DisabledTableSnapshotHandler(snapshot, master, metricsMaster);
474     snapshotTable(snapshot, handler);
475   }
476 
477   /**
478    * Take a snapshot of an enabled table.
479    * @param snapshot description of the snapshot to take.
480    * @throws HBaseSnapshotException if the snapshot could not be started
481    */
482   private synchronized void snapshotEnabledTable(SnapshotDescription snapshot)
483       throws HBaseSnapshotException {
484     // setup the snapshot
485     prepareToTakeSnapshot(snapshot);
486 
487     // Take the snapshot of the enabled table
488     EnabledTableSnapshotHandler handler =
489         new EnabledTableSnapshotHandler(snapshot, master, this, metricsMaster);
490     snapshotTable(snapshot, handler);
491   }
492 
493   /**
494    * Take a snapshot using the specified handler.
495    * On failure the snapshot temporary working directory is removed.
496    * NOTE: prepareToTakeSnapshot() called before this one takes care of the rejecting the
497    *       snapshot request if the table is busy with another snapshot/restore operation.
498    * @param snapshot the snapshot description
499    * @param handler the snapshot handler
500    */
501   private synchronized void snapshotTable(SnapshotDescription snapshot,
502       final TakeSnapshotHandler handler) throws HBaseSnapshotException {
503     try {
504       handler.prepare();
505       this.executorService.submit(handler);
506       this.snapshotHandlers.put(snapshot.getTable(), handler);
507     } catch (Exception e) {
508       // cleanup the working directory by trying to delete it from the fs.
509       Path workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir);
510       try {
511         if (!this.master.getMasterFileSystem().getFileSystem().delete(workingDir, true)) {
512           LOG.error("Couldn't delete working directory (" + workingDir + " for snapshot:" +
513               SnapshotDescriptionUtils.toString(snapshot));
514         }
515       } catch (IOException e1) {
516         LOG.error("Couldn't delete working directory (" + workingDir + " for snapshot:" +
517             SnapshotDescriptionUtils.toString(snapshot));
518       }
519       // fail the snapshot
520       throw new SnapshotCreationException("Could not build snapshot handler", e, snapshot);
521     }
522   }
523 
524   /**
525    * Take a snapshot based on the enabled/disabled state of the table.
526    *
527    * @param snapshot
528    * @throws HBaseSnapshotException when a snapshot specific exception occurs.
529    * @throws IOException when some sort of generic IO exception occurs.
530    */
531   public void takeSnapshot(SnapshotDescription snapshot) throws IOException {
532     // check to see if we already completed the snapshot
533     if (isSnapshotCompleted(snapshot)) {
534       throw new SnapshotExistsException("Snapshot '" + snapshot.getName()
535           + "' already stored on the filesystem.", snapshot);
536     }
537 
538     LOG.debug("No existing snapshot, attempting snapshot...");
539 
540     // stop tracking "abandoned" handlers
541     cleanupSentinels();
542 
543     // check to see if the table exists
544     HTableDescriptor desc = null;
545     try {
546       desc = master.getTableDescriptors().get(snapshot.getTable());
547     } catch (FileNotFoundException e) {
548       String msg = "Table:" + snapshot.getTable() + " info doesn't exist!";
549       LOG.error(msg);
550       throw new SnapshotCreationException(msg, e, snapshot);
551     } catch (IOException e) {
552       throw new SnapshotCreationException("Error while geting table description for table "
553           + snapshot.getTable(), e, snapshot);
554     }
555     if (desc == null) {
556       throw new SnapshotCreationException("Table '" + snapshot.getTable()
557           + "' doesn't exist, can't take snapshot.", snapshot);
558     }
559 
560     // set the snapshot version, now that we are ready to take it
561     snapshot = snapshot.toBuilder().setVersion(SnapshotDescriptionUtils.SNAPSHOT_LAYOUT_VERSION)
562         .build();
563 
564     // call pre coproc hook
565     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
566     if (cpHost != null) {
567       cpHost.preSnapshot(snapshot, desc);
568     }
569 
570     // if the table is enabled, then have the RS run actually the snapshot work
571     AssignmentManager assignmentMgr = master.getAssignmentManager();
572     if (assignmentMgr.getZKTable().isEnabledTable(snapshot.getTable())) {
573       LOG.debug("Table enabled, starting distributed snapshot.");
574       snapshotEnabledTable(snapshot);
575       LOG.debug("Started snapshot: " + SnapshotDescriptionUtils.toString(snapshot));
576     }
577     // For disabled table, snapshot is created by the master
578     else if (assignmentMgr.getZKTable().isDisabledTable(snapshot.getTable())) {
579       LOG.debug("Table is disabled, running snapshot entirely on master.");
580       snapshotDisabledTable(snapshot);
581       LOG.debug("Started snapshot: " + SnapshotDescriptionUtils.toString(snapshot));
582     } else {
583       LOG.error("Can't snapshot table '" + snapshot.getTable()
584           + "', isn't open or closed, we don't know what to do!");
585       TablePartiallyOpenException tpoe = new TablePartiallyOpenException(snapshot.getTable()
586           + " isn't fully open.");
587       throw new SnapshotCreationException("Table is not entirely open or closed", tpoe, snapshot);
588     }
589 
590     // call post coproc hook
591     if (cpHost != null) {
592       cpHost.postSnapshot(snapshot, desc);
593     }
594   }
595 
596   /**
597    * Set the handler for the current snapshot
598    * <p>
599    * Exposed for TESTING
600    * @param tableName
601    * @param handler handler the master should use
602    *
603    * TODO get rid of this if possible, repackaging, modify tests.
604    */
605   public synchronized void setSnapshotHandlerForTesting(final String tableName,
606       final SnapshotSentinel handler) {
607     if (handler != null) {
608       this.snapshotHandlers.put(tableName, handler);
609     } else {
610       this.snapshotHandlers.remove(tableName);
611     }
612   }
613 
614   /**
615    * @return distributed commit coordinator for all running snapshots
616    */
617   ProcedureCoordinator getCoordinator() {
618     return coordinator;
619   }
620 
621   /**
622    * Check to see if the snapshot is one of the currently completed snapshots
623    * Returns true if the snapshot exists in the "completed snapshots folder".
624    *
625    * @param snapshot expected snapshot to check
626    * @return <tt>true</tt> if the snapshot is stored on the {@link FileSystem}, <tt>false</tt> if is
627    *         not stored
628    * @throws IOException if the filesystem throws an unexpected exception,
629    * @throws IllegalArgumentException if snapshot name is invalid.
630    */
631   private boolean isSnapshotCompleted(SnapshotDescription snapshot) throws IOException {
632     try {
633       final Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshot, rootDir);
634       FileSystem fs = master.getMasterFileSystem().getFileSystem();
635 
636       // check to see if the snapshot already exists
637       return fs.exists(snapshotDir);
638     } catch (IllegalArgumentException iae) {
639       throw new UnknownSnapshotException("Unexpected exception thrown", iae);
640     }
641   }
642 
643   /**
644    * Clone the specified snapshot into a new table.
645    * The operation will fail if the destination table has a snapshot or restore in progress.
646    *
647    * @param snapshot Snapshot Descriptor
648    * @param hTableDescriptor Table Descriptor of the table to create
649    */
650   synchronized void cloneSnapshot(final SnapshotDescription snapshot,
651       final HTableDescriptor hTableDescriptor) throws HBaseSnapshotException {
652     String tableName = hTableDescriptor.getNameAsString();
653 
654     // make sure we aren't running a snapshot on the same table
655     if (isTakingSnapshot(tableName)) {
656       throw new RestoreSnapshotException("Snapshot in progress on the restore table=" + tableName);
657     }
658 
659     // make sure we aren't running a restore on the same table
660     if (isRestoringTable(tableName)) {
661       throw new RestoreSnapshotException("Restore already in progress on the table=" + tableName);
662     }
663 
664     try {
665       CloneSnapshotHandler handler =
666         new CloneSnapshotHandler(master, snapshot, hTableDescriptor, metricsMaster);
667       this.executorService.submit(handler);
668       this.restoreHandlers.put(tableName, handler);
669     } catch (Exception e) {
670       String msg = "Couldn't clone the snapshot=" + SnapshotDescriptionUtils.toString(snapshot) +
671         " on table=" + tableName;
672       LOG.error(msg, e);
673       throw new RestoreSnapshotException(msg, e);
674     }
675   }
676 
677   /**
678    * Restore the specified snapshot
679    * @param reqSnapshot
680    * @throws IOException
681    */
682   public void restoreSnapshot(SnapshotDescription reqSnapshot) throws IOException {
683     FileSystem fs = master.getMasterFileSystem().getFileSystem();
684     Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(reqSnapshot, rootDir);
685     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
686 
687     // check if the snapshot exists
688     if (!fs.exists(snapshotDir)) {
689       LOG.error("A Snapshot named '" + reqSnapshot.getName() + "' does not exist.");
690       throw new SnapshotDoesNotExistException(reqSnapshot);
691     }
692 
693     // read snapshot information
694     SnapshotDescription fsSnapshot = SnapshotDescriptionUtils.readSnapshotInfo(fs, snapshotDir);
695     HTableDescriptor snapshotTableDesc = FSTableDescriptors.getTableDescriptor(fs, snapshotDir);
696     String tableName = reqSnapshot.getTable();
697 
698     // stop tracking "abandoned" handlers
699     cleanupSentinels();
700 
701     // Verify snapshot validity
702     SnapshotReferenceUtil.verifySnapshot(master.getConfiguration(), fs, snapshotDir, fsSnapshot);
703 
704     // Execute the restore/clone operation
705     if (MetaReader.tableExists(master.getCatalogTracker(), tableName)) {
706       if (master.getAssignmentManager().getZKTable().isEnabledTable(fsSnapshot.getTable())) {
707         throw new UnsupportedOperationException("Table '" +
708           fsSnapshot.getTable() + "' must be disabled in order to perform a restore operation.");
709       }
710 
711       // call coproc pre hook
712       if (cpHost != null) {
713         cpHost.preRestoreSnapshot(reqSnapshot, snapshotTableDesc);
714       }
715       restoreSnapshot(fsSnapshot, snapshotTableDesc);
716       LOG.info("Restore snapshot=" + fsSnapshot.getName() + " as table=" + tableName);
717 
718       if (cpHost != null) {
719         cpHost.postRestoreSnapshot(reqSnapshot, snapshotTableDesc);
720       }
721     } else {
722       HTableDescriptor htd = RestoreSnapshotHelper.cloneTableSchema(snapshotTableDesc,
723                                                          Bytes.toBytes(tableName));
724       if (cpHost != null) {
725         cpHost.preCloneSnapshot(reqSnapshot, htd);
726       }
727       cloneSnapshot(fsSnapshot, htd);
728       LOG.info("Clone snapshot=" + fsSnapshot.getName() + " as table=" + tableName);
729 
730       if (cpHost != null) {
731         cpHost.postCloneSnapshot(reqSnapshot, htd);
732       }
733     }
734   }
735 
736   /**
737    * Restore the specified snapshot.
738    * The restore will fail if the destination table has a snapshot or restore in progress.
739    *
740    * @param snapshot Snapshot Descriptor
741    * @param hTableDescriptor Table Descriptor
742    */
743   private synchronized void restoreSnapshot(final SnapshotDescription snapshot,
744       final HTableDescriptor hTableDescriptor) throws HBaseSnapshotException {
745     String tableName = hTableDescriptor.getNameAsString();
746 
747     // make sure we aren't running a snapshot on the same table
748     if (isTakingSnapshot(tableName)) {
749       throw new RestoreSnapshotException("Snapshot in progress on the restore table=" + tableName);
750     }
751 
752     // make sure we aren't running a restore on the same table
753     if (isRestoringTable(tableName)) {
754       throw new RestoreSnapshotException("Restore already in progress on the table=" + tableName);
755     }
756 
757     try {
758       RestoreSnapshotHandler handler =
759         new RestoreSnapshotHandler(master, snapshot, hTableDescriptor, metricsMaster);
760       this.executorService.submit(handler);
761       restoreHandlers.put(hTableDescriptor.getNameAsString(), handler);
762     } catch (Exception e) {
763       String msg = "Couldn't restore the snapshot=" + SnapshotDescriptionUtils.toString(
764           snapshot)  +
765           " on table=" + tableName;
766       LOG.error(msg, e);
767       throw new RestoreSnapshotException(msg, e);
768     }
769   }
770 
771   /**
772    * Verify if the restore of the specified table is in progress.
773    *
774    * @param tableName table under restore
775    * @return <tt>true</tt> if there is a restore in progress of the specified table.
776    */
777   private synchronized boolean isRestoringTable(final String tableName) {
778     SnapshotSentinel sentinel = this.restoreHandlers.get(tableName);
779     return(sentinel != null && !sentinel.isFinished());
780   }
781 
782   /**
783    * Returns the status of a restore operation.
784    * If the in-progress restore is failed throws the exception that caused the failure.
785    *
786    * @param snapshot
787    * @return false if in progress, true if restore is completed or not requested.
788    * @throws IOException if there was a failure during the restore
789    */
790   public boolean isRestoreDone(final SnapshotDescription snapshot) throws IOException {
791     // check to see if the sentinel exists,
792     // and if the task is complete removes it from the in-progress restore map.
793     SnapshotSentinel sentinel = removeSentinelIfFinished(this.restoreHandlers, snapshot);
794 
795     // stop tracking "abandoned" handlers
796     cleanupSentinels();
797 
798     if (sentinel == null) {
799       // there is no sentinel so restore is not in progress.
800       return true;
801     }
802 
803     LOG.debug("Verify snapshot=" + snapshot.getName() + " against="
804         + sentinel.getSnapshot().getName() + " table=" + snapshot.getTable());
805 
806     // If the restore is failed, rethrow the exception
807     sentinel.rethrowExceptionIfFailed();
808 
809     // check to see if we are done
810     if (sentinel.isFinished()) {
811       LOG.debug("Restore snapshot=" + SnapshotDescriptionUtils.toString(snapshot) +
812           " has completed. Notifying the client.");
813       return true;
814     }
815 
816     if (LOG.isDebugEnabled()) {
817       LOG.debug("Sentinel is not yet finished with restoring snapshot=" +
818           SnapshotDescriptionUtils.toString(snapshot));
819     }
820     return false;
821   }
822 
823   /**
824    * Return the handler if it is currently live and has the same snapshot target name.
825    * The handler is removed from the sentinels map if completed.
826    * @param sentinels live handlers
827    * @param snapshot snapshot description
828    * @return null if doesn't match, else a live handler.
829    */
830   private synchronized SnapshotSentinel removeSentinelIfFinished(
831       final Map<String, SnapshotSentinel> sentinels, final SnapshotDescription snapshot) {
832     SnapshotSentinel h = sentinels.get(snapshot.getTable());
833     if (h == null) {
834       return null;
835     }
836 
837     if (!h.getSnapshot().getName().equals(snapshot.getName())) {
838       // specified snapshot is to the one currently running
839       return null;
840     }
841 
842     // Remove from the "in-progress" list once completed
843     if (h.isFinished()) {
844       sentinels.remove(snapshot.getTable());
845     }
846 
847     return h;
848   }
849 
850   /**
851    * Removes "abandoned" snapshot/restore requests.
852    * As part of the HBaseAdmin snapshot/restore API the operation status is checked until completed,
853    * and the in-progress maps are cleaned up when the status of a completed task is requested.
854    * To avoid having sentinels staying around for long time if something client side is failed,
855    * each operation tries to clean up the in-progress maps sentinels finished from a long time.
856    */
857   private void cleanupSentinels() {
858     cleanupSentinels(this.snapshotHandlers);
859     cleanupSentinels(this.restoreHandlers);
860   }
861 
862   /**
863    * Remove the sentinels that are marked as finished and the completion time
864    * has exceeded the removal timeout.
865    * @param sentinels map of sentinels to clean
866    */
867   private synchronized void cleanupSentinels(final Map<String, SnapshotSentinel> sentinels) {
868     long currentTime = EnvironmentEdgeManager.currentTimeMillis();
869     Iterator<Map.Entry<String, SnapshotSentinel>> it = sentinels.entrySet().iterator();
870     while (it.hasNext()) {
871       Map.Entry<String, SnapshotSentinel> entry = it.next();
872       SnapshotSentinel sentinel = entry.getValue();
873       if (sentinel.isFinished() &&
874           (currentTime - sentinel.getCompletionTimestamp()) > SNAPSHOT_SENTINELS_CLEANUP_TIMEOUT)
875       {
876         it.remove();
877       }
878     }
879   }
880 
881   //
882   // Implementing Stoppable interface
883   //
884 
885   @Override
886   public void stop(String why) {
887     // short circuit
888     if (this.stopped) return;
889     // make sure we get stop
890     this.stopped = true;
891     // pass the stop onto take snapshot handlers
892     for (SnapshotSentinel snapshotHandler: this.snapshotHandlers.values()) {
893       snapshotHandler.cancel(why);
894     }
895 
896     // pass the stop onto all the restore handlers
897     for (SnapshotSentinel restoreHandler: this.restoreHandlers.values()) {
898       restoreHandler.cancel(why);
899     }
900 
901     try {
902       if (coordinator != null) {
903         coordinator.close();
904       }
905     } catch (IOException e) {
906       LOG.error("stop ProcedureCoordinator error", e);
907     }
908   }
909 
910   @Override
911   public boolean isStopped() {
912     return this.stopped;
913   }
914 
915   /**
916    * Throws an exception if snapshot operations (take a snapshot, restore, clone) are not supported.
917    * Called at the beginning of snapshot() and restoreSnapshot() methods.
918    * @throws UnsupportedOperationException if snapshot are not supported
919    */
920   public void checkSnapshotSupport() throws UnsupportedOperationException {
921     if (!this.isSnapshotSupported) {
922       throw new UnsupportedOperationException(
923         "To use snapshots, You must add to the hbase-site.xml of the HBase Master: '" +
924           HBASE_SNAPSHOT_ENABLED + "' property with value 'true'.");
925     }
926   }
927 
928   /**
929    * Called at startup, to verify if snapshot operation is supported, and to avoid
930    * starting the master if there're snapshots present but the cleaners needed are missing.
931    * Otherwise we can end up with snapshot data loss.
932    * @param conf The {@link Configuration} object to use
933    * @param mfs The MasterFileSystem to use
934    * @throws IOException in case of file-system operation failure
935    * @throws UnsupportedOperationException in case cleaners are missing and
936    *         there're snapshot in the system
937    */
938   private void checkSnapshotSupport(final Configuration conf, final MasterFileSystem mfs)
939       throws IOException, UnsupportedOperationException {
940     // Verify if snapshot is disabled by the user
941     String enabled = conf.get(HBASE_SNAPSHOT_ENABLED);
942     boolean snapshotEnabled = conf.getBoolean(HBASE_SNAPSHOT_ENABLED, false);
943     boolean userDisabled = (enabled != null && enabled.trim().length() > 0 && !snapshotEnabled);
944 
945     // Extract cleaners from conf
946     Set<String> hfileCleaners = new HashSet<String>();
947     String[] cleaners = conf.getStrings(HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS);
948     if (cleaners != null) Collections.addAll(hfileCleaners, cleaners);
949 
950     Set<String> logCleaners = new HashSet<String>();
951     cleaners = conf.getStrings(HConstants.HBASE_MASTER_LOGCLEANER_PLUGINS);
952     if (cleaners != null) Collections.addAll(logCleaners, cleaners);
953 
954     // check if an older version of snapshot directory was present
955     Path oldSnapshotDir = new Path(mfs.getRootDir(), HConstants.OLD_SNAPSHOT_DIR_NAME);
956     FileSystem fs = mfs.getFileSystem();
957     List<SnapshotDescription> ss = getCompletedSnapshots(new Path(rootDir, oldSnapshotDir));
958     if (ss != null && !ss.isEmpty()) {
959       LOG.error("Snapshots from an earlier release were found under: " + oldSnapshotDir);
960       LOG.error("Please rename the directory as " + HConstants.SNAPSHOT_DIR_NAME);
961     }
962 
963     // If the user has enabled the snapshot, we force the cleaners to be present
964     // otherwise we still need to check if cleaners are enabled or not and verify
965     // that there're no snapshot in the .snapshot folder.
966     if (snapshotEnabled) {
967       // Inject snapshot cleaners, if snapshot.enable is true
968       hfileCleaners.add(SnapshotHFileCleaner.class.getName());
969       hfileCleaners.add(HFileLinkCleaner.class.getName());
970       logCleaners.add(SnapshotLogCleaner.class.getName());
971 
972       // Set cleaners conf
973       conf.setStrings(HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS,
974         hfileCleaners.toArray(new String[hfileCleaners.size()]));
975       conf.setStrings(HConstants.HBASE_MASTER_LOGCLEANER_PLUGINS,
976         logCleaners.toArray(new String[logCleaners.size()]));
977     } else {
978       // Verify if cleaners are present
979       snapshotEnabled = logCleaners.contains(SnapshotLogCleaner.class.getName()) &&
980         hfileCleaners.contains(SnapshotHFileCleaner.class.getName()) &&
981         hfileCleaners.contains(HFileLinkCleaner.class.getName());
982 
983       // Warn if the cleaners are enabled but the snapshot.enabled property is false/not set.
984       if (snapshotEnabled) {
985         LOG.warn("Snapshot log and hfile cleaners are present in the configuration, " +
986           "but the '" + HBASE_SNAPSHOT_ENABLED + "' property " +
987           (userDisabled ? "is set to 'false'." : "is not set."));
988       }
989     }
990 
991     // Mark snapshot feature as enabled if cleaners are present and user has not disabled it.
992     this.isSnapshotSupported = snapshotEnabled && !userDisabled;
993 
994     // If cleaners are not enabled, verify that there're no snapshot in the .snapshot folder
995     // otherwise we end up with snapshot data loss.
996     if (!snapshotEnabled) {
997       LOG.info("Snapshot feature is not enabled, missing log and hfile cleaners.");
998       Path snapshotDir = SnapshotDescriptionUtils.getSnapshotsDir(mfs.getRootDir());
999       if (fs.exists(snapshotDir)) {
1000         FileStatus[] snapshots = FSUtils.listStatus(fs, snapshotDir,
1001           new SnapshotDescriptionUtils.CompletedSnaphotDirectoriesFilter(fs));
1002         if (snapshots != null) {
1003           LOG.error("Snapshots are present, but cleaners are not enabled.");
1004           checkSnapshotSupport();
1005         }
1006       }
1007     }
1008   }
1009 }