001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.snapshot;
019
020import java.io.FileNotFoundException;
021import java.io.IOException;
022import java.net.URI;
023import java.util.HashSet;
024import java.util.List;
025import java.util.Set;
026import java.util.concurrent.CancellationException;
027import org.apache.hadoop.conf.Configuration;
028import org.apache.hadoop.fs.FileSystem;
029import org.apache.hadoop.fs.FileUtil;
030import org.apache.hadoop.fs.Path;
031import org.apache.hadoop.hbase.MetaTableAccessor;
032import org.apache.hadoop.hbase.ServerName;
033import org.apache.hadoop.hbase.TableName;
034import org.apache.hadoop.hbase.client.RegionInfo;
035import org.apache.hadoop.hbase.client.TableDescriptor;
036import org.apache.hadoop.hbase.errorhandling.ForeignException;
037import org.apache.hadoop.hbase.errorhandling.ForeignExceptionDispatcher;
038import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
039import org.apache.hadoop.hbase.executor.EventHandler;
040import org.apache.hadoop.hbase.executor.EventType;
041import org.apache.hadoop.hbase.master.MasterServices;
042import org.apache.hadoop.hbase.master.MetricsSnapshot;
043import org.apache.hadoop.hbase.master.SnapshotSentinel;
044import org.apache.hadoop.hbase.master.locking.LockManager;
045import org.apache.hadoop.hbase.master.locking.LockManager.MasterLock;
046import org.apache.hadoop.hbase.monitoring.MonitoredTask;
047import org.apache.hadoop.hbase.monitoring.TaskMonitor;
048import org.apache.hadoop.hbase.procedure2.LockType;
049import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils;
050import org.apache.hadoop.hbase.snapshot.SnapshotCreationException;
051import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
052import org.apache.hadoop.hbase.snapshot.SnapshotManifest;
053import org.apache.hadoop.hbase.util.FSUtils;
054import org.apache.hadoop.hbase.util.Pair;
055import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
056import org.apache.yetus.audience.InterfaceAudience;
057import org.apache.zookeeper.KeeperException;
058import org.slf4j.Logger;
059import org.slf4j.LoggerFactory;
060
061import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
062
063import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription;
064
065/**
066 * A handler for taking snapshots from the master.
067 *
068 * This is not a subclass of TableEventHandler because using that would incur an extra hbase:meta scan.
069 *
070 * The {@link #snapshotRegions(List)} call should get implemented for each snapshot flavor.
071 */
072@InterfaceAudience.Private
073public abstract class TakeSnapshotHandler extends EventHandler implements SnapshotSentinel,
074    ForeignExceptionSnare {
075  private static final Logger LOG = LoggerFactory.getLogger(TakeSnapshotHandler.class);
076
077  private volatile boolean finished;
078
079  // none of these should ever be null
080  protected final MasterServices master;
081  protected final MetricsSnapshot metricsSnapshot = new MetricsSnapshot();
082  protected final SnapshotDescription snapshot;
083  protected final Configuration conf;
084  protected final FileSystem rootFs;
085  protected final FileSystem workingDirFs;
086  protected final Path rootDir;
087  private final Path snapshotDir;
088  protected final Path workingDir;
089  private final MasterSnapshotVerifier verifier;
090  protected final ForeignExceptionDispatcher monitor;
091  private final LockManager.MasterLock tableLock;
092  protected final MonitoredTask status;
093  protected final TableName snapshotTable;
094  protected final SnapshotManifest snapshotManifest;
095  protected final SnapshotManager snapshotManager;
096
097  protected TableDescriptor htd;
098
099  /**
100   * @param snapshot descriptor of the snapshot to take
101   * @param masterServices master services provider
102   * @throws IllegalArgumentException if the working snapshot directory set from the
103   *   configuration is the same as the completed snapshot directory
104   * @throws IOException if the file system of the working snapshot directory cannot be
105   *   determined
106   */
107  public TakeSnapshotHandler(SnapshotDescription snapshot, final MasterServices masterServices,
108                             final SnapshotManager snapshotManager) throws IOException {
109    super(masterServices, EventType.C_M_SNAPSHOT_TABLE);
110    assert snapshot != null : "SnapshotDescription must not be nul1";
111    assert masterServices != null : "MasterServices must not be nul1";
112    this.master = masterServices;
113    this.conf = this.master.getConfiguration();
114    this.rootDir = this.master.getMasterFileSystem().getRootDir();
115    this.workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir, conf);
116    Preconditions.checkArgument(!SnapshotDescriptionUtils.isSubDirectoryOf(workingDir, rootDir) ||
117            SnapshotDescriptionUtils.isWithinDefaultWorkingDir(workingDir, conf),
118        "The working directory " + workingDir + " cannot be in the root directory unless it is "
119            + "within the default working directory");
120
121    this.snapshot = snapshot;
122    this.snapshotManager = snapshotManager;
123    this.snapshotTable = TableName.valueOf(snapshot.getTable());
124    this.rootFs = this.master.getMasterFileSystem().getFileSystem();
125    this.snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshot, rootDir);
126    this.workingDirFs = this.workingDir.getFileSystem(this.conf);
127    this.monitor = new ForeignExceptionDispatcher(snapshot.getName());
128    this.snapshotManifest = SnapshotManifest.create(conf, rootFs, workingDir, snapshot, monitor);
129
130    this.tableLock = master.getLockManager().createMasterLock(
131        snapshotTable, LockType.EXCLUSIVE,
132        this.getClass().getName() + ": take snapshot " + snapshot.getName());
133
134    // prepare the verify
135    this.verifier = new MasterSnapshotVerifier(masterServices, snapshot, workingDirFs);
136    // update the running tasks
137    this.status = TaskMonitor.get().createStatus(
138      "Taking " + snapshot.getType() + " snapshot on table: " + snapshotTable);
139  }
140
141  private TableDescriptor loadTableDescriptor()
142      throws FileNotFoundException, IOException {
143    TableDescriptor htd =
144      this.master.getTableDescriptors().get(snapshotTable);
145    if (htd == null) {
146      throw new IOException("TableDescriptor missing for " + snapshotTable);
147    }
148    return htd;
149  }
150
151  @Override
152  public TakeSnapshotHandler prepare() throws Exception {
153    super.prepare();
154    // after this, you should ensure to release this lock in case of exceptions
155    this.tableLock.acquire();
156    try {
157      this.htd = loadTableDescriptor(); // check that .tableinfo is present
158    } catch (Exception e) {
159      this.tableLock.release();
160      throw e;
161    }
162    return this;
163  }
164
165  /**
166   * Execute the core common portions of taking a snapshot. The {@link #snapshotRegions(List)}
167   * call should get implemented for each snapshot flavor.
168   */
169  @Override
170  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="REC_CATCH_EXCEPTION",
171    justification="Intentional")
172  public void process() {
173    String msg = "Running " + snapshot.getType() + " table snapshot " + snapshot.getName() + " "
174        + eventType + " on table " + snapshotTable;
175    LOG.info(msg);
176    MasterLock tableLockToRelease = this.tableLock;
177    status.setStatus(msg);
178    try {
179      if (downgradeToSharedTableLock()) {
180        // release the exclusive lock and hold the shared lock instead
181        tableLockToRelease = master.getLockManager().createMasterLock(snapshotTable,
182          LockType.SHARED, this.getClass().getName() + ": take snapshot " + snapshot.getName());
183        tableLock.release();
184        tableLockToRelease.acquire();
185      }
186      // If regions move after this meta scan, the region specific snapshot should fail, triggering
187      // an external exception that gets captured here.
188
189      // write down the snapshot info in the working directory
190      SnapshotDescriptionUtils.writeSnapshotInfo(snapshot, workingDir, workingDirFs);
191      snapshotManifest.addTableDescriptor(this.htd);
192      monitor.rethrowException();
193
194      List<Pair<RegionInfo, ServerName>> regionsAndLocations;
195      if (TableName.META_TABLE_NAME.equals(snapshotTable)) {
196        regionsAndLocations = MetaTableLocator.getMetaRegionsAndLocations(
197          server.getZooKeeper());
198      } else {
199        regionsAndLocations = MetaTableAccessor.getTableRegionsAndLocations(
200          server.getConnection(), snapshotTable, false);
201      }
202
203      // run the snapshot
204      snapshotRegions(regionsAndLocations);
205      monitor.rethrowException();
206
207      // extract each pair to separate lists
208      Set<String> serverNames = new HashSet<>();
209      for (Pair<RegionInfo, ServerName> p : regionsAndLocations) {
210        if (p != null && p.getFirst() != null && p.getSecond() != null) {
211          RegionInfo hri = p.getFirst();
212          if (hri.isOffline() && (hri.isSplit() || hri.isSplitParent())) continue;
213          serverNames.add(p.getSecond().toString());
214        }
215      }
216
217      // flush the in-memory state, and write the single manifest
218      status.setStatus("Consolidate snapshot: " + snapshot.getName());
219      snapshotManifest.consolidate();
220
221      // verify the snapshot is valid
222      status.setStatus("Verifying snapshot: " + snapshot.getName());
223      verifier.verifySnapshot(this.workingDir, serverNames);
224
225      // complete the snapshot, atomically moving from tmp to .snapshot dir.
226      completeSnapshot(this.snapshotDir, this.workingDir, this.rootFs, this.workingDirFs);
227      msg = "Snapshot " + snapshot.getName() + " of table " + snapshotTable + " completed";
228      status.markComplete(msg);
229      LOG.info(msg);
230      metricsSnapshot.addSnapshot(status.getCompletionTimestamp() - status.getStartTime());
231    } catch (Exception e) { // FindBugs: REC_CATCH_EXCEPTION
232      status.abort("Failed to complete snapshot " + snapshot.getName() + " on table " +
233          snapshotTable + " because " + e.getMessage());
234      String reason = "Failed taking snapshot " + ClientSnapshotDescriptionUtils.toString(snapshot)
235          + " due to exception:" + e.getMessage();
236      LOG.error(reason, e);
237      ForeignException ee = new ForeignException(reason, e);
238      monitor.receive(ee);
239      // need to mark this completed to close off and allow cleanup to happen.
240      cancel(reason);
241    } finally {
242      LOG.debug("Launching cleanup of working dir:" + workingDir);
243      try {
244        // if the working dir is still present, the snapshot has failed.  it is present we delete
245        // it.
246        if (!workingDirFs.delete(workingDir, true)) {
247          LOG.error("Couldn't delete snapshot working directory:" + workingDir);
248        }
249      } catch (IOException e) {
250        LOG.error("Couldn't delete snapshot working directory:" + workingDir);
251      }
252      tableLockToRelease.release();
253    }
254  }
255
256  /**
257   * Reset the manager to allow another snapshot to proceed.
258   * Commits the snapshot process by moving the working snapshot
259   * to the finalized filepath
260   *
261   * @param snapshotDir The file path of the completed snapshots
262   * @param workingDir  The file path of the in progress snapshots
263   * @param fs The file system of the completed snapshots
264   * @param workingDirFs The file system of the in progress snapshots
265   *
266   * @throws SnapshotCreationException if the snapshot could not be moved
267   * @throws IOException the filesystem could not be reached
268   */
269  public void completeSnapshot(Path snapshotDir, Path workingDir, FileSystem fs,
270      FileSystem workingDirFs) throws SnapshotCreationException, IOException {
271    LOG.debug("Sentinel is done, just moving the snapshot from " + workingDir + " to "
272        + snapshotDir);
273    // If the working and completed snapshot directory are on the same file system, attempt
274    // to rename the working snapshot directory to the completed location. If that fails,
275    // or the file systems differ, attempt to copy the directory over, throwing an exception
276    // if this fails
277    URI workingURI = workingDirFs.getUri();
278    URI rootURI = fs.getUri();
279    if ((!workingURI.getScheme().equals(rootURI.getScheme()) ||
280        workingURI.getAuthority() == null ||
281        !workingURI.getAuthority().equals(rootURI.getAuthority()) ||
282        workingURI.getUserInfo() == null ||
283        !workingURI.getUserInfo().equals(rootURI.getUserInfo()) ||
284        !fs.rename(workingDir, snapshotDir)) && !FileUtil.copy(workingDirFs, workingDir, fs,
285        snapshotDir, true, true, this.conf)) {
286      throw new SnapshotCreationException("Failed to copy working directory(" + workingDir
287          + ") to completed directory(" + snapshotDir + ").");
288    }
289    finished = true;
290  }
291
292  /**
293   * When taking snapshot, first we must acquire the exclusive table lock to confirm that there are
294   * no ongoing merge/split procedures. But later, we should try our best to release the exclusive
295   * lock as this may hurt the availability, because we need to hold the shared lock when assigning
296   * regions.
297   * <p/>
298   * See HBASE-21480 for more details.
299   */
300  protected abstract boolean downgradeToSharedTableLock();
301
302  /**
303   * Snapshot the specified regions
304   */
305  protected abstract void snapshotRegions(List<Pair<RegionInfo, ServerName>> regions)
306      throws IOException, KeeperException;
307
308  /**
309   * Take a snapshot of the specified disabled region
310   */
311  protected void snapshotDisabledRegion(final RegionInfo regionInfo)
312      throws IOException {
313    snapshotManifest.addRegion(FSUtils.getTableDir(rootDir, snapshotTable), regionInfo);
314    monitor.rethrowException();
315    status.setStatus("Completed referencing HFiles for offline region " + regionInfo.toString() +
316        " of table: " + snapshotTable);
317  }
318
319  @Override
320  public void cancel(String why) {
321    if (finished) return;
322
323    this.finished = true;
324    LOG.info("Stop taking snapshot=" + ClientSnapshotDescriptionUtils.toString(snapshot) +
325        " because: " + why);
326    CancellationException ce = new CancellationException(why);
327    monitor.receive(new ForeignException(master.getServerName().toString(), ce));
328  }
329
330  @Override
331  public boolean isFinished() {
332    return finished;
333  }
334
335  @Override
336  public long getCompletionTimestamp() {
337    return this.status.getCompletionTimestamp();
338  }
339
340  @Override
341  public SnapshotDescription getSnapshot() {
342    return snapshot;
343  }
344
345  @Override
346  public ForeignException getExceptionIfFailed() {
347    return monitor.getException();
348  }
349
350  @Override
351  public void rethrowExceptionIfFailed() throws ForeignException {
352    monitor.rethrowException();
353  }
354
355  @Override
356  public void rethrowException() throws ForeignException {
357    monitor.rethrowException();
358  }
359
360  @Override
361  public boolean hasException() {
362    return monitor.hasException();
363  }
364
365  @Override
366  public ForeignException getException() {
367    return monitor.getException();
368  }
369}