001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.snapshot;
019
020import static org.apache.hadoop.hbase.HConstants.DEFAULT_HBASE_RPC_TIMEOUT;
021import static org.apache.hadoop.hbase.HConstants.HBASE_RPC_TIMEOUT_KEY;
022
023import java.io.IOException;
024import java.util.List;
025import java.util.concurrent.CancellationException;
026import org.apache.hadoop.conf.Configuration;
027import org.apache.hadoop.fs.FileSystem;
028import org.apache.hadoop.fs.Path;
029import org.apache.hadoop.hbase.DoNotRetryIOException;
030import org.apache.hadoop.hbase.ServerName;
031import org.apache.hadoop.hbase.TableName;
032import org.apache.hadoop.hbase.client.RegionInfo;
033import org.apache.hadoop.hbase.client.TableDescriptor;
034import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
035import org.apache.hadoop.hbase.errorhandling.ForeignException;
036import org.apache.hadoop.hbase.errorhandling.ForeignExceptionDispatcher;
037import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
038import org.apache.hadoop.hbase.executor.EventHandler;
039import org.apache.hadoop.hbase.executor.EventType;
040import org.apache.hadoop.hbase.master.MasterServices;
041import org.apache.hadoop.hbase.master.MetricsSnapshot;
042import org.apache.hadoop.hbase.master.SnapshotSentinel;
043import org.apache.hadoop.hbase.master.locking.LockManager;
044import org.apache.hadoop.hbase.master.locking.LockManager.MasterLock;
045import org.apache.hadoop.hbase.monitoring.MonitoredTask;
046import org.apache.hadoop.hbase.monitoring.TaskMonitor;
047import org.apache.hadoop.hbase.procedure2.LockType;
048import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils;
049import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
050import org.apache.hadoop.hbase.snapshot.SnapshotManifest;
051import org.apache.hadoop.hbase.snapshot.SnapshotTTLExpiredException;
052import org.apache.hadoop.hbase.util.CommonFSUtils;
053import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
054import org.apache.hadoop.hbase.util.Pair;
055import org.apache.yetus.audience.InterfaceAudience;
056import org.apache.zookeeper.KeeperException;
057import org.slf4j.Logger;
058import org.slf4j.LoggerFactory;
059
060import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
061
062import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
063import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription;
064
065/**
066 * A handler for taking snapshots from the master. This is not a subclass of TableEventHandler
067 * because using that would incur an extra hbase:meta scan. The {@link #snapshotRegions(List)} call
068 * should get implemented for each snapshot flavor.
069 */
070@InterfaceAudience.Private
071public abstract class TakeSnapshotHandler extends EventHandler
072  implements SnapshotSentinel, ForeignExceptionSnare {
073  private static final Logger LOG = LoggerFactory.getLogger(TakeSnapshotHandler.class);
074  public static final String HBASE_SNAPSHOT_MASTER_LOCK_ACQUIRE_TIMEOUT =
075    "hbase.snapshot.master.lock.acquire.timeout";
076
077  private volatile boolean finished;
078
079  // none of these should ever be null
080  protected final MasterServices master;
081  protected final MetricsSnapshot metricsSnapshot = new MetricsSnapshot();
082  protected final SnapshotDescription snapshot;
083  protected final Configuration conf;
084  protected final FileSystem rootFs;
085  protected final FileSystem workingDirFs;
086  protected final Path rootDir;
087  private final Path snapshotDir;
088  protected final Path workingDir;
089  private final MasterSnapshotVerifier verifier;
090  protected final ForeignExceptionDispatcher monitor;
091  private final LockManager.MasterLock tableLock;
092  protected final MonitoredTask status;
093  protected final TableName snapshotTable;
094  protected final SnapshotManifest snapshotManifest;
095  protected final SnapshotManager snapshotManager;
096  /**
097   * Snapshot creation requires table lock. If any region of the table is in transition, table lock
098   * cannot be acquired by LockProcedure and hence snapshot creation could hang for potentially very
099   * long time. This timeout will ensure snapshot creation fails-fast by waiting for only given
100   * timeout.
101   */
102  private final long lockAcquireTimeoutMs;
103
104  protected TableDescriptor htd;
105
106  /**
107   * @param snapshot       descriptor of the snapshot to take
108   * @param masterServices master services provider
109   * @throws IllegalArgumentException if the working snapshot directory set from the configuration
110   *                                  is the same as the completed snapshot directory
111   * @throws IOException              if the file system of the working snapshot directory cannot be
112   *                                  determined
113   */
114  public TakeSnapshotHandler(SnapshotDescription snapshot, final MasterServices masterServices,
115    final SnapshotManager snapshotManager) throws IOException {
116    super(masterServices, EventType.C_M_SNAPSHOT_TABLE);
117    assert snapshot != null : "SnapshotDescription must not be nul1";
118    assert masterServices != null : "MasterServices must not be nul1";
119    this.master = masterServices;
120    this.conf = this.master.getConfiguration();
121    this.rootDir = this.master.getMasterFileSystem().getRootDir();
122    this.workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir, conf);
123    Preconditions.checkArgument(
124      !SnapshotDescriptionUtils.isSubDirectoryOf(workingDir, rootDir)
125        || SnapshotDescriptionUtils.isWithinDefaultWorkingDir(workingDir, conf),
126      "The working directory " + workingDir + " cannot be in the root directory unless it is "
127        + "within the default working directory");
128
129    this.snapshot = snapshot;
130    this.snapshotManager = snapshotManager;
131    this.snapshotTable = TableName.valueOf(snapshot.getTable());
132    this.rootFs = this.master.getMasterFileSystem().getFileSystem();
133    this.snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshot, rootDir);
134    this.workingDirFs = this.workingDir.getFileSystem(this.conf);
135    this.monitor = new ForeignExceptionDispatcher(snapshot.getName());
136
137    this.tableLock = master.getLockManager().createMasterLock(snapshotTable, LockType.EXCLUSIVE,
138      this.getClass().getName() + ": take snapshot " + snapshot.getName());
139
140    // prepare the verify
141    this.verifier = new MasterSnapshotVerifier(masterServices, snapshot, workingDirFs);
142    // update the running tasks
143    this.status = TaskMonitor.get().createStatus(
144      "Taking " + snapshot.getType() + " snapshot on table: " + snapshotTable, false, true);
145    this.snapshotManifest =
146      SnapshotManifest.create(conf, rootFs, workingDir, snapshot, monitor, status);
147    this.lockAcquireTimeoutMs = conf.getLong(HBASE_SNAPSHOT_MASTER_LOCK_ACQUIRE_TIMEOUT,
148      conf.getLong(HBASE_RPC_TIMEOUT_KEY, DEFAULT_HBASE_RPC_TIMEOUT));
149  }
150
151  private TableDescriptor loadTableDescriptor() throws IOException {
152    TableDescriptor htd = this.master.getTableDescriptors().get(snapshotTable);
153    if (htd == null) {
154      throw new IOException("TableDescriptor missing for " + snapshotTable);
155    }
156    if (htd.getMaxFileSize() == -1 && this.snapshot.getMaxFileSize() > 0) {
157      htd = TableDescriptorBuilder.newBuilder(htd).setValue(TableDescriptorBuilder.MAX_FILESIZE,
158        Long.toString(this.snapshot.getMaxFileSize())).build();
159    }
160    return htd;
161  }
162
163  @Override
164  public TakeSnapshotHandler prepare() throws Exception {
165    super.prepare();
166    // after this, you should ensure to release this lock in case of exceptions
167    if (this.tableLock.tryAcquire(this.lockAcquireTimeoutMs)) {
168      try {
169        this.htd = loadTableDescriptor(); // check that .tableinfo is present
170      } catch (Exception e) {
171        this.tableLock.release();
172        throw e;
173      }
174    } else {
175      LOG.error("Master lock could not be acquired in {} ms", lockAcquireTimeoutMs);
176      throw new DoNotRetryIOException("Master lock could not be acquired");
177    }
178    return this;
179  }
180
181  /**
182   * Execute the core common portions of taking a snapshot. The {@link #snapshotRegions(List)} call
183   * should get implemented for each snapshot flavor.
184   */
185  @Override
186  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "REC_CATCH_EXCEPTION",
187      justification = "Intentional")
188  public void process() {
189    String msg = "Running " + snapshot.getType() + " table snapshot " + snapshot.getName() + " "
190      + eventType + " on table " + snapshotTable;
191    LOG.info(msg);
192    MasterLock tableLockToRelease = this.tableLock;
193    status.setStatus(msg);
194    try {
195      if (downgradeToSharedTableLock()) {
196        // release the exclusive lock and hold the shared lock instead
197        tableLockToRelease = master.getLockManager().createMasterLock(snapshotTable,
198          LockType.SHARED, this.getClass().getName() + ": take snapshot " + snapshot.getName());
199        tableLock.release();
200        boolean isTableLockAcquired = tableLockToRelease.tryAcquire(this.lockAcquireTimeoutMs);
201        if (!isTableLockAcquired) {
202          LOG.error("Could not acquire shared lock on table {} in {} ms", snapshotTable,
203            lockAcquireTimeoutMs);
204          throw new IOException("Could not acquire shared lock on table " + snapshotTable);
205        }
206      }
207      // If regions move after this meta scan, the region specific snapshot should fail, triggering
208      // an external exception that gets captured here.
209
210      // write down the snapshot info in the working directory
211      SnapshotDescriptionUtils.writeSnapshotInfo(snapshot, workingDir, workingDirFs);
212      snapshotManifest.addTableDescriptor(this.htd);
213      monitor.rethrowException();
214
215      List<Pair<RegionInfo, ServerName>> regionsAndLocations =
216        master.getAssignmentManager().getTableRegionsAndLocations(snapshotTable, false);
217
218      // run the snapshot
219      snapshotRegions(regionsAndLocations);
220      monitor.rethrowException();
221
222      // flush the in-memory state, and write the single manifest
223      status.setStatus("Consolidate snapshot: " + snapshot.getName());
224      snapshotManifest.consolidate();
225
226      // verify the snapshot is valid
227      status.setStatus("Verifying snapshot: " + snapshot.getName());
228      verifier.verifySnapshot(workingDir, true);
229
230      // HBASE-29296 check snapshot is not expired
231      if (
232        SnapshotDescriptionUtils.isExpiredSnapshot(snapshot.getTtl(), snapshot.getCreationTime(),
233          EnvironmentEdgeManager.currentTime())
234      ) {
235        throw new SnapshotTTLExpiredException(ProtobufUtil.createSnapshotDesc(snapshot));
236      }
237
238      // complete the snapshot, atomically moving from tmp to .snapshot dir.
239      SnapshotDescriptionUtils.completeSnapshot(this.snapshotDir, this.workingDir, this.rootFs,
240        this.workingDirFs, this.conf);
241      finished = true;
242      msg = "Snapshot " + snapshot.getName() + " of table " + snapshotTable + " completed";
243      status.markComplete(msg);
244      LOG.info(msg);
245      metricsSnapshot.addSnapshot(status.getCompletionTimestamp() - status.getStartTime());
246      if (master.getMasterCoprocessorHost() != null) {
247        master.getMasterCoprocessorHost()
248          .postCompletedSnapshotAction(ProtobufUtil.createSnapshotDesc(snapshot), this.htd);
249      }
250    } catch (Exception e) { // FindBugs: REC_CATCH_EXCEPTION
251      status.abort("Failed to complete snapshot " + snapshot.getName() + " on table "
252        + snapshotTable + " because " + e.getMessage());
253      String reason = "Failed taking snapshot " + ClientSnapshotDescriptionUtils.toString(snapshot)
254        + " due to exception:" + e.getMessage();
255      LOG.error(reason, e);
256      ForeignException ee = new ForeignException(reason, e);
257      monitor.receive(ee);
258      // need to mark this completed to close off and allow cleanup to happen.
259      cancel(reason);
260    } finally {
261      LOG.debug("Launching cleanup of working dir:" + workingDir);
262      try {
263        // if the working dir is still present, the snapshot has failed. it is present we delete
264        // it.
265        if (workingDirFs.exists(workingDir) && !workingDirFs.delete(workingDir, true)) {
266          LOG.error("Couldn't delete snapshot working directory: {}", workingDir);
267        }
268      } catch (IOException e) {
269        LOG.error("Couldn't get or delete snapshot working directory: {}", workingDir, e);
270      }
271      if (LOG.isDebugEnabled()) {
272        LOG.debug("Table snapshot journal : \n" + status.prettyPrintJournal());
273      }
274      tableLockToRelease.release();
275    }
276  }
277
278  /**
279   * When taking snapshot, first we must acquire the exclusive table lock to confirm that there are
280   * no ongoing merge/split procedures. But later, we should try our best to release the exclusive
281   * lock as this may hurt the availability, because we need to hold the shared lock when assigning
282   * regions.
283   * <p/>
284   * See HBASE-21480 for more details.
285   */
286  protected abstract boolean downgradeToSharedTableLock();
287
288  /**
289   * Snapshot the specified regions
290   */
291  protected abstract void snapshotRegions(List<Pair<RegionInfo, ServerName>> regions)
292    throws IOException, KeeperException;
293
294  /**
295   * Take a snapshot of the specified disabled region
296   */
297  protected void snapshotDisabledRegion(final RegionInfo regionInfo) throws IOException {
298    snapshotManifest.addRegion(CommonFSUtils.getTableDir(rootDir, snapshotTable), regionInfo);
299    monitor.rethrowException();
300    status.setStatus("Completed referencing HFiles for offline region " + regionInfo.toString()
301      + " of table: " + snapshotTable);
302  }
303
304  @Override
305  public void cancel(String why) {
306    if (finished) return;
307
308    this.finished = true;
309    LOG.info("Stop taking snapshot=" + ClientSnapshotDescriptionUtils.toString(snapshot)
310      + " because: " + why);
311    CancellationException ce = new CancellationException(why);
312    monitor.receive(new ForeignException(master.getServerName().toString(), ce));
313  }
314
315  @Override
316  public boolean isFinished() {
317    return finished;
318  }
319
320  @Override
321  public long getCompletionTimestamp() {
322    return this.status.getCompletionTimestamp();
323  }
324
325  @Override
326  public SnapshotDescription getSnapshot() {
327    return snapshot;
328  }
329
330  @Override
331  public ForeignException getExceptionIfFailed() {
332    return monitor.getException();
333  }
334
335  @Override
336  public void rethrowExceptionIfFailed() throws ForeignException {
337    monitor.rethrowException();
338  }
339
340  @Override
341  public void rethrowException() throws ForeignException {
342    monitor.rethrowException();
343  }
344
345  @Override
346  public boolean hasException() {
347    return monitor.hasException();
348  }
349
350  @Override
351  public ForeignException getException() {
352    return monitor.getException();
353  }
354}