001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.snapshot;
019
020import java.io.FileNotFoundException;
021import java.io.IOException;
022import java.net.URI;
023import java.util.HashSet;
024import java.util.List;
025import java.util.Set;
026import java.util.concurrent.CancellationException;
027import org.apache.hadoop.conf.Configuration;
028import org.apache.hadoop.fs.FileSystem;
029import org.apache.hadoop.fs.FileUtil;
030import org.apache.hadoop.fs.Path;
031import org.apache.hadoop.hbase.MetaTableAccessor;
032import org.apache.hadoop.hbase.ServerName;
033import org.apache.hadoop.hbase.TableName;
034import org.apache.hadoop.hbase.client.RegionInfo;
035import org.apache.hadoop.hbase.client.TableDescriptor;
036import org.apache.hadoop.hbase.errorhandling.ForeignException;
037import org.apache.hadoop.hbase.errorhandling.ForeignExceptionDispatcher;
038import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
039import org.apache.hadoop.hbase.executor.EventHandler;
040import org.apache.hadoop.hbase.executor.EventType;
041import org.apache.hadoop.hbase.master.MasterServices;
042import org.apache.hadoop.hbase.master.MetricsSnapshot;
043import org.apache.hadoop.hbase.master.SnapshotSentinel;
044import org.apache.hadoop.hbase.master.locking.LockManager;
045import org.apache.hadoop.hbase.master.locking.LockManager.MasterLock;
046import org.apache.hadoop.hbase.monitoring.MonitoredTask;
047import org.apache.hadoop.hbase.monitoring.TaskMonitor;
048import org.apache.hadoop.hbase.procedure2.LockType;
049import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils;
050import org.apache.hadoop.hbase.snapshot.SnapshotCreationException;
051import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
052import org.apache.hadoop.hbase.snapshot.SnapshotManifest;
053import org.apache.hadoop.hbase.util.CommonFSUtils;
054import org.apache.hadoop.hbase.util.Pair;
055import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
056import org.apache.yetus.audience.InterfaceAudience;
057import org.apache.zookeeper.KeeperException;
058import org.slf4j.Logger;
059import org.slf4j.LoggerFactory;
060
061import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
062
063import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
064import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription;
065
/**
 * A handler for taking snapshots from the master.
 * <p>
 * This is not a subclass of TableEventHandler because using that would incur an extra hbase:meta
 * scan.
 * <p>
 * The {@link #snapshotRegions(List)} call should get implemented for each snapshot flavor.
 * <p>
 * {@link #prepare()} acquires an exclusive lock on the snapshot table and loads its descriptor;
 * {@link #process()} then writes the snapshot into the working directory, verifies it and moves
 * it to the completed snapshot directory.
 */
@InterfaceAudience.Private
public abstract class TakeSnapshotHandler extends EventHandler implements SnapshotSentinel,
    ForeignExceptionSnare {
  private static final Logger LOG = LoggerFactory.getLogger(TakeSnapshotHandler.class);

  /** Set to true once the snapshot has completed or the handler has been cancelled. */
  private volatile boolean finished;

  // none of these should ever be null
  /** The master services this handler was created with. */
  protected final MasterServices master;
  /** Receives the elapsed time of each snapshot that completes in {@link #process()}. */
  protected final MetricsSnapshot metricsSnapshot = new MetricsSnapshot();
  /** Descriptor of the snapshot being taken. */
  protected final SnapshotDescription snapshot;
  /** The master's configuration. */
  protected final Configuration conf;
  /** File system obtained from the master file system. */
  protected final FileSystem rootFs;
  /** File system that {@link #workingDir} resolves to under {@link #conf}. */
  protected final FileSystem workingDirFs;
  /** Root directory reported by the master file system. */
  protected final Path rootDir;
  /** Directory the snapshot is moved to once it has completed. */
  private final Path snapshotDir;
  /** Directory the snapshot is written to while it is being taken. */
  protected final Path workingDir;
  /** Verifies the snapshot in the working directory before it is completed. */
  private final MasterSnapshotVerifier verifier;
  /** Collects failures of this snapshot; created with the snapshot name. */
  protected final ForeignExceptionDispatcher monitor;
  /** Exclusive lock on {@link #snapshotTable}, acquired in {@link #prepare()}. */
  private final LockManager.MasterLock tableLock;
  /** Monitored task used to report the progress of this snapshot. */
  protected final MonitoredTask status;
  /** Name of the table being snapshotted, parsed from the snapshot descriptor. */
  protected final TableName snapshotTable;
  /** Manifest built for this snapshot in the working directory. */
  protected final SnapshotManifest snapshotManifest;
  /** The snapshot manager passed to the constructor. */
  protected final SnapshotManager snapshotManager;

  /** Descriptor of the snapshot table; assigned by {@link #prepare()}, null before that. */
  protected TableDescriptor htd;
099
100  /**
101   * @param snapshot descriptor of the snapshot to take
102   * @param masterServices master services provider
103   * @throws IllegalArgumentException if the working snapshot directory set from the
104   *   configuration is the same as the completed snapshot directory
105   * @throws IOException if the file system of the working snapshot directory cannot be
106   *   determined
107   */
108  public TakeSnapshotHandler(SnapshotDescription snapshot, final MasterServices masterServices,
109                             final SnapshotManager snapshotManager) throws IOException {
110    super(masterServices, EventType.C_M_SNAPSHOT_TABLE);
111    assert snapshot != null : "SnapshotDescription must not be nul1";
112    assert masterServices != null : "MasterServices must not be nul1";
113    this.master = masterServices;
114    this.conf = this.master.getConfiguration();
115    this.rootDir = this.master.getMasterFileSystem().getRootDir();
116    this.workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir, conf);
117    Preconditions.checkArgument(!SnapshotDescriptionUtils.isSubDirectoryOf(workingDir, rootDir) ||
118            SnapshotDescriptionUtils.isWithinDefaultWorkingDir(workingDir, conf),
119        "The working directory " + workingDir + " cannot be in the root directory unless it is "
120            + "within the default working directory");
121
122    this.snapshot = snapshot;
123    this.snapshotManager = snapshotManager;
124    this.snapshotTable = TableName.valueOf(snapshot.getTable());
125    this.rootFs = this.master.getMasterFileSystem().getFileSystem();
126    this.snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshot, rootDir);
127    this.workingDirFs = this.workingDir.getFileSystem(this.conf);
128    this.monitor = new ForeignExceptionDispatcher(snapshot.getName());
129    this.snapshotManifest = SnapshotManifest.create(conf, rootFs, workingDir, snapshot, monitor);
130
131    this.tableLock = master.getLockManager().createMasterLock(
132        snapshotTable, LockType.EXCLUSIVE,
133        this.getClass().getName() + ": take snapshot " + snapshot.getName());
134
135    // prepare the verify
136    this.verifier = new MasterSnapshotVerifier(masterServices, snapshot, workingDirFs);
137    // update the running tasks
138    this.status = TaskMonitor.get().createStatus(
139      "Taking " + snapshot.getType() + " snapshot on table: " + snapshotTable);
140  }
141
142  private TableDescriptor loadTableDescriptor()
143      throws FileNotFoundException, IOException {
144    TableDescriptor htd =
145      this.master.getTableDescriptors().get(snapshotTable);
146    if (htd == null) {
147      throw new IOException("TableDescriptor missing for " + snapshotTable);
148    }
149    return htd;
150  }
151
152  @Override
153  public TakeSnapshotHandler prepare() throws Exception {
154    super.prepare();
155    // after this, you should ensure to release this lock in case of exceptions
156    this.tableLock.acquire();
157    try {
158      this.htd = loadTableDescriptor(); // check that .tableinfo is present
159    } catch (Exception e) {
160      this.tableLock.release();
161      throw e;
162    }
163    return this;
164  }
165
166  /**
167   * Execute the core common portions of taking a snapshot. The {@link #snapshotRegions(List)}
168   * call should get implemented for each snapshot flavor.
169   */
170  @Override
171  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="REC_CATCH_EXCEPTION",
172    justification="Intentional")
173  public void process() {
174    String msg = "Running " + snapshot.getType() + " table snapshot " + snapshot.getName() + " "
175        + eventType + " on table " + snapshotTable;
176    LOG.info(msg);
177    MasterLock tableLockToRelease = this.tableLock;
178    status.setStatus(msg);
179    try {
180      if (downgradeToSharedTableLock()) {
181        // release the exclusive lock and hold the shared lock instead
182        tableLockToRelease = master.getLockManager().createMasterLock(snapshotTable,
183          LockType.SHARED, this.getClass().getName() + ": take snapshot " + snapshot.getName());
184        tableLock.release();
185        tableLockToRelease.acquire();
186      }
187      // If regions move after this meta scan, the region specific snapshot should fail, triggering
188      // an external exception that gets captured here.
189
190      // write down the snapshot info in the working directory
191      SnapshotDescriptionUtils.writeSnapshotInfo(snapshot, workingDir, workingDirFs);
192      snapshotManifest.addTableDescriptor(this.htd);
193      monitor.rethrowException();
194
195      List<Pair<RegionInfo, ServerName>> regionsAndLocations;
196      if (TableName.META_TABLE_NAME.equals(snapshotTable)) {
197        regionsAndLocations = MetaTableLocator.getMetaRegionsAndLocations(
198          server.getZooKeeper());
199      } else {
200        regionsAndLocations = MetaTableAccessor.getTableRegionsAndLocations(
201          server.getConnection(), snapshotTable, false);
202      }
203
204      // run the snapshot
205      snapshotRegions(regionsAndLocations);
206      monitor.rethrowException();
207
208      // extract each pair to separate lists
209      Set<String> serverNames = new HashSet<>();
210      for (Pair<RegionInfo, ServerName> p : regionsAndLocations) {
211        if (p != null && p.getFirst() != null && p.getSecond() != null) {
212          RegionInfo hri = p.getFirst();
213          if (hri.isOffline() && (hri.isSplit() || hri.isSplitParent())) continue;
214          serverNames.add(p.getSecond().toString());
215        }
216      }
217
218      // flush the in-memory state, and write the single manifest
219      status.setStatus("Consolidate snapshot: " + snapshot.getName());
220      snapshotManifest.consolidate();
221
222      // verify the snapshot is valid
223      status.setStatus("Verifying snapshot: " + snapshot.getName());
224      verifier.verifySnapshot(this.workingDir, serverNames);
225
226      // complete the snapshot, atomically moving from tmp to .snapshot dir.
227      SnapshotDescriptionUtils.completeSnapshot(this.snapshotDir, this.workingDir, this.rootFs,
228        this.workingDirFs, this.conf);
229      finished = true;
230      msg = "Snapshot " + snapshot.getName() + " of table " + snapshotTable + " completed";
231      status.markComplete(msg);
232      LOG.info(msg);
233      metricsSnapshot.addSnapshot(status.getCompletionTimestamp() - status.getStartTime());
234      if (master.getMasterCoprocessorHost() != null) {
235        master.getMasterCoprocessorHost()
236            .postCompletedSnapshotAction(ProtobufUtil.createSnapshotDesc(snapshot), this.htd);
237      }
238    } catch (Exception e) { // FindBugs: REC_CATCH_EXCEPTION
239      status.abort("Failed to complete snapshot " + snapshot.getName() + " on table " +
240          snapshotTable + " because " + e.getMessage());
241      String reason = "Failed taking snapshot " + ClientSnapshotDescriptionUtils.toString(snapshot)
242          + " due to exception:" + e.getMessage();
243      LOG.error(reason, e);
244      ForeignException ee = new ForeignException(reason, e);
245      monitor.receive(ee);
246      // need to mark this completed to close off and allow cleanup to happen.
247      cancel(reason);
248    } finally {
249      LOG.debug("Launching cleanup of working dir:" + workingDir);
250      try {
251        // if the working dir is still present, the snapshot has failed.  it is present we delete
252        // it.
253        if (!workingDirFs.delete(workingDir, true)) {
254          LOG.error("Couldn't delete snapshot working directory:" + workingDir);
255        }
256      } catch (IOException e) {
257        LOG.error("Couldn't delete snapshot working directory:" + workingDir);
258      }
259      tableLockToRelease.release();
260    }
261  }
262
  /**
   * When taking snapshot, first we must acquire the exclusive table lock to confirm that there are
   * no ongoing merge/split procedures. But later, we should try our best to release the exclusive
   * lock as this may hurt the availability, because we need to hold the shared lock when assigning
   * regions.
   * <p>
   * See HBASE-21480 for more details.
   * @return {@code true} if {@link #process()} should release the exclusive table lock and hold a
   *         shared lock on the table for the rest of the snapshot, {@code false} to keep holding
   *         the exclusive lock
   */
  protected abstract boolean downgradeToSharedTableLock();
272
  /**
   * Snapshot the specified regions.
   * <p>
   * Called by {@link #process()} after the snapshot info and the table descriptor have been
   * written to the working directory. An exception thrown from here is caught by
   * {@link #process()} and fails the snapshot.
   * @param regions the regions of the snapshot table paired with their server locations, as
   *          looked up by {@link #process()}
   * @throws IOException if snapshotting the regions fails
   * @throws KeeperException declared for implementations to signal ZooKeeper failures
   */
  protected abstract void snapshotRegions(List<Pair<RegionInfo, ServerName>> regions)
      throws IOException, KeeperException;
278
279  /**
280   * Take a snapshot of the specified disabled region
281   */
282  protected void snapshotDisabledRegion(final RegionInfo regionInfo)
283      throws IOException {
284    snapshotManifest.addRegion(CommonFSUtils.getTableDir(rootDir, snapshotTable), regionInfo);
285    monitor.rethrowException();
286    status.setStatus("Completed referencing HFiles for offline region " + regionInfo.toString() +
287        " of table: " + snapshotTable);
288  }
289
290  @Override
291  public void cancel(String why) {
292    if (finished) return;
293
294    this.finished = true;
295    LOG.info("Stop taking snapshot=" + ClientSnapshotDescriptionUtils.toString(snapshot) +
296        " because: " + why);
297    CancellationException ce = new CancellationException(why);
298    monitor.receive(new ForeignException(master.getServerName().toString(), ce));
299  }
300
301  @Override
302  public boolean isFinished() {
303    return finished;
304  }
305
306  @Override
307  public long getCompletionTimestamp() {
308    return this.status.getCompletionTimestamp();
309  }
310
311  @Override
312  public SnapshotDescription getSnapshot() {
313    return snapshot;
314  }
315
316  @Override
317  public ForeignException getExceptionIfFailed() {
318    return monitor.getException();
319  }
320
321  @Override
322  public void rethrowExceptionIfFailed() throws ForeignException {
323    monitor.rethrowException();
324  }
325
326  @Override
327  public void rethrowException() throws ForeignException {
328    monitor.rethrowException();
329  }
330
331  @Override
332  public boolean hasException() {
333    return monitor.hasException();
334  }
335
336  @Override
337  public ForeignException getException() {
338    return monitor.getException();
339  }
340}