001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master.snapshot; 019 020import java.io.FileNotFoundException; 021import java.io.IOException; 022import java.net.URI; 023import java.util.HashSet; 024import java.util.List; 025import java.util.Set; 026import java.util.concurrent.CancellationException; 027import org.apache.hadoop.conf.Configuration; 028import org.apache.hadoop.fs.FileSystem; 029import org.apache.hadoop.fs.FileUtil; 030import org.apache.hadoop.fs.Path; 031import org.apache.hadoop.hbase.MetaTableAccessor; 032import org.apache.hadoop.hbase.ServerName; 033import org.apache.hadoop.hbase.TableName; 034import org.apache.hadoop.hbase.client.RegionInfo; 035import org.apache.hadoop.hbase.client.TableDescriptor; 036import org.apache.hadoop.hbase.errorhandling.ForeignException; 037import org.apache.hadoop.hbase.errorhandling.ForeignExceptionDispatcher; 038import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare; 039import org.apache.hadoop.hbase.executor.EventHandler; 040import org.apache.hadoop.hbase.executor.EventType; 041import org.apache.hadoop.hbase.master.MasterServices; 042import org.apache.hadoop.hbase.master.MetricsSnapshot; 043import org.apache.hadoop.hbase.master.SnapshotSentinel; 044import org.apache.hadoop.hbase.master.locking.LockManager; 045import org.apache.hadoop.hbase.master.locking.LockManager.MasterLock; 046import org.apache.hadoop.hbase.monitoring.MonitoredTask; 047import org.apache.hadoop.hbase.monitoring.TaskMonitor; 048import org.apache.hadoop.hbase.procedure2.LockType; 049import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils; 050import org.apache.hadoop.hbase.snapshot.SnapshotCreationException; 051import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils; 052import org.apache.hadoop.hbase.snapshot.SnapshotManifest; 053import org.apache.hadoop.hbase.util.FSUtils; 054import org.apache.hadoop.hbase.util.Pair; 055import org.apache.hadoop.hbase.zookeeper.MetaTableLocator; 056import org.apache.yetus.audience.InterfaceAudience; 057import org.apache.zookeeper.KeeperException; 058import org.slf4j.Logger; 059import org.slf4j.LoggerFactory; 060 061import org.apache.hbase.thirdparty.com.google.common.base.Preconditions; 062 063import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription; 064 065/** 066 * A handler for taking snapshots from the master. 067 * 068 * This is not a subclass of TableEventHandler because using that would incur an extra hbase:meta scan. 069 * 070 * The {@link #snapshotRegions(List)} call should get implemented for each snapshot flavor. 071 */ 072@InterfaceAudience.Private 073public abstract class TakeSnapshotHandler extends EventHandler implements SnapshotSentinel, 074 ForeignExceptionSnare { 075 private static final Logger LOG = LoggerFactory.getLogger(TakeSnapshotHandler.class); 076 077 private volatile boolean finished; 078 079 // none of these should ever be null 080 protected final MasterServices master; 081 protected final MetricsSnapshot metricsSnapshot = new MetricsSnapshot(); 082 protected final SnapshotDescription snapshot; 083 protected final Configuration conf; 084 protected final FileSystem rootFs; 085 protected final FileSystem workingDirFs; 086 protected final Path rootDir; 087 private final Path snapshotDir; 088 protected final Path workingDir; 089 private final MasterSnapshotVerifier verifier; 090 protected final ForeignExceptionDispatcher monitor; 091 private final LockManager.MasterLock tableLock; 092 protected final MonitoredTask status; 093 protected final TableName snapshotTable; 094 protected final SnapshotManifest snapshotManifest; 095 protected final SnapshotManager snapshotManager; 096 097 protected TableDescriptor htd; 098 099 /** 100 * @param snapshot descriptor of the snapshot to take 101 * @param masterServices master services provider 102 * @throws IllegalArgumentException if the working snapshot directory set from the 103 * configuration is the same as the completed snapshot directory 104 * @throws IOException if the file system of the working snapshot directory cannot be 105 * determined 106 */ 107 public TakeSnapshotHandler(SnapshotDescription snapshot, final MasterServices masterServices, 108 final SnapshotManager snapshotManager) throws IOException { 109 super(masterServices, EventType.C_M_SNAPSHOT_TABLE); 110 assert snapshot != null : "SnapshotDescription must not be nul1"; 111 assert masterServices != null : "MasterServices must not be nul1"; 112 this.master = masterServices; 113 this.conf = this.master.getConfiguration(); 114 this.rootDir = this.master.getMasterFileSystem().getRootDir(); 115 this.workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir, conf); 116 Preconditions.checkArgument(!SnapshotDescriptionUtils.isSubDirectoryOf(workingDir, rootDir) || 117 SnapshotDescriptionUtils.isWithinDefaultWorkingDir(workingDir, conf), 118 "The working directory " + workingDir + " cannot be in the root directory unless it is " 119 + "within the default working directory"); 120 121 this.snapshot = snapshot; 122 this.snapshotManager = snapshotManager; 123 this.snapshotTable = TableName.valueOf(snapshot.getTable()); 124 this.rootFs = this.master.getMasterFileSystem().getFileSystem(); 125 this.snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshot, rootDir); 126 this.workingDirFs = this.workingDir.getFileSystem(this.conf); 127 this.monitor = new ForeignExceptionDispatcher(snapshot.getName()); 128 this.snapshotManifest = SnapshotManifest.create(conf, rootFs, workingDir, snapshot, monitor); 129 130 this.tableLock = master.getLockManager().createMasterLock( 131 snapshotTable, LockType.EXCLUSIVE, 132 this.getClass().getName() + ": take snapshot " + snapshot.getName()); 133 134 // prepare the verify 135 this.verifier = new MasterSnapshotVerifier(masterServices, snapshot, workingDirFs); 136 // update the running tasks 137 this.status = TaskMonitor.get().createStatus( 138 "Taking " + snapshot.getType() + " snapshot on table: " + snapshotTable); 139 } 140 141 private TableDescriptor loadTableDescriptor() 142 throws FileNotFoundException, IOException { 143 TableDescriptor htd = 144 this.master.getTableDescriptors().get(snapshotTable); 145 if (htd == null) { 146 throw new IOException("TableDescriptor missing for " + snapshotTable); 147 } 148 return htd; 149 } 150 151 @Override 152 public TakeSnapshotHandler prepare() throws Exception { 153 super.prepare(); 154 // after this, you should ensure to release this lock in case of exceptions 155 this.tableLock.acquire(); 156 try { 157 this.htd = loadTableDescriptor(); // check that .tableinfo is present 158 } catch (Exception e) { 159 this.tableLock.release(); 160 throw e; 161 } 162 return this; 163 } 164 165 /** 166 * Execute the core common portions of taking a snapshot. The {@link #snapshotRegions(List)} 167 * call should get implemented for each snapshot flavor. 168 */ 169 @Override 170 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="REC_CATCH_EXCEPTION", 171 justification="Intentional") 172 public void process() { 173 String msg = "Running " + snapshot.getType() + " table snapshot " + snapshot.getName() + " " 174 + eventType + " on table " + snapshotTable; 175 LOG.info(msg); 176 MasterLock tableLockToRelease = this.tableLock; 177 status.setStatus(msg); 178 try { 179 if (downgradeToSharedTableLock()) { 180 // release the exclusive lock and hold the shared lock instead 181 tableLockToRelease = master.getLockManager().createMasterLock(snapshotTable, 182 LockType.SHARED, this.getClass().getName() + ": take snapshot " + snapshot.getName()); 183 tableLock.release(); 184 tableLockToRelease.acquire(); 185 } 186 // If regions move after this meta scan, the region specific snapshot should fail, triggering 187 // an external exception that gets captured here. 188 189 // write down the snapshot info in the working directory 190 SnapshotDescriptionUtils.writeSnapshotInfo(snapshot, workingDir, workingDirFs); 191 snapshotManifest.addTableDescriptor(this.htd); 192 monitor.rethrowException(); 193 194 List<Pair<RegionInfo, ServerName>> regionsAndLocations; 195 if (TableName.META_TABLE_NAME.equals(snapshotTable)) { 196 regionsAndLocations = MetaTableLocator.getMetaRegionsAndLocations( 197 server.getZooKeeper()); 198 } else { 199 regionsAndLocations = MetaTableAccessor.getTableRegionsAndLocations( 200 server.getConnection(), snapshotTable, false); 201 } 202 203 // run the snapshot 204 snapshotRegions(regionsAndLocations); 205 monitor.rethrowException(); 206 207 // extract each pair to separate lists 208 Set<String> serverNames = new HashSet<>(); 209 for (Pair<RegionInfo, ServerName> p : regionsAndLocations) { 210 if (p != null && p.getFirst() != null && p.getSecond() != null) { 211 RegionInfo hri = p.getFirst(); 212 if (hri.isOffline() && (hri.isSplit() || hri.isSplitParent())) continue; 213 serverNames.add(p.getSecond().toString()); 214 } 215 } 216 217 // flush the in-memory state, and write the single manifest 218 status.setStatus("Consolidate snapshot: " + snapshot.getName()); 219 snapshotManifest.consolidate(); 220 221 // verify the snapshot is valid 222 status.setStatus("Verifying snapshot: " + snapshot.getName()); 223 verifier.verifySnapshot(this.workingDir, serverNames); 224 225 // complete the snapshot, atomically moving from tmp to .snapshot dir. 226 completeSnapshot(this.snapshotDir, this.workingDir, this.rootFs, this.workingDirFs); 227 msg = "Snapshot " + snapshot.getName() + " of table " + snapshotTable + " completed"; 228 status.markComplete(msg); 229 LOG.info(msg); 230 metricsSnapshot.addSnapshot(status.getCompletionTimestamp() - status.getStartTime()); 231 } catch (Exception e) { // FindBugs: REC_CATCH_EXCEPTION 232 status.abort("Failed to complete snapshot " + snapshot.getName() + " on table " + 233 snapshotTable + " because " + e.getMessage()); 234 String reason = "Failed taking snapshot " + ClientSnapshotDescriptionUtils.toString(snapshot) 235 + " due to exception:" + e.getMessage(); 236 LOG.error(reason, e); 237 ForeignException ee = new ForeignException(reason, e); 238 monitor.receive(ee); 239 // need to mark this completed to close off and allow cleanup to happen. 240 cancel(reason); 241 } finally { 242 LOG.debug("Launching cleanup of working dir:" + workingDir); 243 try { 244 // if the working dir is still present, the snapshot has failed. it is present we delete 245 // it. 246 if (!workingDirFs.delete(workingDir, true)) { 247 LOG.error("Couldn't delete snapshot working directory:" + workingDir); 248 } 249 } catch (IOException e) { 250 LOG.error("Couldn't delete snapshot working directory:" + workingDir); 251 } 252 tableLockToRelease.release(); 253 } 254 } 255 256 /** 257 * Reset the manager to allow another snapshot to proceed. 258 * Commits the snapshot process by moving the working snapshot 259 * to the finalized filepath 260 * 261 * @param snapshotDir The file path of the completed snapshots 262 * @param workingDir The file path of the in progress snapshots 263 * @param fs The file system of the completed snapshots 264 * @param workingDirFs The file system of the in progress snapshots 265 * 266 * @throws SnapshotCreationException if the snapshot could not be moved 267 * @throws IOException the filesystem could not be reached 268 */ 269 public void completeSnapshot(Path snapshotDir, Path workingDir, FileSystem fs, 270 FileSystem workingDirFs) throws SnapshotCreationException, IOException { 271 LOG.debug("Sentinel is done, just moving the snapshot from " + workingDir + " to " 272 + snapshotDir); 273 // If the working and completed snapshot directory are on the same file system, attempt 274 // to rename the working snapshot directory to the completed location. If that fails, 275 // or the file systems differ, attempt to copy the directory over, throwing an exception 276 // if this fails 277 URI workingURI = workingDirFs.getUri(); 278 URI rootURI = fs.getUri(); 279 if ((!workingURI.getScheme().equals(rootURI.getScheme()) || 280 workingURI.getAuthority() == null || 281 !workingURI.getAuthority().equals(rootURI.getAuthority()) || 282 workingURI.getUserInfo() == null || 283 !workingURI.getUserInfo().equals(rootURI.getUserInfo()) || 284 !fs.rename(workingDir, snapshotDir)) && !FileUtil.copy(workingDirFs, workingDir, fs, 285 snapshotDir, true, true, this.conf)) { 286 throw new SnapshotCreationException("Failed to copy working directory(" + workingDir 287 + ") to completed directory(" + snapshotDir + ")."); 288 } 289 finished = true; 290 } 291 292 /** 293 * When taking snapshot, first we must acquire the exclusive table lock to confirm that there are 294 * no ongoing merge/split procedures. But later, we should try our best to release the exclusive 295 * lock as this may hurt the availability, because we need to hold the shared lock when assigning 296 * regions. 297 * <p/> 298 * See HBASE-21480 for more details. 299 */ 300 protected abstract boolean downgradeToSharedTableLock(); 301 302 /** 303 * Snapshot the specified regions 304 */ 305 protected abstract void snapshotRegions(List<Pair<RegionInfo, ServerName>> regions) 306 throws IOException, KeeperException; 307 308 /** 309 * Take a snapshot of the specified disabled region 310 */ 311 protected void snapshotDisabledRegion(final RegionInfo regionInfo) 312 throws IOException { 313 snapshotManifest.addRegion(FSUtils.getTableDir(rootDir, snapshotTable), regionInfo); 314 monitor.rethrowException(); 315 status.setStatus("Completed referencing HFiles for offline region " + regionInfo.toString() + 316 " of table: " + snapshotTable); 317 } 318 319 @Override 320 public void cancel(String why) { 321 if (finished) return; 322 323 this.finished = true; 324 LOG.info("Stop taking snapshot=" + ClientSnapshotDescriptionUtils.toString(snapshot) + 325 " because: " + why); 326 CancellationException ce = new CancellationException(why); 327 monitor.receive(new ForeignException(master.getServerName().toString(), ce)); 328 } 329 330 @Override 331 public boolean isFinished() { 332 return finished; 333 } 334 335 @Override 336 public long getCompletionTimestamp() { 337 return this.status.getCompletionTimestamp(); 338 } 339 340 @Override 341 public SnapshotDescription getSnapshot() { 342 return snapshot; 343 } 344 345 @Override 346 public ForeignException getExceptionIfFailed() { 347 return monitor.getException(); 348 } 349 350 @Override 351 public void rethrowExceptionIfFailed() throws ForeignException { 352 monitor.rethrowException(); 353 } 354 355 @Override 356 public void rethrowException() throws ForeignException { 357 monitor.rethrowException(); 358 } 359 360 @Override 361 public boolean hasException() { 362 return monitor.hasException(); 363 } 364 365 @Override 366 public ForeignException getException() { 367 return monitor.getException(); 368 } 369}