001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master.snapshot; 019 020import static org.apache.hadoop.hbase.HConstants.DEFAULT_HBASE_RPC_TIMEOUT; 021import static org.apache.hadoop.hbase.HConstants.HBASE_RPC_TIMEOUT_KEY; 022 023import java.io.IOException; 024import java.util.List; 025import java.util.concurrent.CancellationException; 026import org.apache.hadoop.conf.Configuration; 027import org.apache.hadoop.fs.FileSystem; 028import org.apache.hadoop.fs.Path; 029import org.apache.hadoop.hbase.DoNotRetryIOException; 030import org.apache.hadoop.hbase.ServerName; 031import org.apache.hadoop.hbase.TableName; 032import org.apache.hadoop.hbase.client.RegionInfo; 033import org.apache.hadoop.hbase.client.TableDescriptor; 034import org.apache.hadoop.hbase.client.TableDescriptorBuilder; 035import org.apache.hadoop.hbase.errorhandling.ForeignException; 036import org.apache.hadoop.hbase.errorhandling.ForeignExceptionDispatcher; 037import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare; 038import org.apache.hadoop.hbase.executor.EventHandler; 039import org.apache.hadoop.hbase.executor.EventType; 040import org.apache.hadoop.hbase.master.MasterServices; 041import org.apache.hadoop.hbase.master.MetricsSnapshot; 042import org.apache.hadoop.hbase.master.SnapshotSentinel; 043import org.apache.hadoop.hbase.master.locking.LockManager; 044import org.apache.hadoop.hbase.master.locking.LockManager.MasterLock; 045import org.apache.hadoop.hbase.monitoring.MonitoredTask; 046import org.apache.hadoop.hbase.monitoring.TaskMonitor; 047import org.apache.hadoop.hbase.procedure2.LockType; 048import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils; 049import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils; 050import org.apache.hadoop.hbase.snapshot.SnapshotManifest; 051import org.apache.hadoop.hbase.snapshot.SnapshotTTLExpiredException; 052import org.apache.hadoop.hbase.util.CommonFSUtils; 053import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; 054import org.apache.hadoop.hbase.util.Pair; 055import org.apache.yetus.audience.InterfaceAudience; 056import org.apache.zookeeper.KeeperException; 057import org.slf4j.Logger; 058import org.slf4j.LoggerFactory; 059 060import org.apache.hbase.thirdparty.com.google.common.base.Preconditions; 061 062import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; 063import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription; 064 065/** 066 * A handler for taking snapshots from the master. This is not a subclass of TableEventHandler 067 * because using that would incur an extra hbase:meta scan. The {@link #snapshotRegions(List)} call 068 * should get implemented for each snapshot flavor. 069 */ 070@InterfaceAudience.Private 071public abstract class TakeSnapshotHandler extends EventHandler 072 implements SnapshotSentinel, ForeignExceptionSnare { 073 private static final Logger LOG = LoggerFactory.getLogger(TakeSnapshotHandler.class); 074 public static final String HBASE_SNAPSHOT_MASTER_LOCK_ACQUIRE_TIMEOUT = 075 "hbase.snapshot.master.lock.acquire.timeout"; 076 077 private volatile boolean finished; 078 079 // none of these should ever be null 080 protected final MasterServices master; 081 protected final MetricsSnapshot metricsSnapshot = new MetricsSnapshot(); 082 protected final SnapshotDescription snapshot; 083 protected final Configuration conf; 084 protected final FileSystem rootFs; 085 protected final FileSystem workingDirFs; 086 protected final Path rootDir; 087 private final Path snapshotDir; 088 protected final Path workingDir; 089 private final MasterSnapshotVerifier verifier; 090 protected final ForeignExceptionDispatcher monitor; 091 private final LockManager.MasterLock tableLock; 092 protected final MonitoredTask status; 093 protected final TableName snapshotTable; 094 protected final SnapshotManifest snapshotManifest; 095 protected final SnapshotManager snapshotManager; 096 /** 097 * Snapshot creation requires table lock. If any region of the table is in transition, table lock 098 * cannot be acquired by LockProcedure and hence snapshot creation could hang for potentially very 099 * long time. This timeout will ensure snapshot creation fails-fast by waiting for only given 100 * timeout. 101 */ 102 private final long lockAcquireTimeoutMs; 103 104 protected TableDescriptor htd; 105 106 /** 107 * @param snapshot descriptor of the snapshot to take 108 * @param masterServices master services provider 109 * @throws IllegalArgumentException if the working snapshot directory set from the configuration 110 * is the same as the completed snapshot directory 111 * @throws IOException if the file system of the working snapshot directory cannot be 112 * determined 113 */ 114 public TakeSnapshotHandler(SnapshotDescription snapshot, final MasterServices masterServices, 115 final SnapshotManager snapshotManager) throws IOException { 116 super(masterServices, EventType.C_M_SNAPSHOT_TABLE); 117 assert snapshot != null : "SnapshotDescription must not be nul1"; 118 assert masterServices != null : "MasterServices must not be nul1"; 119 this.master = masterServices; 120 this.conf = this.master.getConfiguration(); 121 this.rootDir = this.master.getMasterFileSystem().getRootDir(); 122 this.workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir, conf); 123 Preconditions.checkArgument( 124 !SnapshotDescriptionUtils.isSubDirectoryOf(workingDir, rootDir) 125 || SnapshotDescriptionUtils.isWithinDefaultWorkingDir(workingDir, conf), 126 "The working directory " + workingDir + " cannot be in the root directory unless it is " 127 + "within the default working directory"); 128 129 this.snapshot = snapshot; 130 this.snapshotManager = snapshotManager; 131 this.snapshotTable = TableName.valueOf(snapshot.getTable()); 132 this.rootFs = this.master.getMasterFileSystem().getFileSystem(); 133 this.snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshot, rootDir); 134 this.workingDirFs = this.workingDir.getFileSystem(this.conf); 135 this.monitor = new ForeignExceptionDispatcher(snapshot.getName()); 136 137 this.tableLock = master.getLockManager().createMasterLock(snapshotTable, LockType.EXCLUSIVE, 138 this.getClass().getName() + ": take snapshot " + snapshot.getName()); 139 140 // prepare the verify 141 this.verifier = new MasterSnapshotVerifier(masterServices, snapshot, workingDirFs); 142 // update the running tasks 143 this.status = TaskMonitor.get().createStatus( 144 "Taking " + snapshot.getType() + " snapshot on table: " + snapshotTable, false, true); 145 this.snapshotManifest = 146 SnapshotManifest.create(conf, rootFs, workingDir, snapshot, monitor, status); 147 this.lockAcquireTimeoutMs = conf.getLong(HBASE_SNAPSHOT_MASTER_LOCK_ACQUIRE_TIMEOUT, 148 conf.getLong(HBASE_RPC_TIMEOUT_KEY, DEFAULT_HBASE_RPC_TIMEOUT)); 149 } 150 151 private TableDescriptor loadTableDescriptor() throws IOException { 152 TableDescriptor htd = this.master.getTableDescriptors().get(snapshotTable); 153 if (htd == null) { 154 throw new IOException("TableDescriptor missing for " + snapshotTable); 155 } 156 if (htd.getMaxFileSize() == -1 && this.snapshot.getMaxFileSize() > 0) { 157 htd = TableDescriptorBuilder.newBuilder(htd).setValue(TableDescriptorBuilder.MAX_FILESIZE, 158 Long.toString(this.snapshot.getMaxFileSize())).build(); 159 } 160 return htd; 161 } 162 163 @Override 164 public TakeSnapshotHandler prepare() throws Exception { 165 super.prepare(); 166 // after this, you should ensure to release this lock in case of exceptions 167 if (this.tableLock.tryAcquire(this.lockAcquireTimeoutMs)) { 168 try { 169 this.htd = loadTableDescriptor(); // check that .tableinfo is present 170 } catch (Exception e) { 171 this.tableLock.release(); 172 throw e; 173 } 174 } else { 175 LOG.error("Master lock could not be acquired in {} ms", lockAcquireTimeoutMs); 176 throw new DoNotRetryIOException("Master lock could not be acquired"); 177 } 178 return this; 179 } 180 181 /** 182 * Execute the core common portions of taking a snapshot. The {@link #snapshotRegions(List)} call 183 * should get implemented for each snapshot flavor. 184 */ 185 @Override 186 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "REC_CATCH_EXCEPTION", 187 justification = "Intentional") 188 public void process() { 189 String msg = "Running " + snapshot.getType() + " table snapshot " + snapshot.getName() + " " 190 + eventType + " on table " + snapshotTable; 191 LOG.info(msg); 192 MasterLock tableLockToRelease = this.tableLock; 193 status.setStatus(msg); 194 try { 195 if (downgradeToSharedTableLock()) { 196 // release the exclusive lock and hold the shared lock instead 197 tableLockToRelease = master.getLockManager().createMasterLock(snapshotTable, 198 LockType.SHARED, this.getClass().getName() + ": take snapshot " + snapshot.getName()); 199 tableLock.release(); 200 boolean isTableLockAcquired = tableLockToRelease.tryAcquire(this.lockAcquireTimeoutMs); 201 if (!isTableLockAcquired) { 202 LOG.error("Could not acquire shared lock on table {} in {} ms", snapshotTable, 203 lockAcquireTimeoutMs); 204 throw new IOException("Could not acquire shared lock on table " + snapshotTable); 205 } 206 } 207 // If regions move after this meta scan, the region specific snapshot should fail, triggering 208 // an external exception that gets captured here. 209 210 // write down the snapshot info in the working directory 211 SnapshotDescriptionUtils.writeSnapshotInfo(snapshot, workingDir, workingDirFs); 212 snapshotManifest.addTableDescriptor(this.htd); 213 monitor.rethrowException(); 214 215 List<Pair<RegionInfo, ServerName>> regionsAndLocations = 216 master.getAssignmentManager().getTableRegionsAndLocations(snapshotTable, false); 217 218 // run the snapshot 219 snapshotRegions(regionsAndLocations); 220 monitor.rethrowException(); 221 222 // flush the in-memory state, and write the single manifest 223 status.setStatus("Consolidate snapshot: " + snapshot.getName()); 224 snapshotManifest.consolidate(); 225 226 // verify the snapshot is valid 227 status.setStatus("Verifying snapshot: " + snapshot.getName()); 228 verifier.verifySnapshot(workingDir, true); 229 230 // HBASE-29296 check snapshot is not expired 231 if ( 232 SnapshotDescriptionUtils.isExpiredSnapshot(snapshot.getTtl(), snapshot.getCreationTime(), 233 EnvironmentEdgeManager.currentTime()) 234 ) { 235 throw new SnapshotTTLExpiredException(ProtobufUtil.createSnapshotDesc(snapshot)); 236 } 237 238 // complete the snapshot, atomically moving from tmp to .snapshot dir. 239 SnapshotDescriptionUtils.completeSnapshot(this.snapshotDir, this.workingDir, this.rootFs, 240 this.workingDirFs, this.conf); 241 finished = true; 242 msg = "Snapshot " + snapshot.getName() + " of table " + snapshotTable + " completed"; 243 status.markComplete(msg); 244 LOG.info(msg); 245 metricsSnapshot.addSnapshot(status.getCompletionTimestamp() - status.getStartTime()); 246 if (master.getMasterCoprocessorHost() != null) { 247 master.getMasterCoprocessorHost() 248 .postCompletedSnapshotAction(ProtobufUtil.createSnapshotDesc(snapshot), this.htd); 249 } 250 } catch (Exception e) { // FindBugs: REC_CATCH_EXCEPTION 251 status.abort("Failed to complete snapshot " + snapshot.getName() + " on table " 252 + snapshotTable + " because " + e.getMessage()); 253 String reason = "Failed taking snapshot " + ClientSnapshotDescriptionUtils.toString(snapshot) 254 + " due to exception:" + e.getMessage(); 255 LOG.error(reason, e); 256 ForeignException ee = new ForeignException(reason, e); 257 monitor.receive(ee); 258 // need to mark this completed to close off and allow cleanup to happen. 259 cancel(reason); 260 } finally { 261 LOG.debug("Launching cleanup of working dir:" + workingDir); 262 try { 263 // if the working dir is still present, the snapshot has failed. it is present we delete 264 // it. 265 if (workingDirFs.exists(workingDir) && !workingDirFs.delete(workingDir, true)) { 266 LOG.error("Couldn't delete snapshot working directory: {}", workingDir); 267 } 268 } catch (IOException e) { 269 LOG.error("Couldn't get or delete snapshot working directory: {}", workingDir, e); 270 } 271 if (LOG.isDebugEnabled()) { 272 LOG.debug("Table snapshot journal : \n" + status.prettyPrintJournal()); 273 } 274 tableLockToRelease.release(); 275 } 276 } 277 278 /** 279 * When taking snapshot, first we must acquire the exclusive table lock to confirm that there are 280 * no ongoing merge/split procedures. But later, we should try our best to release the exclusive 281 * lock as this may hurt the availability, because we need to hold the shared lock when assigning 282 * regions. 283 * <p/> 284 * See HBASE-21480 for more details. 285 */ 286 protected abstract boolean downgradeToSharedTableLock(); 287 288 /** 289 * Snapshot the specified regions 290 */ 291 protected abstract void snapshotRegions(List<Pair<RegionInfo, ServerName>> regions) 292 throws IOException, KeeperException; 293 294 /** 295 * Take a snapshot of the specified disabled region 296 */ 297 protected void snapshotDisabledRegion(final RegionInfo regionInfo) throws IOException { 298 snapshotManifest.addRegion(CommonFSUtils.getTableDir(rootDir, snapshotTable), regionInfo); 299 monitor.rethrowException(); 300 status.setStatus("Completed referencing HFiles for offline region " + regionInfo.toString() 301 + " of table: " + snapshotTable); 302 } 303 304 @Override 305 public void cancel(String why) { 306 if (finished) return; 307 308 this.finished = true; 309 LOG.info("Stop taking snapshot=" + ClientSnapshotDescriptionUtils.toString(snapshot) 310 + " because: " + why); 311 CancellationException ce = new CancellationException(why); 312 monitor.receive(new ForeignException(master.getServerName().toString(), ce)); 313 } 314 315 @Override 316 public boolean isFinished() { 317 return finished; 318 } 319 320 @Override 321 public long getCompletionTimestamp() { 322 return this.status.getCompletionTimestamp(); 323 } 324 325 @Override 326 public SnapshotDescription getSnapshot() { 327 return snapshot; 328 } 329 330 @Override 331 public ForeignException getExceptionIfFailed() { 332 return monitor.getException(); 333 } 334 335 @Override 336 public void rethrowExceptionIfFailed() throws ForeignException { 337 monitor.rethrowException(); 338 } 339 340 @Override 341 public void rethrowException() throws ForeignException { 342 monitor.rethrowException(); 343 } 344 345 @Override 346 public boolean hasException() { 347 return monitor.hasException(); 348 } 349 350 @Override 351 public ForeignException getException() { 352 return monitor.getException(); 353 } 354}