001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master.snapshot; 019 020import java.io.IOException; 021import java.util.List; 022import java.util.concurrent.CancellationException; 023import org.apache.hadoop.conf.Configuration; 024import org.apache.hadoop.fs.FileSystem; 025import org.apache.hadoop.fs.Path; 026import org.apache.hadoop.hbase.ServerName; 027import org.apache.hadoop.hbase.TableName; 028import org.apache.hadoop.hbase.client.RegionInfo; 029import org.apache.hadoop.hbase.client.TableDescriptor; 030import org.apache.hadoop.hbase.client.TableDescriptorBuilder; 031import org.apache.hadoop.hbase.errorhandling.ForeignException; 032import org.apache.hadoop.hbase.errorhandling.ForeignExceptionDispatcher; 033import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare; 034import org.apache.hadoop.hbase.executor.EventHandler; 035import org.apache.hadoop.hbase.executor.EventType; 036import org.apache.hadoop.hbase.master.MasterServices; 037import org.apache.hadoop.hbase.master.MetricsSnapshot; 038import org.apache.hadoop.hbase.master.SnapshotSentinel; 039import org.apache.hadoop.hbase.master.locking.LockManager; 040import org.apache.hadoop.hbase.master.locking.LockManager.MasterLock; 041import org.apache.hadoop.hbase.monitoring.MonitoredTask; 042import org.apache.hadoop.hbase.monitoring.TaskMonitor; 043import org.apache.hadoop.hbase.procedure2.LockType; 044import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils; 045import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils; 046import org.apache.hadoop.hbase.snapshot.SnapshotManifest; 047import org.apache.hadoop.hbase.util.CommonFSUtils; 048import org.apache.hadoop.hbase.util.Pair; 049import org.apache.yetus.audience.InterfaceAudience; 050import org.apache.zookeeper.KeeperException; 051import org.slf4j.Logger; 052import org.slf4j.LoggerFactory; 053 054import org.apache.hbase.thirdparty.com.google.common.base.Preconditions; 055 056import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; 057import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription; 058 059/** 060 * A handler for taking snapshots from the master. This is not a subclass of TableEventHandler 061 * because using that would incur an extra hbase:meta scan. The {@link #snapshotRegions(List)} call 062 * should get implemented for each snapshot flavor. 063 */ 064@InterfaceAudience.Private 065public abstract class TakeSnapshotHandler extends EventHandler 066 implements SnapshotSentinel, ForeignExceptionSnare { 067 private static final Logger LOG = LoggerFactory.getLogger(TakeSnapshotHandler.class); 068 069 private volatile boolean finished; 070 071 // none of these should ever be null 072 protected final MasterServices master; 073 protected final MetricsSnapshot metricsSnapshot = new MetricsSnapshot(); 074 protected final SnapshotDescription snapshot; 075 protected final Configuration conf; 076 protected final FileSystem rootFs; 077 protected final FileSystem workingDirFs; 078 protected final Path rootDir; 079 private final Path snapshotDir; 080 protected final Path workingDir; 081 private final MasterSnapshotVerifier verifier; 082 protected final ForeignExceptionDispatcher monitor; 083 private final LockManager.MasterLock tableLock; 084 protected final MonitoredTask status; 085 protected final TableName snapshotTable; 086 protected final SnapshotManifest snapshotManifest; 087 protected final SnapshotManager snapshotManager; 088 089 protected TableDescriptor htd; 090 091 /** 092 * @param snapshot descriptor of the snapshot to take 093 * @param masterServices master services provider 094 * @throws IllegalArgumentException if the working snapshot directory set from the configuration 095 * is the same as the completed snapshot directory 096 * @throws IOException if the file system of the working snapshot directory cannot be 097 * determined 098 */ 099 public TakeSnapshotHandler(SnapshotDescription snapshot, final MasterServices masterServices, 100 final SnapshotManager snapshotManager) throws IOException { 101 super(masterServices, EventType.C_M_SNAPSHOT_TABLE); 102 assert snapshot != null : "SnapshotDescription must not be nul1"; 103 assert masterServices != null : "MasterServices must not be nul1"; 104 this.master = masterServices; 105 this.conf = this.master.getConfiguration(); 106 this.rootDir = this.master.getMasterFileSystem().getRootDir(); 107 this.workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir, conf); 108 Preconditions.checkArgument( 109 !SnapshotDescriptionUtils.isSubDirectoryOf(workingDir, rootDir) 110 || SnapshotDescriptionUtils.isWithinDefaultWorkingDir(workingDir, conf), 111 "The working directory " + workingDir + " cannot be in the root directory unless it is " 112 + "within the default working directory"); 113 114 this.snapshot = snapshot; 115 this.snapshotManager = snapshotManager; 116 this.snapshotTable = TableName.valueOf(snapshot.getTable()); 117 this.rootFs = this.master.getMasterFileSystem().getFileSystem(); 118 this.snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshot, rootDir); 119 this.workingDirFs = this.workingDir.getFileSystem(this.conf); 120 this.monitor = new ForeignExceptionDispatcher(snapshot.getName()); 121 122 this.tableLock = master.getLockManager().createMasterLock(snapshotTable, LockType.EXCLUSIVE, 123 this.getClass().getName() + ": take snapshot " + snapshot.getName()); 124 125 // prepare the verify 126 this.verifier = new MasterSnapshotVerifier(masterServices, snapshot, workingDirFs); 127 // update the running tasks 128 this.status = TaskMonitor.get().createStatus( 129 "Taking " + snapshot.getType() + " snapshot on table: " + snapshotTable, false, true); 130 this.snapshotManifest = 131 SnapshotManifest.create(conf, rootFs, workingDir, snapshot, monitor, status); 132 } 133 134 private TableDescriptor loadTableDescriptor() throws IOException { 135 TableDescriptor htd = this.master.getTableDescriptors().get(snapshotTable); 136 if (htd == null) { 137 throw new IOException("TableDescriptor missing for " + snapshotTable); 138 } 139 if (htd.getMaxFileSize() == -1 && this.snapshot.getMaxFileSize() > 0) { 140 htd = TableDescriptorBuilder.newBuilder(htd).setValue(TableDescriptorBuilder.MAX_FILESIZE, 141 Long.toString(this.snapshot.getMaxFileSize())).build(); 142 } 143 return htd; 144 } 145 146 @Override 147 public TakeSnapshotHandler prepare() throws Exception { 148 super.prepare(); 149 // after this, you should ensure to release this lock in case of exceptions 150 this.tableLock.acquire(); 151 try { 152 this.htd = loadTableDescriptor(); // check that .tableinfo is present 153 } catch (Exception e) { 154 this.tableLock.release(); 155 throw e; 156 } 157 return this; 158 } 159 160 /** 161 * Execute the core common portions of taking a snapshot. The {@link #snapshotRegions(List)} call 162 * should get implemented for each snapshot flavor. 163 */ 164 @Override 165 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "REC_CATCH_EXCEPTION", 166 justification = "Intentional") 167 public void process() { 168 String msg = "Running " + snapshot.getType() + " table snapshot " + snapshot.getName() + " " 169 + eventType + " on table " + snapshotTable; 170 LOG.info(msg); 171 MasterLock tableLockToRelease = this.tableLock; 172 status.setStatus(msg); 173 try { 174 if (downgradeToSharedTableLock()) { 175 // release the exclusive lock and hold the shared lock instead 176 tableLockToRelease = master.getLockManager().createMasterLock(snapshotTable, 177 LockType.SHARED, this.getClass().getName() + ": take snapshot " + snapshot.getName()); 178 tableLock.release(); 179 tableLockToRelease.acquire(); 180 } 181 // If regions move after this meta scan, the region specific snapshot should fail, triggering 182 // an external exception that gets captured here. 183 184 // write down the snapshot info in the working directory 185 SnapshotDescriptionUtils.writeSnapshotInfo(snapshot, workingDir, workingDirFs); 186 snapshotManifest.addTableDescriptor(this.htd); 187 monitor.rethrowException(); 188 189 List<Pair<RegionInfo, ServerName>> regionsAndLocations = 190 master.getAssignmentManager().getTableRegionsAndLocations(snapshotTable, false); 191 192 // run the snapshot 193 snapshotRegions(regionsAndLocations); 194 monitor.rethrowException(); 195 196 // flush the in-memory state, and write the single manifest 197 status.setStatus("Consolidate snapshot: " + snapshot.getName()); 198 snapshotManifest.consolidate(); 199 200 // verify the snapshot is valid 201 status.setStatus("Verifying snapshot: " + snapshot.getName()); 202 verifier.verifySnapshot(workingDir, true); 203 204 // complete the snapshot, atomically moving from tmp to .snapshot dir. 205 SnapshotDescriptionUtils.completeSnapshot(this.snapshotDir, this.workingDir, this.rootFs, 206 this.workingDirFs, this.conf); 207 finished = true; 208 msg = "Snapshot " + snapshot.getName() + " of table " + snapshotTable + " completed"; 209 status.markComplete(msg); 210 LOG.info(msg); 211 metricsSnapshot.addSnapshot(status.getCompletionTimestamp() - status.getStartTime()); 212 if (master.getMasterCoprocessorHost() != null) { 213 master.getMasterCoprocessorHost() 214 .postCompletedSnapshotAction(ProtobufUtil.createSnapshotDesc(snapshot), this.htd); 215 } 216 } catch (Exception e) { // FindBugs: REC_CATCH_EXCEPTION 217 status.abort("Failed to complete snapshot " + snapshot.getName() + " on table " 218 + snapshotTable + " because " + e.getMessage()); 219 String reason = "Failed taking snapshot " + ClientSnapshotDescriptionUtils.toString(snapshot) 220 + " due to exception:" + e.getMessage(); 221 LOG.error(reason, e); 222 ForeignException ee = new ForeignException(reason, e); 223 monitor.receive(ee); 224 // need to mark this completed to close off and allow cleanup to happen. 225 cancel(reason); 226 } finally { 227 LOG.debug("Launching cleanup of working dir:" + workingDir); 228 try { 229 // if the working dir is still present, the snapshot has failed. it is present we delete 230 // it. 231 if (!workingDirFs.delete(workingDir, true)) { 232 LOG.error("Couldn't delete snapshot working directory:" + workingDir); 233 } 234 } catch (IOException e) { 235 LOG.error("Couldn't delete snapshot working directory:" + workingDir); 236 } 237 if (LOG.isDebugEnabled()) { 238 LOG.debug("Table snapshot journal : \n" + status.prettyPrintJournal()); 239 } 240 tableLockToRelease.release(); 241 } 242 } 243 244 /** 245 * When taking snapshot, first we must acquire the exclusive table lock to confirm that there are 246 * no ongoing merge/split procedures. But later, we should try our best to release the exclusive 247 * lock as this may hurt the availability, because we need to hold the shared lock when assigning 248 * regions. 249 * <p/> 250 * See HBASE-21480 for more details. 251 */ 252 protected abstract boolean downgradeToSharedTableLock(); 253 254 /** 255 * Snapshot the specified regions 256 */ 257 protected abstract void snapshotRegions(List<Pair<RegionInfo, ServerName>> regions) 258 throws IOException, KeeperException; 259 260 /** 261 * Take a snapshot of the specified disabled region 262 */ 263 protected void snapshotDisabledRegion(final RegionInfo regionInfo) throws IOException { 264 snapshotManifest.addRegion(CommonFSUtils.getTableDir(rootDir, snapshotTable), regionInfo); 265 monitor.rethrowException(); 266 status.setStatus("Completed referencing HFiles for offline region " + regionInfo.toString() 267 + " of table: " + snapshotTable); 268 } 269 270 @Override 271 public void cancel(String why) { 272 if (finished) return; 273 274 this.finished = true; 275 LOG.info("Stop taking snapshot=" + ClientSnapshotDescriptionUtils.toString(snapshot) 276 + " because: " + why); 277 CancellationException ce = new CancellationException(why); 278 monitor.receive(new ForeignException(master.getServerName().toString(), ce)); 279 } 280 281 @Override 282 public boolean isFinished() { 283 return finished; 284 } 285 286 @Override 287 public long getCompletionTimestamp() { 288 return this.status.getCompletionTimestamp(); 289 } 290 291 @Override 292 public SnapshotDescription getSnapshot() { 293 return snapshot; 294 } 295 296 @Override 297 public ForeignException getExceptionIfFailed() { 298 return monitor.getException(); 299 } 300 301 @Override 302 public void rethrowExceptionIfFailed() throws ForeignException { 303 monitor.rethrowException(); 304 } 305 306 @Override 307 public void rethrowException() throws ForeignException { 308 monitor.rethrowException(); 309 } 310 311 @Override 312 public boolean hasException() { 313 return monitor.hasException(); 314 } 315 316 @Override 317 public ForeignException getException() { 318 return monitor.getException(); 319 } 320}