001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master; 019 020import java.io.IOException; 021import java.util.ArrayList; 022import java.util.Collections; 023import java.util.HashSet; 024import java.util.List; 025import java.util.Set; 026import java.util.concurrent.locks.Lock; 027import java.util.concurrent.locks.ReentrantLock; 028import java.util.stream.Collectors; 029import java.util.stream.Stream; 030import org.apache.hadoop.conf.Configuration; 031import org.apache.hadoop.fs.FileStatus; 032import org.apache.hadoop.fs.FileSystem; 033import org.apache.hadoop.fs.Path; 034import org.apache.hadoop.fs.PathFilter; 035import org.apache.hadoop.hbase.HConstants; 036import org.apache.hadoop.hbase.ServerName; 037import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL; 038import org.apache.hadoop.hbase.util.CommonFSUtils; 039import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; 040import org.apache.hadoop.hbase.util.FSUtils; 041import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; 042import org.apache.hadoop.hbase.wal.WALSplitter; 043import org.apache.yetus.audience.InterfaceAudience; 044import org.slf4j.Logger; 045import org.slf4j.LoggerFactory; 046 047/** 048 * This class abstracts a bunch of operations the HMaster needs when splitting log files e.g. 049 * finding log files, dirs etc. 050 */ 051@InterfaceAudience.Private 052public class MasterWalManager { 053 private static final Logger LOG = LoggerFactory.getLogger(MasterWalManager.class); 054 055 /** 056 * Filter *in* WAL files that are for the hbase:meta Region. 057 */ 058 final static PathFilter META_FILTER = new PathFilter() { 059 @Override 060 public boolean accept(Path p) { 061 return AbstractFSWALProvider.isMetaFile(p); 062 } 063 }; 064 065 /** 066 * Filter *out* WAL files that are for the hbase:meta Region; i.e. return user-space WALs only. 067 */ 068 public final static PathFilter NON_META_FILTER = new PathFilter() { 069 @Override 070 public boolean accept(Path p) { 071 return !AbstractFSWALProvider.isMetaFile(p); 072 } 073 }; 074 075 // metrics for master 076 // TODO: Rename it, since those metrics are split-manager related 077 private final MetricsMasterFileSystem metricsMasterFilesystem = new MetricsMasterFileSystem(); 078 079 // Keep around for convenience. 080 private final MasterServices services; 081 private final Configuration conf; 082 private final FileSystem fs; 083 084 // The Path to the old logs dir 085 private final Path oldLogDir; 086 087 private final Path rootDir; 088 089 // create the split log lock 090 private final Lock splitLogLock = new ReentrantLock(); 091 092 // old WALs directory size in bytes 093 private long oldWALsDirSize; 094 095 /** 096 * Superceded by {@link SplitWALManager}; i.e. procedure-based WAL splitting rather than 'classic' 097 * zk-coordinated WAL splitting. 098 * @deprecated since 2.3.0 and 3.0.0 to be removed in 4.0.0; replaced by {@link SplitWALManager}. 099 * @see SplitWALManager 100 */ 101 @Deprecated 102 private final SplitLogManager splitLogManager; 103 104 // Is the fileystem ok? 105 private volatile boolean fsOk = true; 106 107 public MasterWalManager(MasterServices services) throws IOException { 108 this(services.getConfiguration(), services.getMasterFileSystem().getWALFileSystem(), 109 services.getMasterFileSystem().getWALRootDir(), services); 110 } 111 112 public MasterWalManager(Configuration conf, FileSystem fs, Path rootDir, MasterServices services) 113 throws IOException { 114 this.fs = fs; 115 this.conf = conf; 116 this.rootDir = rootDir; 117 this.services = services; 118 this.splitLogManager = new SplitLogManager(services, conf); 119 this.oldLogDir = new Path(rootDir, HConstants.HREGION_OLDLOGDIR_NAME); 120 this.oldWALsDirSize = 0; 121 } 122 123 public void stop() { 124 if (splitLogManager != null) { 125 splitLogManager.stop(); 126 } 127 } 128 129 SplitLogManager getSplitLogManager() { 130 return this.splitLogManager; 131 } 132 133 /** 134 * Get the directory where old logs go 135 * @return the dir 136 */ 137 Path getOldLogDir() { 138 return this.oldLogDir; 139 } 140 141 public void updateOldWALsDirSize() throws IOException { 142 this.oldWALsDirSize = fs.getContentSummary(this.oldLogDir).getLength(); 143 } 144 145 public long getOldWALsDirSize() { 146 return this.oldWALsDirSize; 147 } 148 149 public FileSystem getFileSystem() { 150 return this.fs; 151 } 152 153 /** 154 * Checks to see if the file system is still accessible. If not, sets closed 155 * @return false if file system is not available 156 */ 157 private boolean checkFileSystem() { 158 if (this.fsOk) { 159 try { 160 FSUtils.checkFileSystemAvailable(this.fs); 161 FSUtils.checkDfsSafeMode(this.conf); 162 } catch (IOException e) { 163 services.abort("Shutting down HBase cluster: file system not available", e); 164 this.fsOk = false; 165 } 166 } 167 return this.fsOk; 168 } 169 170 /** 171 * Get Servernames which are currently splitting; paths have a '-splitting' suffix. 172 */ 173 public Set<ServerName> getSplittingServersFromWALDir() throws IOException { 174 return getServerNamesFromWALDirPath( 175 p -> p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT)); 176 } 177 178 /** 179 * Get Servernames that COULD BE 'alive'; excludes those that have a '-splitting' suffix as these 180 * are already being split -- they cannot be 'alive'. 181 */ 182 public Set<ServerName> getLiveServersFromWALDir() throws IOException { 183 return getServerNamesFromWALDirPath( 184 p -> !p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT)); 185 } 186 187 /** Returns listing of ServerNames found by parsing WAL directory paths in FS. */ 188 public Set<ServerName> getServerNamesFromWALDirPath(final PathFilter filter) throws IOException { 189 FileStatus[] walDirForServerNames = getWALDirPaths(filter); 190 return Stream.of(walDirForServerNames).map(s -> { 191 ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(s.getPath()); 192 if (serverName == null) { 193 LOG.warn("Log folder {} doesn't look like its name includes a " 194 + "region server name; leaving in place. If you see later errors about missing " 195 + "write ahead logs they may be saved in this location.", s.getPath()); 196 return null; 197 } 198 return serverName; 199 }).filter(s -> s != null).collect(Collectors.toSet()); 200 } 201 202 /** 203 * Returns List of all RegionServer WAL dirs; i.e. this.rootDir/HConstants.HREGION_LOGDIR_NAME. 204 */ 205 public FileStatus[] getWALDirPaths(final PathFilter filter) throws IOException { 206 Path walDirPath = new Path(CommonFSUtils.getWALRootDir(conf), HConstants.HREGION_LOGDIR_NAME); 207 FileStatus[] walDirForServerNames = CommonFSUtils.listStatus(fs, walDirPath, filter); 208 return walDirForServerNames == null ? new FileStatus[0] : walDirForServerNames; 209 } 210 211 /** 212 * Inspect the log directory to find dead servers which need recovery work 213 * @return A set of ServerNames which aren't running but still have WAL files left in file system 214 * @deprecated With proc-v2, we can record the crash server with procedure store, so do not need 215 * to scan the wal directory to find out the splitting wal directory any more. Leave 216 * it here only because {@code RecoverMetaProcedure}(which is also deprecated) uses 217 * it. 218 */ 219 @Deprecated 220 public Set<ServerName> getFailedServersFromLogFolders() throws IOException { 221 boolean retrySplitting = 222 !conf.getBoolean(WALSplitter.SPLIT_SKIP_ERRORS_KEY, WALSplitter.SPLIT_SKIP_ERRORS_DEFAULT); 223 224 Set<ServerName> serverNames = new HashSet<>(); 225 Path logsDirPath = new Path(CommonFSUtils.getWALRootDir(conf), HConstants.HREGION_LOGDIR_NAME); 226 227 do { 228 if (services.isStopped()) { 229 LOG.warn("Master stopped while trying to get failed servers."); 230 break; 231 } 232 try { 233 if (!this.fs.exists(logsDirPath)) return serverNames; 234 FileStatus[] logFolders = CommonFSUtils.listStatus(this.fs, logsDirPath, null); 235 // Get online servers after getting log folders to avoid log folder deletion of newly 236 // checked in region servers . see HBASE-5916 237 Set<ServerName> onlineServers = services.getServerManager().getOnlineServers().keySet(); 238 239 if (logFolders == null || logFolders.length == 0) { 240 LOG.debug("No log files to split, proceeding..."); 241 return serverNames; 242 } 243 for (FileStatus status : logFolders) { 244 FileStatus[] curLogFiles = CommonFSUtils.listStatus(this.fs, status.getPath(), null); 245 if (curLogFiles == null || curLogFiles.length == 0) { 246 // Empty log folder. No recovery needed 247 continue; 248 } 249 final ServerName serverName = 250 AbstractFSWALProvider.getServerNameFromWALDirectoryName(status.getPath()); 251 if (null == serverName) { 252 LOG.warn("Log folder " + status.getPath() + " doesn't look like its name includes a " 253 + "region server name; leaving in place. If you see later errors about missing " 254 + "write ahead logs they may be saved in this location."); 255 } else if (!onlineServers.contains(serverName)) { 256 LOG.info("Log folder " + status.getPath() + " doesn't belong " 257 + "to a known region server, splitting"); 258 serverNames.add(serverName); 259 } else { 260 LOG.info("Log folder " + status.getPath() + " belongs to an existing region server"); 261 } 262 } 263 retrySplitting = false; 264 } catch (IOException ioe) { 265 LOG.warn("Failed getting failed servers to be recovered.", ioe); 266 if (!checkFileSystem()) { 267 LOG.warn("Bad Filesystem, exiting"); 268 Runtime.getRuntime().halt(1); 269 } 270 try { 271 if (retrySplitting) { 272 Thread.sleep(conf.getInt("hbase.hlog.split.failure.retry.interval", 30 * 1000)); 273 } 274 } catch (InterruptedException e) { 275 LOG.warn("Interrupted, aborting since cannot return w/o splitting"); 276 Thread.currentThread().interrupt(); 277 retrySplitting = false; 278 Runtime.getRuntime().halt(1); 279 } 280 } 281 } while (retrySplitting); 282 283 return serverNames; 284 } 285 286 public void splitLog(final ServerName serverName) throws IOException { 287 splitLog(Collections.<ServerName> singleton(serverName)); 288 } 289 290 /** 291 * Specialized method to handle the splitting for meta WAL 292 * @param serverName logs belonging to this server will be split 293 */ 294 public void splitMetaLog(final ServerName serverName) throws IOException { 295 splitMetaLog(Collections.<ServerName> singleton(serverName)); 296 } 297 298 /** 299 * Specialized method to handle the splitting for meta WAL 300 * @param serverNames logs belonging to these servers will be split 301 */ 302 public void splitMetaLog(final Set<ServerName> serverNames) throws IOException { 303 splitLog(serverNames, META_FILTER); 304 } 305 306 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "UL_UNRELEASED_LOCK", 307 justification = "We only release this lock when we set it. Updates to code " 308 + "that uses it should verify use of the guard boolean.") 309 List<Path> getLogDirs(final Set<ServerName> serverNames) throws IOException { 310 List<Path> logDirs = new ArrayList<>(); 311 boolean needReleaseLock = false; 312 if (!this.services.isInitialized()) { 313 // during master initialization, we could have multiple places splitting a same wal 314 // XXX: Does this still exist after we move to proc-v2? 315 this.splitLogLock.lock(); 316 needReleaseLock = true; 317 } 318 try { 319 for (ServerName serverName : serverNames) { 320 Path logDir = 321 new Path(this.rootDir, AbstractFSWALProvider.getWALDirectoryName(serverName.toString())); 322 Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT); 323 // Rename the directory so a rogue RS doesn't create more WALs 324 if (fs.exists(logDir)) { 325 if (!this.fs.rename(logDir, splitDir)) { 326 throw new IOException("Failed fs.rename for log split: " + logDir); 327 } 328 logDir = splitDir; 329 LOG.debug("Renamed region directory: " + splitDir); 330 } else if (!fs.exists(splitDir)) { 331 LOG.info("Log dir for server " + serverName + " does not exist"); 332 continue; 333 } 334 logDirs.add(splitDir); 335 } 336 } catch (IOException ioe) { 337 if (!checkFileSystem()) { 338 this.services.abort("Aborting due to filesystem unavailable", ioe); 339 throw ioe; 340 } 341 } finally { 342 if (needReleaseLock) { 343 this.splitLogLock.unlock(); 344 } 345 } 346 return logDirs; 347 } 348 349 public void splitLog(final Set<ServerName> serverNames) throws IOException { 350 splitLog(serverNames, NON_META_FILTER); 351 } 352 353 /** 354 * This method is the base split method that splits WAL files matching a filter. Callers should 355 * pass the appropriate filter for meta and non-meta WALs. 356 * @param serverNames logs belonging to these servers will be split; this will rename the log 357 * directory out from under a soft-failed server 358 */ 359 public void splitLog(final Set<ServerName> serverNames, PathFilter filter) throws IOException { 360 long splitTime = 0, splitLogSize = 0; 361 List<Path> logDirs = getLogDirs(serverNames); 362 363 splitLogManager.handleDeadWorkers(serverNames); 364 splitTime = EnvironmentEdgeManager.currentTime(); 365 splitLogSize = splitLogManager.splitLogDistributed(serverNames, logDirs, filter); 366 splitTime = EnvironmentEdgeManager.currentTime() - splitTime; 367 368 if (this.metricsMasterFilesystem != null) { 369 if (filter == META_FILTER) { 370 this.metricsMasterFilesystem.addMetaWALSplit(splitTime, splitLogSize); 371 } else { 372 this.metricsMasterFilesystem.addSplit(splitTime, splitLogSize); 373 } 374 } 375 } 376 377 /** 378 * The hbase:meta region may OPEN and CLOSE without issue on a server and then move elsewhere. On 379 * CLOSE, the WAL for the hbase:meta table may not be archived yet (The WAL is only needed if 380 * hbase:meta did not close cleanaly). Since meta region is no long on this server, the 381 * ServerCrashProcedure won't split these leftover hbase:meta WALs, just leaving them in the WAL 382 * splitting dir. If we try to delete the WAL splitting for the server, it fail since the dir is 383 * not totally empty. We can safely archive these hbase:meta log; then the WAL dir can be deleted. 384 * @param serverName the server to archive meta log 385 */ 386 public void archiveMetaLog(final ServerName serverName) { 387 try { 388 Path logDir = 389 new Path(this.rootDir, AbstractFSWALProvider.getWALDirectoryName(serverName.toString())); 390 Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT); 391 if (fs.exists(splitDir)) { 392 FileStatus[] logfiles = CommonFSUtils.listStatus(fs, splitDir, META_FILTER); 393 if (logfiles != null) { 394 for (FileStatus status : logfiles) { 395 if (!status.isDir()) { 396 Path newPath = AbstractFSWAL.getWALArchivePath(this.oldLogDir, status.getPath()); 397 if (!CommonFSUtils.renameAndSetModifyTime(fs, status.getPath(), newPath)) { 398 LOG.warn("Unable to move " + status.getPath() + " to " + newPath); 399 } else { 400 LOG.debug("Archived meta log " + status.getPath() + " to " + newPath); 401 } 402 } 403 } 404 } 405 if (!fs.delete(splitDir, false)) { 406 LOG.warn("Unable to delete log dir. Ignoring. " + splitDir); 407 } 408 } 409 } catch (IOException ie) { 410 LOG.warn("Failed archiving meta log for server " + serverName, ie); 411 } 412 } 413}