001/** 002 * 003 * Licensed to the Apache Software Foundation (ASF) under one 004 * or more contributor license agreements. See the NOTICE file 005 * distributed with this work for additional information 006 * regarding copyright ownership. The ASF licenses this file 007 * to you under the Apache License, Version 2.0 (the 008 * "License"); you may not use this file except in compliance 009 * with the License. You may obtain a copy of the License at 010 * 011 * http://www.apache.org/licenses/LICENSE-2.0 012 * 013 * Unless required by applicable law or agreed to in writing, software 014 * distributed under the License is distributed on an "AS IS" BASIS, 015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 016 * See the License for the specific language governing permissions and 017 * limitations under the License. 018 */ 019package org.apache.hadoop.hbase.master; 020 021import java.io.IOException; 022import java.util.ArrayList; 023import java.util.Collections; 024import java.util.HashSet; 025import java.util.List; 026import java.util.Set; 027import java.util.concurrent.locks.Lock; 028import java.util.concurrent.locks.ReentrantLock; 029import java.util.stream.Collectors; 030import java.util.stream.Stream; 031import org.apache.hadoop.conf.Configuration; 032import org.apache.hadoop.fs.FileStatus; 033import org.apache.hadoop.fs.FileSystem; 034import org.apache.hadoop.fs.Path; 035import org.apache.hadoop.fs.PathFilter; 036import org.apache.hadoop.hbase.HConstants; 037import org.apache.hadoop.hbase.ServerName; 038import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL; 039import org.apache.hadoop.hbase.util.CommonFSUtils; 040import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; 041import org.apache.hadoop.hbase.util.FSUtils; 042import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; 043import org.apache.hadoop.hbase.wal.WALSplitter; 044import org.apache.yetus.audience.InterfaceAudience; 045import org.slf4j.Logger; 046import org.slf4j.LoggerFactory; 047 048import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting; 049 050/** 051 * This class abstracts a bunch of operations the HMaster needs 052 * when splitting log files e.g. finding log files, dirs etc. 053 */ 054@InterfaceAudience.Private 055public class MasterWalManager { 056 private static final Logger LOG = LoggerFactory.getLogger(MasterWalManager.class); 057 058 final static PathFilter META_FILTER = new PathFilter() { 059 @Override 060 public boolean accept(Path p) { 061 return AbstractFSWALProvider.isMetaFile(p); 062 } 063 }; 064 065 @VisibleForTesting 066 public final static PathFilter NON_META_FILTER = new PathFilter() { 067 @Override 068 public boolean accept(Path p) { 069 return !AbstractFSWALProvider.isMetaFile(p); 070 } 071 }; 072 073 // metrics for master 074 // TODO: Rename it, since those metrics are split-manager related 075 private final MetricsMasterFileSystem metricsMasterFilesystem = new MetricsMasterFileSystem(); 076 077 // Keep around for convenience. 078 private final MasterServices services; 079 private final Configuration conf; 080 private final FileSystem fs; 081 082 // The Path to the old logs dir 083 private final Path oldLogDir; 084 private final Path rootDir; 085 086 // create the split log lock 087 private final Lock splitLogLock = new ReentrantLock(); 088 private final SplitLogManager splitLogManager; 089 090 // Is the fileystem ok? 091 private volatile boolean fsOk = true; 092 093 public MasterWalManager(MasterServices services) throws IOException { 094 this(services.getConfiguration(), services.getMasterFileSystem().getWALFileSystem(), services); 095 } 096 097 public MasterWalManager(Configuration conf, FileSystem fs, MasterServices services) 098 throws IOException { 099 this.fs = fs; 100 this.conf = conf; 101 this.rootDir = CommonFSUtils.getWALRootDir(conf); 102 this.services = services; 103 this.splitLogManager = new SplitLogManager(services, conf); 104 105 this.oldLogDir = new Path(rootDir, HConstants.HREGION_OLDLOGDIR_NAME); 106 } 107 108 public void stop() { 109 if (splitLogManager != null) { 110 splitLogManager.stop(); 111 } 112 } 113 114 @VisibleForTesting 115 SplitLogManager getSplitLogManager() { 116 return this.splitLogManager; 117 } 118 119 /** 120 * Get the directory where old logs go 121 * @return the dir 122 */ 123 Path getOldLogDir() { 124 return this.oldLogDir; 125 } 126 127 public FileSystem getFileSystem() { 128 return this.fs; 129 } 130 131 /** 132 * Checks to see if the file system is still accessible. 133 * If not, sets closed 134 * @return false if file system is not available 135 */ 136 private boolean checkFileSystem() { 137 if (this.fsOk) { 138 try { 139 FSUtils.checkFileSystemAvailable(this.fs); 140 FSUtils.checkDfsSafeMode(this.conf); 141 } catch (IOException e) { 142 services.abort("Shutting down HBase cluster: file system not available", e); 143 this.fsOk = false; 144 } 145 } 146 return this.fsOk; 147 } 148 149 /** 150 * Get Servernames which are currently splitting; paths have a '-splitting' suffix. 151 * @return ServerName 152 * @throws IOException IOException 153 */ 154 public Set<ServerName> getSplittingServersFromWALDir() throws IOException { 155 return getServerNamesFromWALDirPath( 156 p -> p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT)); 157 } 158 159 /** 160 * Get Servernames that COULD BE 'alive'; excludes those that have a '-splitting' suffix as these 161 * are already being split -- they cannot be 'alive'. 162 * @return ServerName 163 * @throws IOException IOException 164 */ 165 public Set<ServerName> getLiveServersFromWALDir() throws IOException { 166 return getServerNamesFromWALDirPath( 167 p -> !p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT)); 168 } 169 170 /** 171 * @return listing of ServerNames found by parsing WAL directory paths in FS. 172 */ 173 public Set<ServerName> getServerNamesFromWALDirPath(final PathFilter filter) throws IOException { 174 FileStatus[] walDirForServerNames = getWALDirPaths(filter); 175 return Stream.of(walDirForServerNames).map(s -> { 176 ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(s.getPath()); 177 if (serverName == null) { 178 LOG.warn("Log folder {} doesn't look like its name includes a " + 179 "region server name; leaving in place. If you see later errors about missing " + 180 "write ahead logs they may be saved in this location.", s.getPath()); 181 return null; 182 } 183 return serverName; 184 }).filter(s -> s != null).collect(Collectors.toSet()); 185 } 186 187 /** 188 * @return List of all RegionServer WAL dirs; i.e. this.rootDir/HConstants.HREGION_LOGDIR_NAME. 189 */ 190 public FileStatus[] getWALDirPaths(final PathFilter filter) throws IOException { 191 Path walDirPath = new Path(CommonFSUtils.getWALRootDir(conf), HConstants.HREGION_LOGDIR_NAME); 192 FileStatus[] walDirForServerNames = CommonFSUtils.listStatus(fs, walDirPath, filter); 193 return walDirForServerNames == null? new FileStatus[0]: walDirForServerNames; 194 } 195 196 /** 197 * Inspect the log directory to find dead servers which need recovery work 198 * @return A set of ServerNames which aren't running but still have WAL files left in file system 199 * @deprecated With proc-v2, we can record the crash server with procedure store, so do not need 200 * to scan the wal directory to find out the splitting wal directory any more. Leave 201 * it here only because {@code RecoverMetaProcedure}(which is also deprecated) uses 202 * it. 203 */ 204 @Deprecated 205 public Set<ServerName> getFailedServersFromLogFolders() throws IOException { 206 boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors", 207 WALSplitter.SPLIT_SKIP_ERRORS_DEFAULT); 208 209 Set<ServerName> serverNames = new HashSet<>(); 210 Path logsDirPath = new Path(this.rootDir, HConstants.HREGION_LOGDIR_NAME); 211 212 do { 213 if (services.isStopped()) { 214 LOG.warn("Master stopped while trying to get failed servers."); 215 break; 216 } 217 try { 218 if (!this.fs.exists(logsDirPath)) return serverNames; 219 FileStatus[] logFolders = CommonFSUtils.listStatus(this.fs, logsDirPath, null); 220 // Get online servers after getting log folders to avoid log folder deletion of newly 221 // checked in region servers . see HBASE-5916 222 Set<ServerName> onlineServers = services.getServerManager().getOnlineServers().keySet(); 223 224 if (logFolders == null || logFolders.length == 0) { 225 LOG.debug("No log files to split, proceeding..."); 226 return serverNames; 227 } 228 for (FileStatus status : logFolders) { 229 FileStatus[] curLogFiles = CommonFSUtils.listStatus(this.fs, status.getPath(), null); 230 if (curLogFiles == null || curLogFiles.length == 0) { 231 // Empty log folder. No recovery needed 232 continue; 233 } 234 final ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName( 235 status.getPath()); 236 if (null == serverName) { 237 LOG.warn("Log folder " + status.getPath() + " doesn't look like its name includes a " + 238 "region server name; leaving in place. If you see later errors about missing " + 239 "write ahead logs they may be saved in this location."); 240 } else if (!onlineServers.contains(serverName)) { 241 LOG.info("Log folder " + status.getPath() + " doesn't belong " 242 + "to a known region server, splitting"); 243 serverNames.add(serverName); 244 } else { 245 LOG.info("Log folder " + status.getPath() + " belongs to an existing region server"); 246 } 247 } 248 retrySplitting = false; 249 } catch (IOException ioe) { 250 LOG.warn("Failed getting failed servers to be recovered.", ioe); 251 if (!checkFileSystem()) { 252 LOG.warn("Bad Filesystem, exiting"); 253 Runtime.getRuntime().halt(1); 254 } 255 try { 256 if (retrySplitting) { 257 Thread.sleep(conf.getInt("hbase.hlog.split.failure.retry.interval", 30 * 1000)); 258 } 259 } catch (InterruptedException e) { 260 LOG.warn("Interrupted, aborting since cannot return w/o splitting"); 261 Thread.currentThread().interrupt(); 262 retrySplitting = false; 263 Runtime.getRuntime().halt(1); 264 } 265 } 266 } while (retrySplitting); 267 268 return serverNames; 269 } 270 271 public void splitLog(final ServerName serverName) throws IOException { 272 splitLog(Collections.<ServerName>singleton(serverName)); 273 } 274 275 /** 276 * Specialized method to handle the splitting for meta WAL 277 * @param serverName logs belonging to this server will be split 278 */ 279 public void splitMetaLog(final ServerName serverName) throws IOException { 280 splitMetaLog(Collections.<ServerName>singleton(serverName)); 281 } 282 283 /** 284 * Specialized method to handle the splitting for meta WAL 285 * @param serverNames logs belonging to these servers will be split 286 */ 287 public void splitMetaLog(final Set<ServerName> serverNames) throws IOException { 288 splitLog(serverNames, META_FILTER); 289 } 290 291 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UL_UNRELEASED_LOCK", justification= 292 "We only release this lock when we set it. Updates to code that uses it should verify use " + 293 "of the guard boolean.") 294 List<Path> getLogDirs(final Set<ServerName> serverNames) throws IOException { 295 List<Path> logDirs = new ArrayList<>(); 296 boolean needReleaseLock = false; 297 if (!this.services.isInitialized()) { 298 // during master initialization, we could have multiple places splitting a same wal 299 // XXX: Does this still exist after we move to proc-v2? 300 this.splitLogLock.lock(); 301 needReleaseLock = true; 302 } 303 try { 304 for (ServerName serverName : serverNames) { 305 Path logDir = new Path(this.rootDir, 306 AbstractFSWALProvider.getWALDirectoryName(serverName.toString())); 307 Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT); 308 // Rename the directory so a rogue RS doesn't create more WALs 309 if (fs.exists(logDir)) { 310 if (!this.fs.rename(logDir, splitDir)) { 311 throw new IOException("Failed fs.rename for log split: " + logDir); 312 } 313 logDir = splitDir; 314 LOG.debug("Renamed region directory: " + splitDir); 315 } else if (!fs.exists(splitDir)) { 316 LOG.info("Log dir for server " + serverName + " does not exist"); 317 continue; 318 } 319 logDirs.add(splitDir); 320 } 321 } catch (IOException ioe) { 322 if (!checkFileSystem()) { 323 this.services.abort("Aborting due to filesystem unavailable", ioe); 324 throw ioe; 325 } 326 } finally { 327 if (needReleaseLock) { 328 this.splitLogLock.unlock(); 329 } 330 } 331 return logDirs; 332 } 333 334 public void splitLog(final Set<ServerName> serverNames) throws IOException { 335 splitLog(serverNames, NON_META_FILTER); 336 } 337 338 /** 339 * This method is the base split method that splits WAL files matching a filter. Callers should 340 * pass the appropriate filter for meta and non-meta WALs. 341 * @param serverNames logs belonging to these servers will be split; this will rename the log 342 * directory out from under a soft-failed server 343 */ 344 public void splitLog(final Set<ServerName> serverNames, PathFilter filter) throws IOException { 345 long splitTime = 0, splitLogSize = 0; 346 List<Path> logDirs = getLogDirs(serverNames); 347 348 splitLogManager.handleDeadWorkers(serverNames); 349 splitTime = EnvironmentEdgeManager.currentTime(); 350 splitLogSize = splitLogManager.splitLogDistributed(serverNames, logDirs, filter); 351 splitTime = EnvironmentEdgeManager.currentTime() - splitTime; 352 353 if (this.metricsMasterFilesystem != null) { 354 if (filter == META_FILTER) { 355 this.metricsMasterFilesystem.addMetaWALSplit(splitTime, splitLogSize); 356 } else { 357 this.metricsMasterFilesystem.addSplit(splitTime, splitLogSize); 358 } 359 } 360 } 361 362 /** 363 * For meta region open and closed normally on a server, it may leave some meta 364 * WAL in the server's wal dir. Since meta region is no long on this server, 365 * The SCP won't split those meta wals, just leaving them there. So deleting 366 * the wal dir will fail since the dir is not empty. Actually We can safely achive those 367 * meta log and Archiving the meta log and delete the dir. 368 * @param serverName the server to archive meta log 369 */ 370 public void archiveMetaLog(final ServerName serverName) { 371 try { 372 Path logDir = new Path(this.rootDir, 373 AbstractFSWALProvider.getWALDirectoryName(serverName.toString())); 374 Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT); 375 if (fs.exists(splitDir)) { 376 FileStatus[] logfiles = CommonFSUtils.listStatus(fs, splitDir, META_FILTER); 377 if (logfiles != null) { 378 for (FileStatus status : logfiles) { 379 if (!status.isDir()) { 380 Path newPath = AbstractFSWAL.getWALArchivePath(this.oldLogDir, 381 status.getPath()); 382 if (!CommonFSUtils.renameAndSetModifyTime(fs, status.getPath(), newPath)) { 383 LOG.warn("Unable to move " + status.getPath() + " to " + newPath); 384 } else { 385 LOG.debug("Archived meta log " + status.getPath() + " to " + newPath); 386 } 387 } 388 } 389 } 390 if (!fs.delete(splitDir, false)) { 391 LOG.warn("Unable to delete log dir. Ignoring. " + splitDir); 392 } 393 } 394 } catch (IOException ie) { 395 LOG.warn("Failed archiving meta log for server " + serverName, ie); 396 } 397 } 398 399 400}