001/** 002 * 003 * Licensed to the Apache Software Foundation (ASF) under one 004 * or more contributor license agreements. See the NOTICE file 005 * distributed with this work for additional information 006 * regarding copyright ownership. The ASF licenses this file 007 * to you under the Apache License, Version 2.0 (the 008 * "License"); you may not use this file except in compliance 009 * with the License. You may obtain a copy of the License at 010 * 011 * http://www.apache.org/licenses/LICENSE-2.0 012 * 013 * Unless required by applicable law or agreed to in writing, software 014 * distributed under the License is distributed on an "AS IS" BASIS, 015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 016 * See the License for the specific language governing permissions and 017 * limitations under the License. 018 */ 019package org.apache.hadoop.hbase.master; 020 021import java.io.IOException; 022import java.util.ArrayList; 023import java.util.Collections; 024import java.util.HashSet; 025import java.util.List; 026import java.util.Set; 027import java.util.concurrent.locks.Lock; 028import java.util.concurrent.locks.ReentrantLock; 029import java.util.stream.Collectors; 030import java.util.stream.Stream; 031import org.apache.hadoop.conf.Configuration; 032import org.apache.hadoop.fs.FileStatus; 033import org.apache.hadoop.fs.FileSystem; 034import org.apache.hadoop.fs.Path; 035import org.apache.hadoop.fs.PathFilter; 036import org.apache.hadoop.hbase.HConstants; 037import org.apache.hadoop.hbase.ServerName; 038import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL; 039import org.apache.hadoop.hbase.util.CommonFSUtils; 040import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; 041import org.apache.hadoop.hbase.util.FSUtils; 042import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; 043import org.apache.hadoop.hbase.wal.WALSplitter; 044import org.apache.yetus.audience.InterfaceAudience; 045import org.slf4j.Logger; 046import org.slf4j.LoggerFactory; 047import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting; 048 049/** 050 * This class abstracts a bunch of operations the HMaster needs 051 * when splitting log files e.g. finding log files, dirs etc. 052 */ 053@InterfaceAudience.Private 054public class MasterWalManager { 055 private static final Logger LOG = LoggerFactory.getLogger(MasterWalManager.class); 056 057 final static PathFilter META_FILTER = new PathFilter() { 058 @Override 059 public boolean accept(Path p) { 060 return AbstractFSWALProvider.isMetaFile(p); 061 } 062 }; 063 064 final static PathFilter NON_META_FILTER = new PathFilter() { 065 @Override 066 public boolean accept(Path p) { 067 return !AbstractFSWALProvider.isMetaFile(p); 068 } 069 }; 070 071 // metrics for master 072 // TODO: Rename it, since those metrics are split-manager related 073 private final MetricsMasterFileSystem metricsMasterFilesystem = new MetricsMasterFileSystem(); 074 075 // Keep around for convenience. 076 private final MasterServices services; 077 private final Configuration conf; 078 private final FileSystem fs; 079 080 // The Path to the old logs dir 081 private final Path oldLogDir; 082 083 /** 084 * This is the hbase rootdir. 085 * We'll put the WALs under this dir. 086 */ 087 private final Path rootDir; 088 089 // create the split log lock 090 private final Lock splitLogLock = new ReentrantLock(); 091 private final SplitLogManager splitLogManager; 092 093 // Is the fileystem ok? 094 private volatile boolean fsOk = true; 095 096 public MasterWalManager(MasterServices services) throws IOException { 097 this(services.getConfiguration(), services.getMasterFileSystem().getWALFileSystem(), services); 098 } 099 100 public MasterWalManager(Configuration conf, FileSystem fs, MasterServices services) 101 throws IOException { 102 this.fs = fs; 103 this.conf = conf; 104 this.rootDir = CommonFSUtils.getWALRootDir(conf); 105 this.services = services; 106 this.splitLogManager = new SplitLogManager(services, conf); 107 108 this.oldLogDir = new Path(rootDir, HConstants.HREGION_OLDLOGDIR_NAME); 109 } 110 111 public void stop() { 112 if (splitLogManager != null) { 113 splitLogManager.stop(); 114 } 115 } 116 117 @VisibleForTesting 118 SplitLogManager getSplitLogManager() { 119 return this.splitLogManager; 120 } 121 122 /** 123 * Get the directory where old logs go 124 * @return the dir 125 */ 126 Path getOldLogDir() { 127 return this.oldLogDir; 128 } 129 130 public FileSystem getFileSystem() { 131 return this.fs; 132 } 133 134 /** 135 * Checks to see if the file system is still accessible. 136 * If not, sets closed 137 * @return false if file system is not available 138 */ 139 private boolean checkFileSystem() { 140 if (this.fsOk) { 141 try { 142 FSUtils.checkFileSystemAvailable(this.fs); 143 FSUtils.checkDfsSafeMode(this.conf); 144 } catch (IOException e) { 145 services.abort("Shutting down HBase cluster: file system not available", e); 146 this.fsOk = false; 147 } 148 } 149 return this.fsOk; 150 } 151 152 /** 153 * Get Servernames which are currently splitting; paths have a '-splitting' suffix. 154 * @return ServerName 155 * @throws IOException IOException 156 */ 157 public Set<ServerName> getSplittingServersFromWALDir() throws IOException { 158 return getServerNamesFromWALDirPath( 159 p -> p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT)); 160 } 161 162 /** 163 * Get Servernames that COULD BE 'alive'; excludes those that have a '-splitting' suffix as these 164 * are already being split -- they cannot be 'alive'. 165 * @return ServerName 166 * @throws IOException IOException 167 */ 168 public Set<ServerName> getLiveServersFromWALDir() throws IOException { 169 return getServerNamesFromWALDirPath( 170 p -> !p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT)); 171 } 172 173 /** 174 * @return listing of ServerNames found by parsing WAL directory paths in FS. 175 * 176 */ 177 public Set<ServerName> getServerNamesFromWALDirPath(final PathFilter filter) throws IOException { 178 FileStatus[] walDirForServerNames = getWALDirPaths(filter); 179 return Stream.of(walDirForServerNames).map(s -> { 180 ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(s.getPath()); 181 if (serverName == null) { 182 LOG.warn("Log folder {} doesn't look like its name includes a " + 183 "region server name; leaving in place. If you see later errors about missing " + 184 "write ahead logs they may be saved in this location.", s.getPath()); 185 return null; 186 } 187 return serverName; 188 }).filter(s -> s != null).collect(Collectors.toSet()); 189 } 190 191 /** 192 * @return Returns the WALs dir under <code>rootDir</code> 193 * @throws IOException 194 */ 195 Path getWALDirPath() throws IOException { 196 return new Path(CommonFSUtils.getWALRootDir(conf), HConstants.HREGION_LOGDIR_NAME); 197 } 198 199 /** 200 * @return List of all RegionServer WAL dirs; i.e. this.rootDir/HConstants.HREGION_LOGDIR_NAME. 201 */ 202 public FileStatus[] getWALDirPaths(final PathFilter filter) throws IOException { 203 Path walDirPath = getWALDirPath(); 204 FileStatus[] walDirForServerNames = FSUtils.listStatus(fs, walDirPath, filter); 205 return walDirForServerNames == null? new FileStatus[0]: walDirForServerNames; 206 } 207 208 /** 209 * Inspect the log directory to find dead servers which need recovery work 210 * @return A set of ServerNames which aren't running but still have WAL files left in file system 211 * @deprecated With proc-v2, we can record the crash server with procedure store, so do not need 212 * to scan the wal directory to find out the splitting wal directory any more. Leave 213 * it here only because {@code RecoverMetaProcedure}(which is also deprecated) uses 214 * it. 215 */ 216 @Deprecated 217 public Set<ServerName> getFailedServersFromLogFolders() throws IOException { 218 boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors", 219 WALSplitter.SPLIT_SKIP_ERRORS_DEFAULT); 220 221 Set<ServerName> serverNames = new HashSet<>(); 222 Path logsDirPath = getWALDirPath(); 223 224 do { 225 if (services.isStopped()) { 226 LOG.warn("Master stopped while trying to get failed servers."); 227 break; 228 } 229 try { 230 if (!this.fs.exists(logsDirPath)) return serverNames; 231 FileStatus[] logFolders = FSUtils.listStatus(this.fs, logsDirPath, null); 232 // Get online servers after getting log folders to avoid log folder deletion of newly 233 // checked in region servers . see HBASE-5916 234 Set<ServerName> onlineServers = services.getServerManager().getOnlineServers().keySet(); 235 236 if (logFolders == null || logFolders.length == 0) { 237 LOG.debug("No log files to split, proceeding..."); 238 return serverNames; 239 } 240 for (FileStatus status : logFolders) { 241 FileStatus[] curLogFiles = FSUtils.listStatus(this.fs, status.getPath(), null); 242 if (curLogFiles == null || curLogFiles.length == 0) { 243 // Empty log folder. No recovery needed 244 continue; 245 } 246 final ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName( 247 status.getPath()); 248 if (null == serverName) { 249 LOG.warn("Log folder " + status.getPath() + " doesn't look like its name includes a " + 250 "region server name; leaving in place. If you see later errors about missing " + 251 "write ahead logs they may be saved in this location."); 252 } else if (!onlineServers.contains(serverName)) { 253 LOG.info("Log folder " + status.getPath() + " doesn't belong " 254 + "to a known region server, splitting"); 255 serverNames.add(serverName); 256 } else { 257 LOG.info("Log folder " + status.getPath() + " belongs to an existing region server"); 258 } 259 } 260 retrySplitting = false; 261 } catch (IOException ioe) { 262 LOG.warn("Failed getting failed servers to be recovered.", ioe); 263 if (!checkFileSystem()) { 264 LOG.warn("Bad Filesystem, exiting"); 265 Runtime.getRuntime().halt(1); 266 } 267 try { 268 if (retrySplitting) { 269 Thread.sleep(conf.getInt("hbase.hlog.split.failure.retry.interval", 30 * 1000)); 270 } 271 } catch (InterruptedException e) { 272 LOG.warn("Interrupted, aborting since cannot return w/o splitting"); 273 Thread.currentThread().interrupt(); 274 retrySplitting = false; 275 Runtime.getRuntime().halt(1); 276 } 277 } 278 } while (retrySplitting); 279 280 return serverNames; 281 } 282 283 public void splitLog(final ServerName serverName) throws IOException { 284 splitLog(Collections.<ServerName>singleton(serverName)); 285 } 286 287 /** 288 * Specialized method to handle the splitting for meta WAL 289 * @param serverName logs belonging to this server will be split 290 */ 291 public void splitMetaLog(final ServerName serverName) throws IOException { 292 splitMetaLog(Collections.<ServerName>singleton(serverName)); 293 } 294 295 /** 296 * Specialized method to handle the splitting for meta WAL 297 * @param serverNames logs belonging to these servers will be split 298 */ 299 public void splitMetaLog(final Set<ServerName> serverNames) throws IOException { 300 splitLog(serverNames, META_FILTER); 301 } 302 303 /** 304 * @return True if a WAL directory exists (will return true also if WALs found in 305 * servername'-splitting' too). 306 */ 307 boolean isWALDirectoryNameWithWALs(ServerName serverName) { 308 FileStatus [] fss = null; 309 try { 310 // 'startsWith' will also return dirs ending in AbstractFSWALProvider.SPLITTING_EXT 311 fss = getWALDirPaths(p -> p.getName().startsWith(serverName.toString())); 312 } catch (IOException ioe) { 313 LOG.warn("{}", serverName, ioe); 314 // Something wrong reading from fs. Returning 'true' to bring on more fs activity 315 return true; 316 } 317 if (fss != null) { 318 for (FileStatus fileStatus: fss) { 319 if (fileStatus.isDirectory()) { 320 // Not testing for existence; presuming exists if we got it out of getWALDirPaths 321 // listing. I used to test for presence of WAL and return false if empty but it can be 322 // empty if a clean shutdown. Even clean shutdowns need to be recovered so the meta 323 // and namespace assigns get triggered. 324 return true; 325 } 326 } 327 } 328 return false; 329 } 330 331 /** 332 * Depends on current FS Layout! 333 * @return The Path to the WAL directory for <code>serverName</code> 334 */ 335 Path getWALDirectoryName(ServerName serverName) { 336 return new Path(this.rootDir, AbstractFSWALProvider.getWALDirectoryName(serverName.toString())); 337 } 338 339 /** 340 * Finds WAL dirs for <code>serverNames</code> and renames them with '-splitting' suffix. 341 * @return List of '-splitting' directories that pertain to <code>serverNames</code> 342 */ 343 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UL_UNRELEASED_LOCK", justification= 344 "We only release this lock when we set it. Updates to code that uses it should verify use " + 345 "of the guard boolean.") 346 List<Path> createAndGetLogDirs(final Set<ServerName> serverNames) throws IOException { 347 List<Path> logDirs = new ArrayList<>(); 348 boolean needReleaseLock = false; 349 if (!this.services.isInitialized()) { 350 // during master initialization, we could have multiple places splitting a same wal 351 // XXX: Does this still exist after we move to proc-v2? 352 this.splitLogLock.lock(); 353 needReleaseLock = true; 354 } 355 try { 356 for (ServerName serverName : serverNames) { 357 Path logDir = getWALDirectoryName(serverName); 358 // This adds the -splitting suffix to logDir. 359 Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT); 360 // Rename the directory so a rogue RS doesn't create more WALs 361 if (fs.exists(logDir)) { 362 if (!this.fs.rename(logDir, splitDir)) { 363 throw new IOException("Failed fs.rename for log split: " + logDir); 364 } 365 logDir = splitDir; 366 LOG.debug("Renamed region directory: " + splitDir); 367 } else if (!fs.exists(splitDir)) { 368 LOG.info("Log dir for server " + serverName + " does not exist"); 369 continue; 370 } 371 logDirs.add(splitDir); 372 } 373 } catch (IOException ioe) { 374 if (!checkFileSystem()) { 375 this.services.abort("Aborting due to filesystem unavailable", ioe); 376 throw ioe; 377 } 378 } finally { 379 if (needReleaseLock) { 380 this.splitLogLock.unlock(); 381 } 382 } 383 return logDirs; 384 } 385 386 public void splitLog(final Set<ServerName> serverNames) throws IOException { 387 splitLog(serverNames, NON_META_FILTER); 388 } 389 390 /** 391 * This method is the base split method that splits WAL files matching a filter. Callers should 392 * pass the appropriate filter for meta and non-meta WALs. 393 * @param serverNames logs belonging to these servers will be split; this will rename the log 394 * directory out from under a soft-failed server 395 */ 396 public void splitLog(final Set<ServerName> serverNames, PathFilter filter) throws IOException { 397 long splitTime = 0, splitLogSize = 0; 398 List<Path> logDirs = createAndGetLogDirs(serverNames); 399 400 splitLogManager.handleDeadWorkers(serverNames); 401 splitTime = EnvironmentEdgeManager.currentTime(); 402 splitLogSize = splitLogManager.splitLogDistributed(serverNames, logDirs, filter); 403 splitTime = EnvironmentEdgeManager.currentTime() - splitTime; 404 405 if (this.metricsMasterFilesystem != null) { 406 if (filter == META_FILTER) { 407 this.metricsMasterFilesystem.addMetaWALSplit(splitTime, splitLogSize); 408 } else { 409 this.metricsMasterFilesystem.addSplit(splitTime, splitLogSize); 410 } 411 } 412 } 413 414 /** 415 * For meta region open and closed normally on a server, it may leave some meta 416 * WAL in the server's wal dir. Since meta region is no long on this server, 417 * The SCP won't split those meta wals, just leaving them there. So deleting 418 * the wal dir will fail since the dir is not empty. Actually We can safely achive those 419 * meta log and Archiving the meta log and delete the dir. 420 * @param serverName the server to archive meta log 421 */ 422 public void archiveMetaLog(final ServerName serverName) { 423 try { 424 Path logDir = new Path(this.rootDir, 425 AbstractFSWALProvider.getWALDirectoryName(serverName.toString())); 426 Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT); 427 if (fs.exists(splitDir)) { 428 FileStatus[] logfiles = FSUtils.listStatus(fs, splitDir, META_FILTER); 429 if (logfiles != null) { 430 for (FileStatus status : logfiles) { 431 if (!status.isDir()) { 432 Path newPath = AbstractFSWAL.getWALArchivePath(this.oldLogDir, 433 status.getPath()); 434 if (!FSUtils.renameAndSetModifyTime(fs, status.getPath(), newPath)) { 435 LOG.warn("Unable to move " + status.getPath() + " to " + newPath); 436 } else { 437 LOG.debug("Archived meta log " + status.getPath() + " to " + newPath); 438 } 439 } 440 } 441 } 442 if (!fs.delete(splitDir, false)) { 443 LOG.warn("Unable to delete log dir. Ignoring. " + splitDir); 444 } 445 } 446 } catch (IOException ie) { 447 LOG.warn("Failed archiving meta log for server " + serverName, ie); 448 } 449 } 450 451 452}