001/**
002 *
003 * Licensed to the Apache Software Foundation (ASF) under one
004 * or more contributor license agreements.  See the NOTICE file
005 * distributed with this work for additional information
006 * regarding copyright ownership.  The ASF licenses this file
007 * to you under the Apache License, Version 2.0 (the
008 * "License"); you may not use this file except in compliance
009 * with the License.  You may obtain a copy of the License at
010 *
011 *     http://www.apache.org/licenses/LICENSE-2.0
012 *
013 * Unless required by applicable law or agreed to in writing, software
014 * distributed under the License is distributed on an "AS IS" BASIS,
015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016 * See the License for the specific language governing permissions and
017 * limitations under the License.
018 */
019package org.apache.hadoop.hbase.master;
020
021import java.io.IOException;
022import java.util.ArrayList;
023import java.util.Collections;
024import java.util.HashSet;
025import java.util.List;
026import java.util.Set;
027import java.util.concurrent.locks.Lock;
028import java.util.concurrent.locks.ReentrantLock;
029import java.util.stream.Collectors;
030import java.util.stream.Stream;
031import org.apache.hadoop.conf.Configuration;
032import org.apache.hadoop.fs.FileStatus;
033import org.apache.hadoop.fs.FileSystem;
034import org.apache.hadoop.fs.Path;
035import org.apache.hadoop.fs.PathFilter;
036import org.apache.hadoop.hbase.HConstants;
037import org.apache.hadoop.hbase.ServerName;
038import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL;
039import org.apache.hadoop.hbase.util.CommonFSUtils;
040import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
041import org.apache.hadoop.hbase.util.FSUtils;
042import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
043import org.apache.hadoop.hbase.wal.WALSplitter;
044import org.apache.yetus.audience.InterfaceAudience;
045import org.slf4j.Logger;
046import org.slf4j.LoggerFactory;
047import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
048
049/**
050 * This class abstracts a bunch of operations the HMaster needs
051 * when splitting log files e.g. finding log files, dirs etc.
052 */
053@InterfaceAudience.Private
054public class MasterWalManager {
055  private static final Logger LOG = LoggerFactory.getLogger(MasterWalManager.class);
056
057  final static PathFilter META_FILTER = new PathFilter() {
058    @Override
059    public boolean accept(Path p) {
060      return AbstractFSWALProvider.isMetaFile(p);
061    }
062  };
063
064  final static PathFilter NON_META_FILTER = new PathFilter() {
065    @Override
066    public boolean accept(Path p) {
067      return !AbstractFSWALProvider.isMetaFile(p);
068    }
069  };
070
071  // metrics for master
072  // TODO: Rename it, since those metrics are split-manager related
073  private final MetricsMasterFileSystem metricsMasterFilesystem = new MetricsMasterFileSystem();
074
075  // Keep around for convenience.
076  private final MasterServices services;
077  private final Configuration conf;
078  private final FileSystem fs;
079
080  // The Path to the old logs dir
081  private final Path oldLogDir;
082
083  /**
084   * This is the hbase rootdir.
085   * We'll put the WALs under this dir.
086   */
087  private final Path rootDir;
088
089  // create the split log lock
090  private final Lock splitLogLock = new ReentrantLock();
091  private final SplitLogManager splitLogManager;
092
093  // Is the fileystem ok?
094  private volatile boolean fsOk = true;
095
096  public MasterWalManager(MasterServices services) throws IOException {
097    this(services.getConfiguration(), services.getMasterFileSystem().getWALFileSystem(), services);
098  }
099
100  public MasterWalManager(Configuration conf, FileSystem fs,  MasterServices services)
101      throws IOException {
102    this.fs = fs;
103    this.conf = conf;
104    this.rootDir = CommonFSUtils.getWALRootDir(conf);
105    this.services = services;
106    this.splitLogManager = new SplitLogManager(services, conf);
107
108    this.oldLogDir = new Path(rootDir, HConstants.HREGION_OLDLOGDIR_NAME);
109  }
110
111  public void stop() {
112    if (splitLogManager != null) {
113      splitLogManager.stop();
114    }
115  }
116
117  @VisibleForTesting
118  SplitLogManager getSplitLogManager() {
119    return this.splitLogManager;
120  }
121
122  /**
123   * Get the directory where old logs go
124   * @return the dir
125   */
126  Path getOldLogDir() {
127    return this.oldLogDir;
128  }
129
130  public FileSystem getFileSystem() {
131    return this.fs;
132  }
133
134  /**
135   * Checks to see if the file system is still accessible.
136   * If not, sets closed
137   * @return false if file system is not available
138   */
139  private boolean checkFileSystem() {
140    if (this.fsOk) {
141      try {
142        FSUtils.checkFileSystemAvailable(this.fs);
143        FSUtils.checkDfsSafeMode(this.conf);
144      } catch (IOException e) {
145        services.abort("Shutting down HBase cluster: file system not available", e);
146        this.fsOk = false;
147      }
148    }
149    return this.fsOk;
150  }
151
152  /**
153   * Get Servernames which are currently splitting; paths have a '-splitting' suffix.
154   * @return ServerName
155   * @throws IOException IOException
156   */
157  public Set<ServerName> getSplittingServersFromWALDir() throws  IOException {
158    return getServerNamesFromWALDirPath(
159      p -> p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT));
160  }
161
162  /**
163   * Get Servernames that COULD BE 'alive'; excludes those that have a '-splitting' suffix as these
164   * are already being split -- they cannot be 'alive'.
165   * @return ServerName
166   * @throws IOException IOException
167   */
168  public Set<ServerName> getLiveServersFromWALDir() throws IOException {
169    return getServerNamesFromWALDirPath(
170      p -> !p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT));
171  }
172
173  /**
174   * @return listing of ServerNames found by parsing WAL directory paths in FS.
175   *
176   */
177  public Set<ServerName> getServerNamesFromWALDirPath(final PathFilter filter) throws IOException {
178    FileStatus[] walDirForServerNames = getWALDirPaths(filter);
179    return Stream.of(walDirForServerNames).map(s -> {
180      ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(s.getPath());
181      if (serverName == null) {
182        LOG.warn("Log folder {} doesn't look like its name includes a " +
183          "region server name; leaving in place. If you see later errors about missing " +
184          "write ahead logs they may be saved in this location.", s.getPath());
185        return null;
186      }
187      return serverName;
188    }).filter(s -> s != null).collect(Collectors.toSet());
189  }
190
191  /**
192   * @return Returns the WALs dir under <code>rootDir</code>
193   * @throws IOException
194   */
195  Path getWALDirPath() throws IOException {
196    return new Path(CommonFSUtils.getWALRootDir(conf), HConstants.HREGION_LOGDIR_NAME);
197  }
198
199  /**
200   * @return List of all RegionServer WAL dirs; i.e. this.rootDir/HConstants.HREGION_LOGDIR_NAME.
201   */
202  public FileStatus[] getWALDirPaths(final PathFilter filter) throws IOException {
203    Path walDirPath = getWALDirPath();
204    FileStatus[] walDirForServerNames = FSUtils.listStatus(fs, walDirPath, filter);
205    return walDirForServerNames == null? new FileStatus[0]: walDirForServerNames;
206  }
207
208  /**
209   * Inspect the log directory to find dead servers which need recovery work
210   * @return A set of ServerNames which aren't running but still have WAL files left in file system
211   * @deprecated With proc-v2, we can record the crash server with procedure store, so do not need
212   *             to scan the wal directory to find out the splitting wal directory any more. Leave
213   *             it here only because {@code RecoverMetaProcedure}(which is also deprecated) uses
214   *             it.
215   */
216  @Deprecated
217  public Set<ServerName> getFailedServersFromLogFolders() throws IOException {
218    boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
219        WALSplitter.SPLIT_SKIP_ERRORS_DEFAULT);
220
221    Set<ServerName> serverNames = new HashSet<>();
222    Path logsDirPath = getWALDirPath();
223
224    do {
225      if (services.isStopped()) {
226        LOG.warn("Master stopped while trying to get failed servers.");
227        break;
228      }
229      try {
230        if (!this.fs.exists(logsDirPath)) return serverNames;
231        FileStatus[] logFolders = FSUtils.listStatus(this.fs, logsDirPath, null);
232        // Get online servers after getting log folders to avoid log folder deletion of newly
233        // checked in region servers . see HBASE-5916
234        Set<ServerName> onlineServers = services.getServerManager().getOnlineServers().keySet();
235
236        if (logFolders == null || logFolders.length == 0) {
237          LOG.debug("No log files to split, proceeding...");
238          return serverNames;
239        }
240        for (FileStatus status : logFolders) {
241          FileStatus[] curLogFiles = FSUtils.listStatus(this.fs, status.getPath(), null);
242          if (curLogFiles == null || curLogFiles.length == 0) {
243            // Empty log folder. No recovery needed
244            continue;
245          }
246          final ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(
247              status.getPath());
248          if (null == serverName) {
249            LOG.warn("Log folder " + status.getPath() + " doesn't look like its name includes a " +
250                "region server name; leaving in place. If you see later errors about missing " +
251                "write ahead logs they may be saved in this location.");
252          } else if (!onlineServers.contains(serverName)) {
253            LOG.info("Log folder " + status.getPath() + " doesn't belong "
254                + "to a known region server, splitting");
255            serverNames.add(serverName);
256          } else {
257            LOG.info("Log folder " + status.getPath() + " belongs to an existing region server");
258          }
259        }
260        retrySplitting = false;
261      } catch (IOException ioe) {
262        LOG.warn("Failed getting failed servers to be recovered.", ioe);
263        if (!checkFileSystem()) {
264          LOG.warn("Bad Filesystem, exiting");
265          Runtime.getRuntime().halt(1);
266        }
267        try {
268          if (retrySplitting) {
269            Thread.sleep(conf.getInt("hbase.hlog.split.failure.retry.interval", 30 * 1000));
270          }
271        } catch (InterruptedException e) {
272          LOG.warn("Interrupted, aborting since cannot return w/o splitting");
273          Thread.currentThread().interrupt();
274          retrySplitting = false;
275          Runtime.getRuntime().halt(1);
276        }
277      }
278    } while (retrySplitting);
279
280    return serverNames;
281  }
282
283  public void splitLog(final ServerName serverName) throws IOException {
284    splitLog(Collections.<ServerName>singleton(serverName));
285  }
286
287  /**
288   * Specialized method to handle the splitting for meta WAL
289   * @param serverName logs belonging to this server will be split
290   */
291  public void splitMetaLog(final ServerName serverName) throws IOException {
292    splitMetaLog(Collections.<ServerName>singleton(serverName));
293  }
294
295  /**
296   * Specialized method to handle the splitting for meta WAL
297   * @param serverNames logs belonging to these servers will be split
298   */
299  public void splitMetaLog(final Set<ServerName> serverNames) throws IOException {
300    splitLog(serverNames, META_FILTER);
301  }
302
303  /**
304   * @return True if a WAL directory exists (will return true also if WALs found in
305   *   servername'-splitting' too).
306   */
307  boolean isWALDirectoryNameWithWALs(ServerName serverName) {
308    FileStatus [] fss = null;
309    try {
310      // 'startsWith' will also return dirs ending in AbstractFSWALProvider.SPLITTING_EXT
311      fss = getWALDirPaths(p -> p.getName().startsWith(serverName.toString()));
312    } catch (IOException ioe) {
313      LOG.warn("{}", serverName, ioe);
314      // Something wrong reading from fs. Returning 'true' to bring on more fs activity
315      return true;
316    }
317    if (fss != null) {
318      for (FileStatus fileStatus: fss) {
319        if (fileStatus.isDirectory()) {
320          // Not testing for existence; presuming exists if we got it out of getWALDirPaths
321          // listing. I used to test for presence of WAL and return false if empty but it can be
322          // empty if a clean shutdown. Even clean shutdowns need to be recovered so the meta
323          // and namespace assigns get triggered.
324          return true;
325        }
326      }
327    }
328    return false;
329  }
330
331  /**
332   * Depends on current FS Layout!
333   * @return The Path to the WAL directory for <code>serverName</code>
334   */
335  Path getWALDirectoryName(ServerName serverName) {
336    return new Path(this.rootDir, AbstractFSWALProvider.getWALDirectoryName(serverName.toString()));
337  }
338
339  /**
340   * Finds WAL dirs for <code>serverNames</code> and renames them with '-splitting' suffix.
341   * @return List of '-splitting' directories that pertain to <code>serverNames</code>
342   */
343  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UL_UNRELEASED_LOCK", justification=
344      "We only release this lock when we set it. Updates to code that uses it should verify use " +
345      "of the guard boolean.")
346  List<Path> createAndGetLogDirs(final Set<ServerName> serverNames) throws IOException {
347    List<Path> logDirs = new ArrayList<>();
348    boolean needReleaseLock = false;
349    if (!this.services.isInitialized()) {
350      // during master initialization, we could have multiple places splitting a same wal
351      // XXX: Does this still exist after we move to proc-v2?
352      this.splitLogLock.lock();
353      needReleaseLock = true;
354    }
355    try {
356      for (ServerName serverName : serverNames) {
357        Path logDir = getWALDirectoryName(serverName);
358        // This adds the -splitting suffix to logDir.
359        Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
360        // Rename the directory so a rogue RS doesn't create more WALs
361        if (fs.exists(logDir)) {
362          if (!this.fs.rename(logDir, splitDir)) {
363            throw new IOException("Failed fs.rename for log split: " + logDir);
364          }
365          logDir = splitDir;
366          LOG.debug("Renamed region directory: " + splitDir);
367        } else if (!fs.exists(splitDir)) {
368          LOG.info("Log dir for server " + serverName + " does not exist");
369          continue;
370        }
371        logDirs.add(splitDir);
372      }
373    } catch (IOException ioe) {
374      if (!checkFileSystem()) {
375        this.services.abort("Aborting due to filesystem unavailable", ioe);
376        throw ioe;
377      }
378    } finally {
379      if (needReleaseLock) {
380        this.splitLogLock.unlock();
381      }
382    }
383    return logDirs;
384  }
385
386  public void splitLog(final Set<ServerName> serverNames) throws IOException {
387    splitLog(serverNames, NON_META_FILTER);
388  }
389
390  /**
391   * This method is the base split method that splits WAL files matching a filter. Callers should
392   * pass the appropriate filter for meta and non-meta WALs.
393   * @param serverNames logs belonging to these servers will be split; this will rename the log
394   *                    directory out from under a soft-failed server
395   */
396  public void splitLog(final Set<ServerName> serverNames, PathFilter filter) throws IOException {
397    long splitTime = 0, splitLogSize = 0;
398    List<Path> logDirs = createAndGetLogDirs(serverNames);
399
400    splitLogManager.handleDeadWorkers(serverNames);
401    splitTime = EnvironmentEdgeManager.currentTime();
402    splitLogSize = splitLogManager.splitLogDistributed(serverNames, logDirs, filter);
403    splitTime = EnvironmentEdgeManager.currentTime() - splitTime;
404
405    if (this.metricsMasterFilesystem != null) {
406      if (filter == META_FILTER) {
407        this.metricsMasterFilesystem.addMetaWALSplit(splitTime, splitLogSize);
408      } else {
409        this.metricsMasterFilesystem.addSplit(splitTime, splitLogSize);
410      }
411    }
412  }
413
414  /**
415   * For meta region open and closed normally on a server, it may leave some meta
416   * WAL in the server's wal dir. Since meta region is no long on this server,
417   * The SCP won't split those meta wals, just leaving them there. So deleting
418   * the wal dir will fail since the dir is not empty. Actually We can safely achive those
419   * meta log and Archiving the meta log and delete the dir.
420   * @param serverName the server to archive meta log
421   */
422  public void archiveMetaLog(final ServerName serverName) {
423    try {
424      Path logDir = new Path(this.rootDir,
425          AbstractFSWALProvider.getWALDirectoryName(serverName.toString()));
426      Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
427      if (fs.exists(splitDir)) {
428        FileStatus[] logfiles = FSUtils.listStatus(fs, splitDir, META_FILTER);
429        if (logfiles != null) {
430          for (FileStatus status : logfiles) {
431            if (!status.isDir()) {
432              Path newPath = AbstractFSWAL.getWALArchivePath(this.oldLogDir,
433                  status.getPath());
434              if (!FSUtils.renameAndSetModifyTime(fs, status.getPath(), newPath)) {
435                LOG.warn("Unable to move  " + status.getPath() + " to " + newPath);
436              } else {
437                LOG.debug("Archived meta log " + status.getPath() + " to " + newPath);
438              }
439            }
440          }
441        }
442        if (!fs.delete(splitDir, false)) {
443          LOG.warn("Unable to delete log dir. Ignoring. " + splitDir);
444        }
445      }
446    } catch (IOException ie) {
447      LOG.warn("Failed archiving meta log for server " + serverName, ie);
448    }
449  }
450
451
452}