001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master;
019
020import java.io.IOException;
021import java.util.ArrayList;
022import java.util.Collections;
023import java.util.HashSet;
024import java.util.List;
025import java.util.Set;
026import java.util.concurrent.locks.Lock;
027import java.util.concurrent.locks.ReentrantLock;
028import java.util.stream.Collectors;
029import java.util.stream.Stream;
030import org.apache.hadoop.conf.Configuration;
031import org.apache.hadoop.fs.FileStatus;
032import org.apache.hadoop.fs.FileSystem;
033import org.apache.hadoop.fs.Path;
034import org.apache.hadoop.fs.PathFilter;
035import org.apache.hadoop.hbase.HConstants;
036import org.apache.hadoop.hbase.ServerName;
037import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL;
038import org.apache.hadoop.hbase.util.CommonFSUtils;
039import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
040import org.apache.hadoop.hbase.util.FSUtils;
041import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
042import org.apache.hadoop.hbase.wal.WALSplitter;
043import org.apache.yetus.audience.InterfaceAudience;
044import org.slf4j.Logger;
045import org.slf4j.LoggerFactory;
046
047/**
048 * This class abstracts a bunch of operations the HMaster needs when splitting log files e.g.
049 * finding log files, dirs etc.
050 */
051@InterfaceAudience.Private
052public class MasterWalManager {
053  private static final Logger LOG = LoggerFactory.getLogger(MasterWalManager.class);
054
055  /**
056   * Filter *in* WAL files that are for the hbase:meta Region.
057   */
058  final static PathFilter META_FILTER = new PathFilter() {
059    @Override
060    public boolean accept(Path p) {
061      return AbstractFSWALProvider.isMetaFile(p);
062    }
063  };
064
065  /**
066   * Filter *out* WAL files that are for the hbase:meta Region; i.e. return user-space WALs only.
067   */
068  public final static PathFilter NON_META_FILTER = new PathFilter() {
069    @Override
070    public boolean accept(Path p) {
071      return !AbstractFSWALProvider.isMetaFile(p);
072    }
073  };
074
075  // metrics for master
076  // TODO: Rename it, since those metrics are split-manager related
077  private final MetricsMasterFileSystem metricsMasterFilesystem = new MetricsMasterFileSystem();
078
079  // Keep around for convenience.
080  private final MasterServices services;
081  private final Configuration conf;
082  private final FileSystem fs;
083
084  // The Path to the old logs dir
085  private final Path oldLogDir;
086
087  private final Path rootDir;
088
089  // create the split log lock
090  private final Lock splitLogLock = new ReentrantLock();
091
092  // old WALs directory size in bytes
093  private long oldWALsDirSize;
094
095  /**
096   * Superceded by {@link SplitWALManager}; i.e. procedure-based WAL splitting rather than 'classic'
097   * zk-coordinated WAL splitting.
098   * @deprecated since 2.3.0 and 3.0.0 to be removed in 4.0.0; replaced by {@link SplitWALManager}.
099   * @see SplitWALManager
100   */
101  @Deprecated
102  private final SplitLogManager splitLogManager;
103
104  // Is the fileystem ok?
105  private volatile boolean fsOk = true;
106
107  public MasterWalManager(MasterServices services) throws IOException {
108    this(services.getConfiguration(), services.getMasterFileSystem().getWALFileSystem(),
109      services.getMasterFileSystem().getWALRootDir(), services);
110  }
111
112  public MasterWalManager(Configuration conf, FileSystem fs, Path rootDir, MasterServices services)
113    throws IOException {
114    this.fs = fs;
115    this.conf = conf;
116    this.rootDir = rootDir;
117    this.services = services;
118    this.splitLogManager = new SplitLogManager(services, conf);
119    this.oldLogDir = new Path(rootDir, HConstants.HREGION_OLDLOGDIR_NAME);
120    this.oldWALsDirSize = 0;
121  }
122
123  public void stop() {
124    if (splitLogManager != null) {
125      splitLogManager.stop();
126    }
127  }
128
129  SplitLogManager getSplitLogManager() {
130    return this.splitLogManager;
131  }
132
133  /**
134   * Get the directory where old logs go
135   * @return the dir
136   */
137  Path getOldLogDir() {
138    return this.oldLogDir;
139  }
140
141  public void updateOldWALsDirSize() throws IOException {
142    this.oldWALsDirSize = fs.getContentSummary(this.oldLogDir).getLength();
143  }
144
145  public long getOldWALsDirSize() {
146    return this.oldWALsDirSize;
147  }
148
149  public FileSystem getFileSystem() {
150    return this.fs;
151  }
152
153  /**
154   * Checks to see if the file system is still accessible. If not, sets closed
155   * @return false if file system is not available
156   */
157  private boolean checkFileSystem() {
158    if (this.fsOk) {
159      try {
160        FSUtils.checkFileSystemAvailable(this.fs);
161        FSUtils.checkDfsSafeMode(this.conf);
162      } catch (IOException e) {
163        services.abort("Shutting down HBase cluster: file system not available", e);
164        this.fsOk = false;
165      }
166    }
167    return this.fsOk;
168  }
169
170  /**
171   * Get Servernames which are currently splitting; paths have a '-splitting' suffix.
172   */
173  public Set<ServerName> getSplittingServersFromWALDir() throws IOException {
174    return getServerNamesFromWALDirPath(
175      p -> p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT));
176  }
177
178  /**
179   * Get Servernames that COULD BE 'alive'; excludes those that have a '-splitting' suffix as these
180   * are already being split -- they cannot be 'alive'.
181   */
182  public Set<ServerName> getLiveServersFromWALDir() throws IOException {
183    return getServerNamesFromWALDirPath(
184      p -> !p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT));
185  }
186
187  /** Returns listing of ServerNames found by parsing WAL directory paths in FS. */
188  public Set<ServerName> getServerNamesFromWALDirPath(final PathFilter filter) throws IOException {
189    FileStatus[] walDirForServerNames = getWALDirPaths(filter);
190    return Stream.of(walDirForServerNames).map(s -> {
191      ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(s.getPath());
192      if (serverName == null) {
193        LOG.warn("Log folder {} doesn't look like its name includes a "
194          + "region server name; leaving in place. If you see later errors about missing "
195          + "write ahead logs they may be saved in this location.", s.getPath());
196        return null;
197      }
198      return serverName;
199    }).filter(s -> s != null).collect(Collectors.toSet());
200  }
201
202  /**
203   * Returns List of all RegionServer WAL dirs; i.e. this.rootDir/HConstants.HREGION_LOGDIR_NAME.
204   */
205  public FileStatus[] getWALDirPaths(final PathFilter filter) throws IOException {
206    Path walDirPath = new Path(CommonFSUtils.getWALRootDir(conf), HConstants.HREGION_LOGDIR_NAME);
207    FileStatus[] walDirForServerNames = CommonFSUtils.listStatus(fs, walDirPath, filter);
208    return walDirForServerNames == null ? new FileStatus[0] : walDirForServerNames;
209  }
210
211  /**
212   * Inspect the log directory to find dead servers which need recovery work
213   * @return A set of ServerNames which aren't running but still have WAL files left in file system
214   * @deprecated With proc-v2, we can record the crash server with procedure store, so do not need
215   *             to scan the wal directory to find out the splitting wal directory any more. Leave
216   *             it here only because {@code RecoverMetaProcedure}(which is also deprecated) uses
217   *             it.
218   */
219  @Deprecated
220  public Set<ServerName> getFailedServersFromLogFolders() throws IOException {
221    boolean retrySplitting =
222      !conf.getBoolean(WALSplitter.SPLIT_SKIP_ERRORS_KEY, WALSplitter.SPLIT_SKIP_ERRORS_DEFAULT);
223
224    Set<ServerName> serverNames = new HashSet<>();
225    Path logsDirPath = new Path(CommonFSUtils.getWALRootDir(conf), HConstants.HREGION_LOGDIR_NAME);
226
227    do {
228      if (services.isStopped()) {
229        LOG.warn("Master stopped while trying to get failed servers.");
230        break;
231      }
232      try {
233        if (!this.fs.exists(logsDirPath)) return serverNames;
234        FileStatus[] logFolders = CommonFSUtils.listStatus(this.fs, logsDirPath, null);
235        // Get online servers after getting log folders to avoid log folder deletion of newly
236        // checked in region servers . see HBASE-5916
237        Set<ServerName> onlineServers = services.getServerManager().getOnlineServers().keySet();
238
239        if (logFolders == null || logFolders.length == 0) {
240          LOG.debug("No log files to split, proceeding...");
241          return serverNames;
242        }
243        for (FileStatus status : logFolders) {
244          FileStatus[] curLogFiles = CommonFSUtils.listStatus(this.fs, status.getPath(), null);
245          if (curLogFiles == null || curLogFiles.length == 0) {
246            // Empty log folder. No recovery needed
247            continue;
248          }
249          final ServerName serverName =
250            AbstractFSWALProvider.getServerNameFromWALDirectoryName(status.getPath());
251          if (null == serverName) {
252            LOG.warn("Log folder " + status.getPath() + " doesn't look like its name includes a "
253              + "region server name; leaving in place. If you see later errors about missing "
254              + "write ahead logs they may be saved in this location.");
255          } else if (!onlineServers.contains(serverName)) {
256            LOG.info("Log folder " + status.getPath() + " doesn't belong "
257              + "to a known region server, splitting");
258            serverNames.add(serverName);
259          } else {
260            LOG.info("Log folder " + status.getPath() + " belongs to an existing region server");
261          }
262        }
263        retrySplitting = false;
264      } catch (IOException ioe) {
265        LOG.warn("Failed getting failed servers to be recovered.", ioe);
266        if (!checkFileSystem()) {
267          LOG.warn("Bad Filesystem, exiting");
268          Runtime.getRuntime().halt(1);
269        }
270        try {
271          if (retrySplitting) {
272            Thread.sleep(conf.getInt("hbase.hlog.split.failure.retry.interval", 30 * 1000));
273          }
274        } catch (InterruptedException e) {
275          LOG.warn("Interrupted, aborting since cannot return w/o splitting");
276          Thread.currentThread().interrupt();
277          retrySplitting = false;
278          Runtime.getRuntime().halt(1);
279        }
280      }
281    } while (retrySplitting);
282
283    return serverNames;
284  }
285
286  public void splitLog(final ServerName serverName) throws IOException {
287    splitLog(Collections.<ServerName> singleton(serverName));
288  }
289
290  /**
291   * Specialized method to handle the splitting for meta WAL
292   * @param serverName logs belonging to this server will be split
293   */
294  public void splitMetaLog(final ServerName serverName) throws IOException {
295    splitMetaLog(Collections.<ServerName> singleton(serverName));
296  }
297
298  /**
299   * Specialized method to handle the splitting for meta WAL
300   * @param serverNames logs belonging to these servers will be split
301   */
302  public void splitMetaLog(final Set<ServerName> serverNames) throws IOException {
303    splitLog(serverNames, META_FILTER);
304  }
305
306  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "UL_UNRELEASED_LOCK",
307      justification = "We only release this lock when we set it. Updates to code "
308        + "that uses it should verify use of the guard boolean.")
309  List<Path> getLogDirs(final Set<ServerName> serverNames) throws IOException {
310    List<Path> logDirs = new ArrayList<>();
311    boolean needReleaseLock = false;
312    if (!this.services.isInitialized()) {
313      // during master initialization, we could have multiple places splitting a same wal
314      // XXX: Does this still exist after we move to proc-v2?
315      this.splitLogLock.lock();
316      needReleaseLock = true;
317    }
318    try {
319      for (ServerName serverName : serverNames) {
320        Path logDir =
321          new Path(this.rootDir, AbstractFSWALProvider.getWALDirectoryName(serverName.toString()));
322        Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
323        // Rename the directory so a rogue RS doesn't create more WALs
324        if (fs.exists(logDir)) {
325          if (!this.fs.rename(logDir, splitDir)) {
326            throw new IOException("Failed fs.rename for log split: " + logDir);
327          }
328          logDir = splitDir;
329          LOG.debug("Renamed region directory: " + splitDir);
330        } else if (!fs.exists(splitDir)) {
331          LOG.info("Log dir for server " + serverName + " does not exist");
332          continue;
333        }
334        logDirs.add(splitDir);
335      }
336    } catch (IOException ioe) {
337      if (!checkFileSystem()) {
338        this.services.abort("Aborting due to filesystem unavailable", ioe);
339        throw ioe;
340      }
341    } finally {
342      if (needReleaseLock) {
343        this.splitLogLock.unlock();
344      }
345    }
346    return logDirs;
347  }
348
349  public void splitLog(final Set<ServerName> serverNames) throws IOException {
350    splitLog(serverNames, NON_META_FILTER);
351  }
352
353  /**
354   * This method is the base split method that splits WAL files matching a filter. Callers should
355   * pass the appropriate filter for meta and non-meta WALs.
356   * @param serverNames logs belonging to these servers will be split; this will rename the log
357   *                    directory out from under a soft-failed server
358   */
359  public void splitLog(final Set<ServerName> serverNames, PathFilter filter) throws IOException {
360    long splitTime = 0, splitLogSize = 0;
361    List<Path> logDirs = getLogDirs(serverNames);
362
363    splitLogManager.handleDeadWorkers(serverNames);
364    splitTime = EnvironmentEdgeManager.currentTime();
365    splitLogSize = splitLogManager.splitLogDistributed(serverNames, logDirs, filter);
366    splitTime = EnvironmentEdgeManager.currentTime() - splitTime;
367
368    if (this.metricsMasterFilesystem != null) {
369      if (filter == META_FILTER) {
370        this.metricsMasterFilesystem.addMetaWALSplit(splitTime, splitLogSize);
371      } else {
372        this.metricsMasterFilesystem.addSplit(splitTime, splitLogSize);
373      }
374    }
375  }
376
377  /**
378   * The hbase:meta region may OPEN and CLOSE without issue on a server and then move elsewhere. On
379   * CLOSE, the WAL for the hbase:meta table may not be archived yet (The WAL is only needed if
380   * hbase:meta did not close cleanaly). Since meta region is no long on this server, the
381   * ServerCrashProcedure won't split these leftover hbase:meta WALs, just leaving them in the WAL
382   * splitting dir. If we try to delete the WAL splitting for the server, it fail since the dir is
383   * not totally empty. We can safely archive these hbase:meta log; then the WAL dir can be deleted.
384   * @param serverName the server to archive meta log
385   */
386  public void archiveMetaLog(final ServerName serverName) {
387    try {
388      Path logDir =
389        new Path(this.rootDir, AbstractFSWALProvider.getWALDirectoryName(serverName.toString()));
390      Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
391      if (fs.exists(splitDir)) {
392        FileStatus[] logfiles = CommonFSUtils.listStatus(fs, splitDir, META_FILTER);
393        if (logfiles != null) {
394          for (FileStatus status : logfiles) {
395            if (!status.isDir()) {
396              Path newPath = AbstractFSWAL.getWALArchivePath(this.oldLogDir, status.getPath());
397              if (!CommonFSUtils.renameAndSetModifyTime(fs, status.getPath(), newPath)) {
398                LOG.warn("Unable to move  " + status.getPath() + " to " + newPath);
399              } else {
400                LOG.debug("Archived meta log " + status.getPath() + " to " + newPath);
401              }
402            }
403          }
404        }
405        if (!fs.delete(splitDir, false)) {
406          LOG.warn("Unable to delete log dir. Ignoring. " + splitDir);
407        }
408      }
409    } catch (IOException ie) {
410      LOG.warn("Failed archiving meta log for server " + serverName, ie);
411    }
412  }
413}