001/**
002 *
003 * Licensed to the Apache Software Foundation (ASF) under one
004 * or more contributor license agreements.  See the NOTICE file
005 * distributed with this work for additional information
006 * regarding copyright ownership.  The ASF licenses this file
007 * to you under the Apache License, Version 2.0 (the
008 * "License"); you may not use this file except in compliance
009 * with the License.  You may obtain a copy of the License at
010 *
011 *     http://www.apache.org/licenses/LICENSE-2.0
012 *
013 * Unless required by applicable law or agreed to in writing, software
014 * distributed under the License is distributed on an "AS IS" BASIS,
015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016 * See the License for the specific language governing permissions and
017 * limitations under the License.
018 */
019package org.apache.hadoop.hbase.master;
020
021import java.io.IOException;
022import java.util.ArrayList;
023import java.util.Collections;
024import java.util.HashSet;
025import java.util.List;
026import java.util.Set;
027import java.util.concurrent.locks.Lock;
028import java.util.concurrent.locks.ReentrantLock;
029import java.util.stream.Collectors;
030import java.util.stream.Stream;
031import org.apache.hadoop.conf.Configuration;
032import org.apache.hadoop.fs.FileStatus;
033import org.apache.hadoop.fs.FileSystem;
034import org.apache.hadoop.fs.Path;
035import org.apache.hadoop.fs.PathFilter;
036import org.apache.hadoop.hbase.HConstants;
037import org.apache.hadoop.hbase.ServerName;
038import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL;
039import org.apache.hadoop.hbase.util.CommonFSUtils;
040import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
041import org.apache.hadoop.hbase.util.FSUtils;
042import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
043import org.apache.hadoop.hbase.wal.WALSplitter;
044import org.apache.yetus.audience.InterfaceAudience;
045import org.slf4j.Logger;
046import org.slf4j.LoggerFactory;
047import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
048
049/**
050 * This class abstracts a bunch of operations the HMaster needs
051 * when splitting log files e.g. finding log files, dirs etc.
052 */
053@InterfaceAudience.Private
054public class MasterWalManager {
055  private static final Logger LOG = LoggerFactory.getLogger(MasterWalManager.class);
056
057  final static PathFilter META_FILTER = new PathFilter() {
058    @Override
059    public boolean accept(Path p) {
060      return AbstractFSWALProvider.isMetaFile(p);
061    }
062  };
063
064  @VisibleForTesting
065  public final static PathFilter NON_META_FILTER = new PathFilter() {
066    @Override
067    public boolean accept(Path p) {
068      return !AbstractFSWALProvider.isMetaFile(p);
069    }
070  };
071
072  // metrics for master
073  // TODO: Rename it, since those metrics are split-manager related
074  private final MetricsMasterFileSystem metricsMasterFilesystem = new MetricsMasterFileSystem();
075
076  // Keep around for convenience.
077  private final MasterServices services;
078  private final Configuration conf;
079  private final FileSystem fs;
080
081  // The Path to the old logs dir
082  private final Path oldLogDir;
083  private final Path rootDir;
084
085  // create the split log lock
086  private final Lock splitLogLock = new ReentrantLock();
087  private final SplitLogManager splitLogManager;
088
  // Is the filesystem ok?
090  private volatile boolean fsOk = true;
091
092  public MasterWalManager(MasterServices services) throws IOException {
093    this(services.getConfiguration(), services.getMasterFileSystem().getWALFileSystem(), services);
094  }
095
096  public MasterWalManager(Configuration conf, FileSystem fs,  MasterServices services)
097      throws IOException {
098    this.fs = fs;
099    this.conf = conf;
100    this.rootDir = CommonFSUtils.getWALRootDir(conf);
101    this.services = services;
102    this.splitLogManager = new SplitLogManager(services, conf);
103
104    this.oldLogDir = new Path(rootDir, HConstants.HREGION_OLDLOGDIR_NAME);
105  }
106
107  public void stop() {
108    if (splitLogManager != null) {
109      splitLogManager.stop();
110    }
111  }
112
113  @VisibleForTesting
114  SplitLogManager getSplitLogManager() {
115    return this.splitLogManager;
116  }
117
118  /**
119   * Get the directory where old logs go
120   * @return the dir
121   */
122  Path getOldLogDir() {
123    return this.oldLogDir;
124  }
125
126  public FileSystem getFileSystem() {
127    return this.fs;
128  }
129
130  /**
131   * Checks to see if the file system is still accessible.
132   * If not, sets closed
133   * @return false if file system is not available
134   */
135  private boolean checkFileSystem() {
136    if (this.fsOk) {
137      try {
138        FSUtils.checkFileSystemAvailable(this.fs);
139        FSUtils.checkDfsSafeMode(this.conf);
140      } catch (IOException e) {
141        services.abort("Shutting down HBase cluster: file system not available", e);
142        this.fsOk = false;
143      }
144    }
145    return this.fsOk;
146  }
147
148  /**
149   * Get Servernames which are currently splitting; paths have a '-splitting' suffix.
150   * @return ServerName
151   * @throws IOException IOException
152   */
153  public Set<ServerName> getSplittingServersFromWALDir() throws  IOException {
154    return getServerNamesFromWALDirPath(
155      p -> p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT));
156  }
157
158  /**
159   * Get Servernames that COULD BE 'alive'; excludes those that have a '-splitting' suffix as these
160   * are already being split -- they cannot be 'alive'.
161   * @return ServerName
162   * @throws IOException IOException
163   */
164  public Set<ServerName> getLiveServersFromWALDir() throws IOException {
165    return getServerNamesFromWALDirPath(
166      p -> !p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT));
167  }
168
169  /**
170   * @return listing of ServerNames found by parsing WAL directory paths in FS.
171   */
172  public Set<ServerName> getServerNamesFromWALDirPath(final PathFilter filter) throws IOException {
173    FileStatus[] walDirForServerNames = getWALDirPaths(filter);
174    return Stream.of(walDirForServerNames).map(s -> {
175      ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(s.getPath());
176      if (serverName == null) {
177        LOG.warn("Log folder {} doesn't look like its name includes a " +
178          "region server name; leaving in place. If you see later errors about missing " +
179          "write ahead logs they may be saved in this location.", s.getPath());
180        return null;
181      }
182      return serverName;
183    }).filter(s -> s != null).collect(Collectors.toSet());
184  }
185
186  /**
187   * @return List of all RegionServer WAL dirs; i.e. this.rootDir/HConstants.HREGION_LOGDIR_NAME.
188   */
189  public FileStatus[] getWALDirPaths(final PathFilter filter) throws IOException {
190    Path walDirPath = new Path(CommonFSUtils.getWALRootDir(conf), HConstants.HREGION_LOGDIR_NAME);
191    FileStatus[] walDirForServerNames = FSUtils.listStatus(fs, walDirPath, filter);
192    return walDirForServerNames == null? new FileStatus[0]: walDirForServerNames;
193  }
194
195  /**
196   * Inspect the log directory to find dead servers which need recovery work
197   * @return A set of ServerNames which aren't running but still have WAL files left in file system
198   * @deprecated With proc-v2, we can record the crash server with procedure store, so do not need
199   *             to scan the wal directory to find out the splitting wal directory any more. Leave
200   *             it here only because {@code RecoverMetaProcedure}(which is also deprecated) uses
201   *             it.
202   */
203  @Deprecated
204  public Set<ServerName> getFailedServersFromLogFolders() throws IOException {
205    boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
206        WALSplitter.SPLIT_SKIP_ERRORS_DEFAULT);
207
208    Set<ServerName> serverNames = new HashSet<>();
209    Path logsDirPath = new Path(this.rootDir, HConstants.HREGION_LOGDIR_NAME);
210
211    do {
212      if (services.isStopped()) {
213        LOG.warn("Master stopped while trying to get failed servers.");
214        break;
215      }
216      try {
217        if (!this.fs.exists(logsDirPath)) return serverNames;
218        FileStatus[] logFolders = FSUtils.listStatus(this.fs, logsDirPath, null);
219        // Get online servers after getting log folders to avoid log folder deletion of newly
220        // checked in region servers . see HBASE-5916
221        Set<ServerName> onlineServers = services.getServerManager().getOnlineServers().keySet();
222
223        if (logFolders == null || logFolders.length == 0) {
224          LOG.debug("No log files to split, proceeding...");
225          return serverNames;
226        }
227        for (FileStatus status : logFolders) {
228          FileStatus[] curLogFiles = FSUtils.listStatus(this.fs, status.getPath(), null);
229          if (curLogFiles == null || curLogFiles.length == 0) {
230            // Empty log folder. No recovery needed
231            continue;
232          }
233          final ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(
234              status.getPath());
235          if (null == serverName) {
236            LOG.warn("Log folder " + status.getPath() + " doesn't look like its name includes a " +
237                "region server name; leaving in place. If you see later errors about missing " +
238                "write ahead logs they may be saved in this location.");
239          } else if (!onlineServers.contains(serverName)) {
240            LOG.info("Log folder " + status.getPath() + " doesn't belong "
241                + "to a known region server, splitting");
242            serverNames.add(serverName);
243          } else {
244            LOG.info("Log folder " + status.getPath() + " belongs to an existing region server");
245          }
246        }
247        retrySplitting = false;
248      } catch (IOException ioe) {
249        LOG.warn("Failed getting failed servers to be recovered.", ioe);
250        if (!checkFileSystem()) {
251          LOG.warn("Bad Filesystem, exiting");
252          Runtime.getRuntime().halt(1);
253        }
254        try {
255          if (retrySplitting) {
256            Thread.sleep(conf.getInt("hbase.hlog.split.failure.retry.interval", 30 * 1000));
257          }
258        } catch (InterruptedException e) {
259          LOG.warn("Interrupted, aborting since cannot return w/o splitting");
260          Thread.currentThread().interrupt();
261          retrySplitting = false;
262          Runtime.getRuntime().halt(1);
263        }
264      }
265    } while (retrySplitting);
266
267    return serverNames;
268  }
269
270  public void splitLog(final ServerName serverName) throws IOException {
271    splitLog(Collections.<ServerName>singleton(serverName));
272  }
273
274  /**
275   * Specialized method to handle the splitting for meta WAL
276   * @param serverName logs belonging to this server will be split
277   */
278  public void splitMetaLog(final ServerName serverName) throws IOException {
279    splitMetaLog(Collections.<ServerName>singleton(serverName));
280  }
281
282  /**
283   * Specialized method to handle the splitting for meta WAL
284   * @param serverNames logs belonging to these servers will be split
285   */
286  public void splitMetaLog(final Set<ServerName> serverNames) throws IOException {
287    splitLog(serverNames, META_FILTER);
288  }
289
290  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UL_UNRELEASED_LOCK", justification=
291      "We only release this lock when we set it. Updates to code that uses it should verify use " +
292      "of the guard boolean.")
293  List<Path> getLogDirs(final Set<ServerName> serverNames) throws IOException {
294    List<Path> logDirs = new ArrayList<>();
295    boolean needReleaseLock = false;
296    if (!this.services.isInitialized()) {
297      // during master initialization, we could have multiple places splitting a same wal
298      // XXX: Does this still exist after we move to proc-v2?
299      this.splitLogLock.lock();
300      needReleaseLock = true;
301    }
302    try {
303      for (ServerName serverName : serverNames) {
304        Path logDir = new Path(this.rootDir,
305          AbstractFSWALProvider.getWALDirectoryName(serverName.toString()));
306        Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
307        // Rename the directory so a rogue RS doesn't create more WALs
308        if (fs.exists(logDir)) {
309          if (!this.fs.rename(logDir, splitDir)) {
310            throw new IOException("Failed fs.rename for log split: " + logDir);
311          }
312          logDir = splitDir;
313          LOG.debug("Renamed region directory: " + splitDir);
314        } else if (!fs.exists(splitDir)) {
315          LOG.info("Log dir for server " + serverName + " does not exist");
316          continue;
317        }
318        logDirs.add(splitDir);
319      }
320    } catch (IOException ioe) {
321      if (!checkFileSystem()) {
322        this.services.abort("Aborting due to filesystem unavailable", ioe);
323        throw ioe;
324      }
325    } finally {
326      if (needReleaseLock) {
327        this.splitLogLock.unlock();
328      }
329    }
330    return logDirs;
331  }
332
333  public void splitLog(final Set<ServerName> serverNames) throws IOException {
334    splitLog(serverNames, NON_META_FILTER);
335  }
336
337  /**
338   * This method is the base split method that splits WAL files matching a filter. Callers should
339   * pass the appropriate filter for meta and non-meta WALs.
340   * @param serverNames logs belonging to these servers will be split; this will rename the log
341   *                    directory out from under a soft-failed server
342   */
343  public void splitLog(final Set<ServerName> serverNames, PathFilter filter) throws IOException {
344    long splitTime = 0, splitLogSize = 0;
345    List<Path> logDirs = getLogDirs(serverNames);
346
347    splitLogManager.handleDeadWorkers(serverNames);
348    splitTime = EnvironmentEdgeManager.currentTime();
349    splitLogSize = splitLogManager.splitLogDistributed(serverNames, logDirs, filter);
350    splitTime = EnvironmentEdgeManager.currentTime() - splitTime;
351
352    if (this.metricsMasterFilesystem != null) {
353      if (filter == META_FILTER) {
354        this.metricsMasterFilesystem.addMetaWALSplit(splitTime, splitLogSize);
355      } else {
356        this.metricsMasterFilesystem.addSplit(splitTime, splitLogSize);
357      }
358    }
359  }
360
361  /**
362   * For meta region open and closed normally on a server, it may leave some meta
363   * WAL in the server's wal dir. Since meta region is no long on this server,
364   * The SCP won't split those meta wals, just leaving them there. So deleting
365   * the wal dir will fail since the dir is not empty. Actually We can safely achive those
366   * meta log and Archiving the meta log and delete the dir.
367   * @param serverName the server to archive meta log
368   */
369  public void archiveMetaLog(final ServerName serverName) {
370    try {
371      Path logDir = new Path(this.rootDir,
372          AbstractFSWALProvider.getWALDirectoryName(serverName.toString()));
373      Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
374      if (fs.exists(splitDir)) {
375        FileStatus[] logfiles = FSUtils.listStatus(fs, splitDir, META_FILTER);
376        if (logfiles != null) {
377          for (FileStatus status : logfiles) {
378            if (!status.isDir()) {
379              Path newPath = AbstractFSWAL.getWALArchivePath(this.oldLogDir,
380                  status.getPath());
381              if (!FSUtils.renameAndSetModifyTime(fs, status.getPath(), newPath)) {
382                LOG.warn("Unable to move  " + status.getPath() + " to " + newPath);
383              } else {
384                LOG.debug("Archived meta log " + status.getPath() + " to " + newPath);
385              }
386            }
387          }
388        }
389        if (!fs.delete(splitDir, false)) {
390          LOG.warn("Unable to delete log dir. Ignoring. " + splitDir);
391        }
392      }
393    } catch (IOException ie) {
394      LOG.warn("Failed archiving meta log for server " + serverName, ie);
395    }
396  }
397
398
399}