001/**
002 *
003 * Licensed to the Apache Software Foundation (ASF) under one
004 * or more contributor license agreements.  See the NOTICE file
005 * distributed with this work for additional information
006 * regarding copyright ownership.  The ASF licenses this file
007 * to you under the Apache License, Version 2.0 (the
008 * "License"); you may not use this file except in compliance
009 * with the License.  You may obtain a copy of the License at
010 *
011 *     http://www.apache.org/licenses/LICENSE-2.0
012 *
013 * Unless required by applicable law or agreed to in writing, software
014 * distributed under the License is distributed on an "AS IS" BASIS,
015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016 * See the License for the specific language governing permissions and
017 * limitations under the License.
018 */
019package org.apache.hadoop.hbase.master;
020
021import java.io.IOException;
022import java.util.ArrayList;
023import java.util.Collections;
024import java.util.HashSet;
025import java.util.List;
026import java.util.Set;
027import java.util.concurrent.locks.Lock;
028import java.util.concurrent.locks.ReentrantLock;
029import java.util.stream.Collectors;
030import java.util.stream.Stream;
031import org.apache.hadoop.conf.Configuration;
032import org.apache.hadoop.fs.FileStatus;
033import org.apache.hadoop.fs.FileSystem;
034import org.apache.hadoop.fs.Path;
035import org.apache.hadoop.fs.PathFilter;
036import org.apache.hadoop.hbase.HConstants;
037import org.apache.hadoop.hbase.ServerName;
038import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL;
039import org.apache.hadoop.hbase.util.CommonFSUtils;
040import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
041import org.apache.hadoop.hbase.util.FSUtils;
042import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
043import org.apache.hadoop.hbase.wal.WALSplitter;
044import org.apache.yetus.audience.InterfaceAudience;
045import org.slf4j.Logger;
046import org.slf4j.LoggerFactory;
047
048import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
049
050/**
051 * This class abstracts a bunch of operations the HMaster needs
052 * when splitting log files e.g. finding log files, dirs etc.
053 */
054@InterfaceAudience.Private
055public class MasterWalManager {
056  private static final Logger LOG = LoggerFactory.getLogger(MasterWalManager.class);
057
058  final static PathFilter META_FILTER = new PathFilter() {
059    @Override
060    public boolean accept(Path p) {
061      return AbstractFSWALProvider.isMetaFile(p);
062    }
063  };
064
065  @VisibleForTesting
066  public final static PathFilter NON_META_FILTER = new PathFilter() {
067    @Override
068    public boolean accept(Path p) {
069      return !AbstractFSWALProvider.isMetaFile(p);
070    }
071  };
072
073  // metrics for master
074  // TODO: Rename it, since those metrics are split-manager related
075  private final MetricsMasterFileSystem metricsMasterFilesystem = new MetricsMasterFileSystem();
076
077  // Keep around for convenience.
078  private final MasterServices services;
079  private final Configuration conf;
080  private final FileSystem fs;
081
082  // The Path to the old logs dir
083  private final Path oldLogDir;
084  private final Path rootDir;
085
086  // create the split log lock
087  private final Lock splitLogLock = new ReentrantLock();
088  private final SplitLogManager splitLogManager;
089
090  // Is the fileystem ok?
091  private volatile boolean fsOk = true;
092
093  public MasterWalManager(MasterServices services) throws IOException {
094    this(services.getConfiguration(), services.getMasterFileSystem().getWALFileSystem(), services);
095  }
096
097  public MasterWalManager(Configuration conf, FileSystem fs,  MasterServices services)
098      throws IOException {
099    this.fs = fs;
100    this.conf = conf;
101    this.rootDir = CommonFSUtils.getWALRootDir(conf);
102    this.services = services;
103    this.splitLogManager = new SplitLogManager(services, conf);
104
105    this.oldLogDir = new Path(rootDir, HConstants.HREGION_OLDLOGDIR_NAME);
106  }
107
108  public void stop() {
109    if (splitLogManager != null) {
110      splitLogManager.stop();
111    }
112  }
113
114  @VisibleForTesting
115  SplitLogManager getSplitLogManager() {
116    return this.splitLogManager;
117  }
118
119  /**
120   * Get the directory where old logs go
121   * @return the dir
122   */
123  Path getOldLogDir() {
124    return this.oldLogDir;
125  }
126
127  public FileSystem getFileSystem() {
128    return this.fs;
129  }
130
131  /**
132   * Checks to see if the file system is still accessible.
133   * If not, sets closed
134   * @return false if file system is not available
135   */
136  private boolean checkFileSystem() {
137    if (this.fsOk) {
138      try {
139        FSUtils.checkFileSystemAvailable(this.fs);
140        FSUtils.checkDfsSafeMode(this.conf);
141      } catch (IOException e) {
142        services.abort("Shutting down HBase cluster: file system not available", e);
143        this.fsOk = false;
144      }
145    }
146    return this.fsOk;
147  }
148
149  /**
150   * Get Servernames which are currently splitting; paths have a '-splitting' suffix.
151   * @return ServerName
152   * @throws IOException IOException
153   */
154  public Set<ServerName> getSplittingServersFromWALDir() throws  IOException {
155    return getServerNamesFromWALDirPath(
156      p -> p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT));
157  }
158
159  /**
160   * Get Servernames that COULD BE 'alive'; excludes those that have a '-splitting' suffix as these
161   * are already being split -- they cannot be 'alive'.
162   * @return ServerName
163   * @throws IOException IOException
164   */
165  public Set<ServerName> getLiveServersFromWALDir() throws IOException {
166    return getServerNamesFromWALDirPath(
167      p -> !p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT));
168  }
169
170  /**
171   * @return listing of ServerNames found by parsing WAL directory paths in FS.
172   */
173  public Set<ServerName> getServerNamesFromWALDirPath(final PathFilter filter) throws IOException {
174    FileStatus[] walDirForServerNames = getWALDirPaths(filter);
175    return Stream.of(walDirForServerNames).map(s -> {
176      ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(s.getPath());
177      if (serverName == null) {
178        LOG.warn("Log folder {} doesn't look like its name includes a " +
179          "region server name; leaving in place. If you see later errors about missing " +
180          "write ahead logs they may be saved in this location.", s.getPath());
181        return null;
182      }
183      return serverName;
184    }).filter(s -> s != null).collect(Collectors.toSet());
185  }
186
187  /**
188   * @return List of all RegionServer WAL dirs; i.e. this.rootDir/HConstants.HREGION_LOGDIR_NAME.
189   */
190  public FileStatus[] getWALDirPaths(final PathFilter filter) throws IOException {
191    Path walDirPath = new Path(CommonFSUtils.getWALRootDir(conf), HConstants.HREGION_LOGDIR_NAME);
192    FileStatus[] walDirForServerNames = CommonFSUtils.listStatus(fs, walDirPath, filter);
193    return walDirForServerNames == null? new FileStatus[0]: walDirForServerNames;
194  }
195
196  /**
197   * Inspect the log directory to find dead servers which need recovery work
198   * @return A set of ServerNames which aren't running but still have WAL files left in file system
199   * @deprecated With proc-v2, we can record the crash server with procedure store, so do not need
200   *             to scan the wal directory to find out the splitting wal directory any more. Leave
201   *             it here only because {@code RecoverMetaProcedure}(which is also deprecated) uses
202   *             it.
203   */
204  @Deprecated
205  public Set<ServerName> getFailedServersFromLogFolders() throws IOException {
206    boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
207        WALSplitter.SPLIT_SKIP_ERRORS_DEFAULT);
208
209    Set<ServerName> serverNames = new HashSet<>();
210    Path logsDirPath = new Path(this.rootDir, HConstants.HREGION_LOGDIR_NAME);
211
212    do {
213      if (services.isStopped()) {
214        LOG.warn("Master stopped while trying to get failed servers.");
215        break;
216      }
217      try {
218        if (!this.fs.exists(logsDirPath)) return serverNames;
219        FileStatus[] logFolders = CommonFSUtils.listStatus(this.fs, logsDirPath, null);
220        // Get online servers after getting log folders to avoid log folder deletion of newly
221        // checked in region servers . see HBASE-5916
222        Set<ServerName> onlineServers = services.getServerManager().getOnlineServers().keySet();
223
224        if (logFolders == null || logFolders.length == 0) {
225          LOG.debug("No log files to split, proceeding...");
226          return serverNames;
227        }
228        for (FileStatus status : logFolders) {
229          FileStatus[] curLogFiles = CommonFSUtils.listStatus(this.fs, status.getPath(), null);
230          if (curLogFiles == null || curLogFiles.length == 0) {
231            // Empty log folder. No recovery needed
232            continue;
233          }
234          final ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(
235              status.getPath());
236          if (null == serverName) {
237            LOG.warn("Log folder " + status.getPath() + " doesn't look like its name includes a " +
238                "region server name; leaving in place. If you see later errors about missing " +
239                "write ahead logs they may be saved in this location.");
240          } else if (!onlineServers.contains(serverName)) {
241            LOG.info("Log folder " + status.getPath() + " doesn't belong "
242                + "to a known region server, splitting");
243            serverNames.add(serverName);
244          } else {
245            LOG.info("Log folder " + status.getPath() + " belongs to an existing region server");
246          }
247        }
248        retrySplitting = false;
249      } catch (IOException ioe) {
250        LOG.warn("Failed getting failed servers to be recovered.", ioe);
251        if (!checkFileSystem()) {
252          LOG.warn("Bad Filesystem, exiting");
253          Runtime.getRuntime().halt(1);
254        }
255        try {
256          if (retrySplitting) {
257            Thread.sleep(conf.getInt("hbase.hlog.split.failure.retry.interval", 30 * 1000));
258          }
259        } catch (InterruptedException e) {
260          LOG.warn("Interrupted, aborting since cannot return w/o splitting");
261          Thread.currentThread().interrupt();
262          retrySplitting = false;
263          Runtime.getRuntime().halt(1);
264        }
265      }
266    } while (retrySplitting);
267
268    return serverNames;
269  }
270
271  public void splitLog(final ServerName serverName) throws IOException {
272    splitLog(Collections.<ServerName>singleton(serverName));
273  }
274
275  /**
276   * Specialized method to handle the splitting for meta WAL
277   * @param serverName logs belonging to this server will be split
278   */
279  public void splitMetaLog(final ServerName serverName) throws IOException {
280    splitMetaLog(Collections.<ServerName>singleton(serverName));
281  }
282
283  /**
284   * Specialized method to handle the splitting for meta WAL
285   * @param serverNames logs belonging to these servers will be split
286   */
287  public void splitMetaLog(final Set<ServerName> serverNames) throws IOException {
288    splitLog(serverNames, META_FILTER);
289  }
290
291  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UL_UNRELEASED_LOCK", justification=
292      "We only release this lock when we set it. Updates to code that uses it should verify use " +
293      "of the guard boolean.")
294  List<Path> getLogDirs(final Set<ServerName> serverNames) throws IOException {
295    List<Path> logDirs = new ArrayList<>();
296    boolean needReleaseLock = false;
297    if (!this.services.isInitialized()) {
298      // during master initialization, we could have multiple places splitting a same wal
299      // XXX: Does this still exist after we move to proc-v2?
300      this.splitLogLock.lock();
301      needReleaseLock = true;
302    }
303    try {
304      for (ServerName serverName : serverNames) {
305        Path logDir = new Path(this.rootDir,
306          AbstractFSWALProvider.getWALDirectoryName(serverName.toString()));
307        Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
308        // Rename the directory so a rogue RS doesn't create more WALs
309        if (fs.exists(logDir)) {
310          if (!this.fs.rename(logDir, splitDir)) {
311            throw new IOException("Failed fs.rename for log split: " + logDir);
312          }
313          logDir = splitDir;
314          LOG.debug("Renamed region directory: " + splitDir);
315        } else if (!fs.exists(splitDir)) {
316          LOG.info("Log dir for server " + serverName + " does not exist");
317          continue;
318        }
319        logDirs.add(splitDir);
320      }
321    } catch (IOException ioe) {
322      if (!checkFileSystem()) {
323        this.services.abort("Aborting due to filesystem unavailable", ioe);
324        throw ioe;
325      }
326    } finally {
327      if (needReleaseLock) {
328        this.splitLogLock.unlock();
329      }
330    }
331    return logDirs;
332  }
333
334  public void splitLog(final Set<ServerName> serverNames) throws IOException {
335    splitLog(serverNames, NON_META_FILTER);
336  }
337
338  /**
339   * This method is the base split method that splits WAL files matching a filter. Callers should
340   * pass the appropriate filter for meta and non-meta WALs.
341   * @param serverNames logs belonging to these servers will be split; this will rename the log
342   *                    directory out from under a soft-failed server
343   */
344  public void splitLog(final Set<ServerName> serverNames, PathFilter filter) throws IOException {
345    long splitTime = 0, splitLogSize = 0;
346    List<Path> logDirs = getLogDirs(serverNames);
347
348    splitLogManager.handleDeadWorkers(serverNames);
349    splitTime = EnvironmentEdgeManager.currentTime();
350    splitLogSize = splitLogManager.splitLogDistributed(serverNames, logDirs, filter);
351    splitTime = EnvironmentEdgeManager.currentTime() - splitTime;
352
353    if (this.metricsMasterFilesystem != null) {
354      if (filter == META_FILTER) {
355        this.metricsMasterFilesystem.addMetaWALSplit(splitTime, splitLogSize);
356      } else {
357        this.metricsMasterFilesystem.addSplit(splitTime, splitLogSize);
358      }
359    }
360  }
361
362  /**
363   * For meta region open and closed normally on a server, it may leave some meta
364   * WAL in the server's wal dir. Since meta region is no long on this server,
365   * The SCP won't split those meta wals, just leaving them there. So deleting
366   * the wal dir will fail since the dir is not empty. Actually We can safely achive those
367   * meta log and Archiving the meta log and delete the dir.
368   * @param serverName the server to archive meta log
369   */
370  public void archiveMetaLog(final ServerName serverName) {
371    try {
372      Path logDir = new Path(this.rootDir,
373          AbstractFSWALProvider.getWALDirectoryName(serverName.toString()));
374      Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
375      if (fs.exists(splitDir)) {
376        FileStatus[] logfiles = CommonFSUtils.listStatus(fs, splitDir, META_FILTER);
377        if (logfiles != null) {
378          for (FileStatus status : logfiles) {
379            if (!status.isDir()) {
380              Path newPath = AbstractFSWAL.getWALArchivePath(this.oldLogDir,
381                  status.getPath());
382              if (!CommonFSUtils.renameAndSetModifyTime(fs, status.getPath(), newPath)) {
383                LOG.warn("Unable to move  " + status.getPath() + " to " + newPath);
384              } else {
385                LOG.debug("Archived meta log " + status.getPath() + " to " + newPath);
386              }
387            }
388          }
389        }
390        if (!fs.delete(splitDir, false)) {
391          LOG.warn("Unable to delete log dir. Ignoring. " + splitDir);
392        }
393      }
394    } catch (IOException ie) {
395      LOG.warn("Failed archiving meta log for server " + serverName, ie);
396    }
397  }
398
399
400}