001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase;
019
020import java.io.IOException;
021import java.util.ArrayList;
022
023import org.apache.hadoop.util.Shell.ExitCodeException;
024import org.apache.hadoop.util.Shell.ShellCommandExecutor;
025import org.slf4j.Logger;
026import org.slf4j.LoggerFactory;
027
028/**
029 * A utility for executing an external script that checks the health of
030 * the node. An example script can be found at
031 * <tt>src/main/sh/healthcheck/healthcheck.sh</tt> in the
032 * <tt>hbase-examples</tt> module.
033 */
034class HealthChecker {
035
036  private static final Logger LOG = LoggerFactory.getLogger(HealthChecker.class);
037  private ShellCommandExecutor shexec = null;
038  private String exceptionStackTrace;
039
040  /** Pattern used for searching in the output of the node health script */
041  static private final String ERROR_PATTERN = "ERROR";
042
043  private String healthCheckScript;
044  private long scriptTimeout;
045
046  enum HealthCheckerExitStatus {
047    SUCCESS,
048    TIMED_OUT,
049    FAILED_WITH_EXIT_CODE,
050    FAILED_WITH_EXCEPTION,
051    FAILED
052  }
053
054  /**
055   * Initialize.
056   *
057   * @param configuration
058   */
059  public void init(String location, long timeout) {
060    this.healthCheckScript = location;
061    this.scriptTimeout = timeout;
062    ArrayList<String> execScript = new ArrayList<>();
063    execScript.add(healthCheckScript);
064    this.shexec = new ShellCommandExecutor(execScript.toArray(new String[execScript.size()]), null,
065        null, scriptTimeout);
066    LOG.info("HealthChecker initialized with script at " + this.healthCheckScript +
067      ", timeout=" + timeout);
068  }
069
070  public HealthReport checkHealth() {
071    HealthCheckerExitStatus status = HealthCheckerExitStatus.SUCCESS;
072    try {
073      // Calling this execute leaves around running executor threads.
074      shexec.execute();
075    } catch (ExitCodeException e) {
076      // ignore the exit code of the script
077      LOG.warn("Caught exception : " + e + ",exit code:" + e.getExitCode());
078      status = HealthCheckerExitStatus.FAILED_WITH_EXIT_CODE;
079    } catch (IOException e) {
080      LOG.warn("Caught exception : " + e);
081      status = HealthCheckerExitStatus.FAILED_WITH_EXCEPTION;
082      exceptionStackTrace = org.apache.hadoop.util.StringUtils.stringifyException(e);
083    } finally {
084      if (shexec.isTimedOut()) {
085        status = HealthCheckerExitStatus.TIMED_OUT;
086      }
087      if (status == HealthCheckerExitStatus.SUCCESS) {
088        if (hasErrors(shexec.getOutput())) {
089          status = HealthCheckerExitStatus.FAILED;
090        }
091      }
092    }
093    return new HealthReport(status, getHealthReport(status));
094  }
095
096  private boolean hasErrors(String output) {
097    String[] splits = output.split("\n");
098    for (String split : splits) {
099      if (split.startsWith(ERROR_PATTERN)) {
100        return true;
101      }
102    }
103    return false;
104  }
105
106  private String getHealthReport(HealthCheckerExitStatus status){
107    String healthReport = null;
108    switch (status) {
109    case SUCCESS:
110      healthReport = "Server is healthy.";
111      break;
112    case TIMED_OUT:
113      healthReport = "Health script timed out";
114      break;
115    case FAILED_WITH_EXCEPTION:
116      healthReport = exceptionStackTrace;
117      break;
118    case FAILED_WITH_EXIT_CODE:
119      healthReport = "Health script failed with exit code.";
120      break;
121    case FAILED:
122      healthReport = shexec.getOutput();
123      break;
124    }
125    return healthReport;
126  }
127}