001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase;
019
020import org.apache.hadoop.conf.Configuration;
021import org.apache.hadoop.hbase.HealthChecker.HealthCheckerExitStatus;
022import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
023import org.apache.hadoop.util.StringUtils;
024import org.apache.yetus.audience.InterfaceAudience;
025import org.slf4j.Logger;
026import org.slf4j.LoggerFactory;
027
028/**
029 * The Class HealthCheckChore for running health checker regularly.
030 */
031@InterfaceAudience.Private
032public class HealthCheckChore extends ScheduledChore {
033  private static final Logger LOG = LoggerFactory.getLogger(HealthCheckChore.class);
034  private HealthChecker healthChecker;
035  private Configuration config;
036  private int threshold;
037  private int numTimesUnhealthy = 0;
038  private long failureWindow;
039  private long startWindow;
040
041  public HealthCheckChore(int sleepTime, Stoppable stopper, Configuration conf) {
042    super("HealthChecker", stopper, sleepTime);
043    LOG.info("Health Check Chore runs every " + StringUtils.formatTime(sleepTime));
044    this.config = conf;
045    String healthCheckScript = this.config.get(HConstants.HEALTH_SCRIPT_LOC);
046    long scriptTimeout = this.config.getLong(HConstants.HEALTH_SCRIPT_TIMEOUT,
047      HConstants.DEFAULT_HEALTH_SCRIPT_TIMEOUT);
048    healthChecker = new HealthChecker();
049    healthChecker.init(healthCheckScript, scriptTimeout);
050    this.threshold = config.getInt(HConstants.HEALTH_FAILURE_THRESHOLD,
051      HConstants.DEFAULT_HEALTH_FAILURE_THRESHOLD);
052    this.failureWindow = (long) this.threshold * (long) sleepTime;
053  }
054
055  @Override
056  protected void chore() {
057    HealthReport report = healthChecker.checkHealth();
058    boolean isHealthy = (report.getStatus() == HealthCheckerExitStatus.SUCCESS);
059    if (!isHealthy) {
060      boolean needToStop = decideToStop();
061      if (needToStop) {
062        this.getStopper()
063          .stop("The  node reported unhealthy " + threshold + " number of times consecutively.");
064      }
065      // Always log health report.
066      LOG.info("Health status at " + StringUtils.formatTime(EnvironmentEdgeManager.currentTime())
067        + " : " + report.getHealthReport());
068    }
069  }
070
071  private boolean decideToStop() {
072    boolean stop = false;
073    if (numTimesUnhealthy == 0) {
074      // First time we are seeing a failure. No need to stop, just
075      // record the time.
076      numTimesUnhealthy++;
077      startWindow = EnvironmentEdgeManager.currentTime();
078    } else {
079      if ((EnvironmentEdgeManager.currentTime() - startWindow) < failureWindow) {
080        numTimesUnhealthy++;
081        if (numTimesUnhealthy == threshold) {
082          stop = true;
083        } else {
084          stop = false;
085        }
086      } else {
087        // Outside of failure window, so we reset to 1.
088        numTimesUnhealthy = 1;
089        startWindow = EnvironmentEdgeManager.currentTime();
090        stop = false;
091      }
092    }
093    return stop;
094  }
095
096}