001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase;
019
import java.time.Instant;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HealthChecker.HealthCheckerExitStatus;
import org.apache.hadoop.util.StringUtils;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
026
027/**
028 * The Class HealthCheckChore for running health checker regularly.
029 */
030@InterfaceAudience.Private
031public class HealthCheckChore extends ScheduledChore {
032  private static final Logger LOG = LoggerFactory.getLogger(HealthCheckChore.class);
033  private HealthChecker healthChecker;
034  private Configuration config;
035  private int threshold;
036  private int numTimesUnhealthy = 0;
037  private long failureWindow;
038  private long startWindow;
039
040  public HealthCheckChore(int sleepTime, Stoppable stopper, Configuration conf) {
041    super("HealthChecker", stopper, sleepTime);
042    LOG.info("Health Check Chore runs every " + StringUtils.formatTime(sleepTime));
043    this.config = conf;
044    String healthCheckScript = this.config.get(HConstants.HEALTH_SCRIPT_LOC);
045    long scriptTimeout = this.config.getLong(HConstants.HEALTH_SCRIPT_TIMEOUT,
046      HConstants.DEFAULT_HEALTH_SCRIPT_TIMEOUT);
047    healthChecker = new HealthChecker();
048    healthChecker.init(healthCheckScript, scriptTimeout);
049    this.threshold = config.getInt(HConstants.HEALTH_FAILURE_THRESHOLD,
050      HConstants.DEFAULT_HEALTH_FAILURE_THRESHOLD);
051    this.failureWindow = (long)this.threshold * (long)sleepTime;
052  }
053
054  @Override
055  protected void chore() {
056    HealthReport report = healthChecker.checkHealth();
057    boolean isHealthy = (report.getStatus() == HealthCheckerExitStatus.SUCCESS);
058    if (!isHealthy) {
059      boolean needToStop = decideToStop();
060      if (needToStop) {
061        this.getStopper().stop("The  node reported unhealthy " + threshold
062            + " number of times consecutively.");
063      }
064      // Always log health report.
065      LOG.info("Health status at " + StringUtils.formatTime(System.currentTimeMillis()) + " : "
066          + report.getHealthReport());
067    }
068  }
069
070  private boolean decideToStop() {
071    boolean stop = false;
072    if (numTimesUnhealthy == 0) {
073      // First time we are seeing a failure. No need to stop, just
074      // record the time.
075      numTimesUnhealthy++;
076      startWindow = System.currentTimeMillis();
077    } else {
078      if ((System.currentTimeMillis() - startWindow) < failureWindow) {
079        numTimesUnhealthy++;
080        if (numTimesUnhealthy == threshold) {
081          stop = true;
082        } else {
083          stop = false;
084        }
085      } else {
086        // Outside of failure window, so we reset to 1.
087        numTimesUnhealthy = 1;
088        startWindow = System.currentTimeMillis();
089        stop = false;
090      }
091    }
092    return stop;
093  }
094
095}