001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase; 019 020import org.apache.hadoop.conf.Configuration; 021import org.apache.hadoop.hbase.HealthChecker.HealthCheckerExitStatus; 022import org.apache.hadoop.util.StringUtils; 023import org.apache.yetus.audience.InterfaceAudience; 024import org.slf4j.Logger; 025import org.slf4j.LoggerFactory; 026 027/** 028 * The Class HealthCheckChore for running health checker regularly. 029 */ 030@InterfaceAudience.Private 031public class HealthCheckChore extends ScheduledChore { 032 private static final Logger LOG = LoggerFactory.getLogger(HealthCheckChore.class); 033 private HealthChecker healthChecker; 034 private Configuration config; 035 private int threshold; 036 private int numTimesUnhealthy = 0; 037 private long failureWindow; 038 private long startWindow; 039 040 public HealthCheckChore(int sleepTime, Stoppable stopper, Configuration conf) { 041 super("HealthChecker", stopper, sleepTime); 042 LOG.info("Health Check Chore runs every " + StringUtils.formatTime(sleepTime)); 043 this.config = conf; 044 String healthCheckScript = this.config.get(HConstants.HEALTH_SCRIPT_LOC); 045 long scriptTimeout = this.config.getLong(HConstants.HEALTH_SCRIPT_TIMEOUT, 046 HConstants.DEFAULT_HEALTH_SCRIPT_TIMEOUT); 047 healthChecker = new HealthChecker(); 048 healthChecker.init(healthCheckScript, scriptTimeout); 049 this.threshold = config.getInt(HConstants.HEALTH_FAILURE_THRESHOLD, 050 HConstants.DEFAULT_HEALTH_FAILURE_THRESHOLD); 051 this.failureWindow = (long)this.threshold * (long)sleepTime; 052 } 053 054 @Override 055 protected void chore() { 056 HealthReport report = healthChecker.checkHealth(); 057 boolean isHealthy = (report.getStatus() == HealthCheckerExitStatus.SUCCESS); 058 if (!isHealthy) { 059 boolean needToStop = decideToStop(); 060 if (needToStop) { 061 this.getStopper().stop("The node reported unhealthy " + threshold 062 + " number of times consecutively."); 063 } 064 // Always log health report. 065 LOG.info("Health status at " + StringUtils.formatTime(System.currentTimeMillis()) + " : " 066 + report.getHealthReport()); 067 } 068 } 069 070 private boolean decideToStop() { 071 boolean stop = false; 072 if (numTimesUnhealthy == 0) { 073 // First time we are seeing a failure. No need to stop, just 074 // record the time. 075 numTimesUnhealthy++; 076 startWindow = System.currentTimeMillis(); 077 } else { 078 if ((System.currentTimeMillis() - startWindow) < failureWindow) { 079 numTimesUnhealthy++; 080 if (numTimesUnhealthy == threshold) { 081 stop = true; 082 } else { 083 stop = false; 084 } 085 } else { 086 // Outside of failure window, so we reset to 1. 087 numTimesUnhealthy = 1; 088 startWindow = System.currentTimeMillis(); 089 stop = false; 090 } 091 } 092 return stop; 093 } 094 095}