View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase;
19  
20  import org.apache.commons.logging.Log;
21  import org.apache.commons.logging.LogFactory;
22  import org.apache.hadoop.conf.Configuration;
23  import org.apache.hadoop.hbase.HealthChecker.HealthCheckerExitStatus;
24  import org.apache.hadoop.util.StringUtils;
25  
26  /**
27   * The Class HealthCheckChore for running health checker regularly.
28   */
29  public class HealthCheckChore extends ScheduledChore {
30    private static final Log LOG = LogFactory.getLog(HealthCheckChore.class);
31    private HealthChecker healthChecker;
32    private Configuration config;
33    private int threshold;
34    private int numTimesUnhealthy = 0;
35    private long failureWindow;
36    private long startWindow;
37  
38    public HealthCheckChore(int sleepTime, Stoppable stopper, Configuration conf) {
39      super("HealthChecker", stopper, sleepTime);
40      LOG.info("Health Check Chore runs every " + StringUtils.formatTime(sleepTime));
41      this.config = conf;
42      String healthCheckScript = this.config.get(HConstants.HEALTH_SCRIPT_LOC);
43      long scriptTimeout = this.config.getLong(HConstants.HEALTH_SCRIPT_TIMEOUT,
44        HConstants.DEFAULT_HEALTH_SCRIPT_TIMEOUT);
45      healthChecker = new HealthChecker();
46      healthChecker.init(healthCheckScript, scriptTimeout);
47      this.threshold = config.getInt(HConstants.HEALTH_FAILURE_THRESHOLD,
48        HConstants.DEFAULT_HEALTH_FAILURE_THRESHOLD);
49      this.failureWindow = (long)this.threshold * (long)sleepTime;
50    }
51  
52    @Override
53    protected void chore() {
54      HealthReport report = healthChecker.checkHealth();
55      boolean isHealthy = (report.getStatus() == HealthCheckerExitStatus.SUCCESS);
56      if (!isHealthy) {
57        boolean needToStop = decideToStop();
58        if (needToStop) {
59          getStopper().stop(
60            "The  node reported unhealthy " + threshold + " number of times consecutively.");
61        }
62        // Always log health report.
63        LOG.info("Health status at " + StringUtils.formatTime(System.currentTimeMillis()) + " : "
64            + report.getHealthReport());
65      }
66    }
67  
68    private boolean decideToStop() {
69      boolean stop = false;
70      if (numTimesUnhealthy == 0) {
71        // First time we are seeing a failure. No need to stop, just
72        // record the time.
73        numTimesUnhealthy++;
74        startWindow = System.currentTimeMillis();
75      } else {
76        if ((System.currentTimeMillis() - startWindow) < failureWindow) {
77          numTimesUnhealthy++;
78          if (numTimesUnhealthy == threshold) {
79            stop = true;
80          } else {
81            stop = false;
82          }
83        } else {
84          // Outside of failure window, so we reset to 1.
85          numTimesUnhealthy = 1;
86          startWindow = System.currentTimeMillis();
87          stop = false;
88        }
89      }
90      return stop;
91    }
92  
93  }