001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.util;
019
020import java.lang.management.GarbageCollectorMXBean;
021import java.lang.management.ManagementFactory;
022import java.util.List;
023import java.util.Map;
024import java.util.Set;
025import java.util.concurrent.TimeUnit;
026
027import org.apache.yetus.audience.InterfaceAudience;
028import org.slf4j.Logger;
029import org.slf4j.LoggerFactory;
030import org.apache.hadoop.hbase.metrics.JvmPauseMonitorSource;
031import org.apache.hadoop.conf.Configuration;
032
033import org.apache.hbase.thirdparty.com.google.common.base.Joiner;
034import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
035import org.apache.hbase.thirdparty.com.google.common.base.Stopwatch;
036import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
037import org.apache.hbase.thirdparty.com.google.common.collect.Maps;
038import org.apache.hbase.thirdparty.com.google.common.collect.Sets;
039
040/**
041 * Class which sets up a simple thread which runs in a loop sleeping
042 * for a short interval of time. If the sleep takes significantly longer
043 * than its target time, it implies that the JVM or host machine has
044 * paused processing, which may cause other problems. If such a pause is
045 * detected, the thread logs a message.
046 * The original JvmPauseMonitor is:
047 * ${hadoop-common-project}/hadoop-common/src/main/java/org/apache/hadoop/util/
048 * JvmPauseMonitor.java
049 * r1503806 | cmccabe | 2013-07-17 01:48:24 +0800 (Wed, 17 Jul 2013) | 1 line
050 * HADOOP-9618.  thread which detects GC pauses(Todd Lipcon)
051 */
052@InterfaceAudience.Private
053public class JvmPauseMonitor {
054  private static final Logger LOG = LoggerFactory.getLogger(JvmPauseMonitor.class);
055
056  /** The target sleep time */
057  private static final long SLEEP_INTERVAL_MS = 500;
058  
059  /** log WARN if we detect a pause longer than this threshold */
060  private final long warnThresholdMs;
061  public static final String WARN_THRESHOLD_KEY =
062      "jvm.pause.warn-threshold.ms";
063  private static final long WARN_THRESHOLD_DEFAULT = 10000;
064  
065  /** log INFO if we detect a pause longer than this threshold */
066  private final long infoThresholdMs;
067  public static final String INFO_THRESHOLD_KEY =
068      "jvm.pause.info-threshold.ms";
069  private static final long INFO_THRESHOLD_DEFAULT = 1000;
070
071  private Thread monitorThread;
072  private volatile boolean shouldRun = true;
073  private JvmPauseMonitorSource metricsSource;
074
075  public JvmPauseMonitor(Configuration conf) {
076    this(conf, null);
077  }
078
079  public JvmPauseMonitor(Configuration conf, JvmPauseMonitorSource metricsSource) {
080    this.warnThresholdMs = conf.getLong(WARN_THRESHOLD_KEY, WARN_THRESHOLD_DEFAULT);
081    this.infoThresholdMs = conf.getLong(INFO_THRESHOLD_KEY, INFO_THRESHOLD_DEFAULT);
082    this.metricsSource = metricsSource;
083  }
084  
085  public void start() {
086    Preconditions.checkState(monitorThread == null, "Already started");
087    monitorThread = new Thread(new Monitor(), "JvmPauseMonitor");
088    monitorThread.setDaemon(true);
089    monitorThread.start();
090  }
091
092  public void stop() {
093    shouldRun = false;
094    monitorThread.interrupt();
095    try {
096      monitorThread.join();
097    } catch (InterruptedException e) {
098      Thread.currentThread().interrupt();
099    }
100  }
101  
102  private String formatMessage(long extraSleepTime, List<String> gcDiffs) {
103    String ret = "Detected pause in JVM or host machine (eg GC): " + "pause of approximately "
104        + extraSleepTime + "ms\n";
105    if (gcDiffs.isEmpty()) {
106      ret += "No GCs detected";
107    } else {
108      ret += Joiner.on("\n").join(gcDiffs);
109    }
110    return ret;
111  }
112  
113  private Map<String, GcTimes> getGcTimes() {
114    Map<String, GcTimes> map = Maps.newHashMap();
115    List<GarbageCollectorMXBean> gcBeans = ManagementFactory.getGarbageCollectorMXBeans();
116    for (GarbageCollectorMXBean gcBean : gcBeans) {
117      map.put(gcBean.getName(), new GcTimes(gcBean));
118    }
119    return map;
120  }
121
122  private static class GcTimes {
123    private GcTimes(GarbageCollectorMXBean gcBean) {
124      gcCount = gcBean.getCollectionCount();
125      gcTimeMillis = gcBean.getCollectionTime();
126    }
127
128    private GcTimes(long count, long time) {
129      this.gcCount = count;
130      this.gcTimeMillis = time;
131    }
132
133    private GcTimes subtract(GcTimes other) {
134      return new GcTimes(this.gcCount - other.gcCount, this.gcTimeMillis - other.gcTimeMillis);
135    }
136
137    @Override
138    public String toString() {
139      return "count=" + gcCount + " time=" + gcTimeMillis + "ms";
140    }
141
142    private long gcCount;
143    private long gcTimeMillis;
144  }
145
146  private class Monitor implements Runnable {
147    @Override
148    public void run() {
149      Stopwatch sw = Stopwatch.createUnstarted();
150      Map<String, GcTimes> gcTimesBeforeSleep = getGcTimes();
151      while (shouldRun) {
152        sw.reset().start();
153        try {
154          Thread.sleep(SLEEP_INTERVAL_MS);
155        } catch (InterruptedException ie) {
156          return;
157        }
158
159        long extraSleepTime = sw.elapsed(TimeUnit.MILLISECONDS) - SLEEP_INTERVAL_MS;
160        Map<String, GcTimes> gcTimesAfterSleep = getGcTimes();
161
162        if (extraSleepTime > infoThresholdMs) {
163          Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),
164            gcTimesBeforeSleep.keySet());
165          List<String> gcDiffs = Lists.newArrayList();
166          for (String name : gcBeanNames) {
167            GcTimes diff = gcTimesAfterSleep.get(name).subtract(gcTimesBeforeSleep.get(name));
168            if (diff.gcCount != 0) {
169              gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff.toString());
170            }
171          }
172
173          updateMetrics(extraSleepTime, !gcDiffs.isEmpty());
174
175          if (extraSleepTime > warnThresholdMs) {
176            LOG.warn(formatMessage(extraSleepTime, gcDiffs));
177          } else {
178            LOG.info(formatMessage(extraSleepTime, gcDiffs));
179          }
180        }
181        gcTimesBeforeSleep = gcTimesAfterSleep;
182      }
183    }
184  }
185
186  public void updateMetrics(long sleepTime, boolean gcDetected) {
187    if (metricsSource != null) {
188      if (sleepTime > warnThresholdMs) {
189        metricsSource.incWarnThresholdExceeded(1);
190      } else {
191        metricsSource.incInfoThresholdExceeded(1);
192      }
193      if (gcDetected) {
194        metricsSource.updatePauseTimeWithGc(sleepTime);
195      } else {
196        metricsSource.updatePauseTimeWithoutGc(sleepTime);
197      }
198    }
199  }
200
201  public JvmPauseMonitorSource getMetricsSource() {
202    return metricsSource;
203  }
204
205  public void setMetricsSource(JvmPauseMonitorSource metricsSource) {
206    this.metricsSource = metricsSource;
207  }
208
209  /**
210   * Simple 'main' to facilitate manual testing of the pause monitor.
211   * 
212   * This main function just leaks memory into a list. Running this class
213   * with a 1GB heap will very quickly go into "GC hell" and result in
214   * log messages about the GC pauses.
215   */
216  public static void main(String []args) throws Exception {
217    new JvmPauseMonitor(new Configuration()).start();
218    List<String> list = Lists.newArrayList();
219    int i = 0;
220    while (true) {
221      list.add(String.valueOf(i++));
222    }
223  }
224}