001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.util; 019 020import java.lang.management.GarbageCollectorMXBean; 021import java.lang.management.ManagementFactory; 022import java.util.List; 023import java.util.Map; 024import java.util.Set; 025import java.util.concurrent.TimeUnit; 026 027import org.apache.yetus.audience.InterfaceAudience; 028import org.slf4j.Logger; 029import org.slf4j.LoggerFactory; 030import org.apache.hadoop.hbase.metrics.JvmPauseMonitorSource; 031import org.apache.hadoop.conf.Configuration; 032 033import org.apache.hbase.thirdparty.com.google.common.base.Joiner; 034import org.apache.hbase.thirdparty.com.google.common.base.Preconditions; 035import org.apache.hbase.thirdparty.com.google.common.base.Stopwatch; 036import org.apache.hbase.thirdparty.com.google.common.collect.Lists; 037import org.apache.hbase.thirdparty.com.google.common.collect.Maps; 038import org.apache.hbase.thirdparty.com.google.common.collect.Sets; 039 040/** 041 * Class which sets up a simple thread which runs in a loop sleeping 042 * for a short interval of time. If the sleep takes significantly longer 043 * than its target time, it implies that the JVM or host machine has 044 * paused processing, which may cause other problems. If such a pause is 045 * detected, the thread logs a message. 046 * The original JvmPauseMonitor is: 047 * ${hadoop-common-project}/hadoop-common/src/main/java/org/apache/hadoop/util/ 048 * JvmPauseMonitor.java 049 * r1503806 | cmccabe | 2013-07-17 01:48:24 +0800 (Wed, 17 Jul 2013) | 1 line 050 * HADOOP-9618. thread which detects GC pauses(Todd Lipcon) 051 */ 052@InterfaceAudience.Private 053public class JvmPauseMonitor { 054 private static final Logger LOG = LoggerFactory.getLogger(JvmPauseMonitor.class); 055 056 /** The target sleep time */ 057 private static final long SLEEP_INTERVAL_MS = 500; 058 059 /** log WARN if we detect a pause longer than this threshold */ 060 private final long warnThresholdMs; 061 public static final String WARN_THRESHOLD_KEY = 062 "jvm.pause.warn-threshold.ms"; 063 private static final long WARN_THRESHOLD_DEFAULT = 10000; 064 065 /** log INFO if we detect a pause longer than this threshold */ 066 private final long infoThresholdMs; 067 public static final String INFO_THRESHOLD_KEY = 068 "jvm.pause.info-threshold.ms"; 069 private static final long INFO_THRESHOLD_DEFAULT = 1000; 070 071 private Thread monitorThread; 072 private volatile boolean shouldRun = true; 073 private JvmPauseMonitorSource metricsSource; 074 075 public JvmPauseMonitor(Configuration conf) { 076 this(conf, null); 077 } 078 079 public JvmPauseMonitor(Configuration conf, JvmPauseMonitorSource metricsSource) { 080 this.warnThresholdMs = conf.getLong(WARN_THRESHOLD_KEY, WARN_THRESHOLD_DEFAULT); 081 this.infoThresholdMs = conf.getLong(INFO_THRESHOLD_KEY, INFO_THRESHOLD_DEFAULT); 082 this.metricsSource = metricsSource; 083 } 084 085 public void start() { 086 Preconditions.checkState(monitorThread == null, "Already started"); 087 monitorThread = new Thread(new Monitor(), "JvmPauseMonitor"); 088 monitorThread.setDaemon(true); 089 monitorThread.start(); 090 } 091 092 public void stop() { 093 shouldRun = false; 094 monitorThread.interrupt(); 095 try { 096 monitorThread.join(); 097 } catch (InterruptedException e) { 098 Thread.currentThread().interrupt(); 099 } 100 } 101 102 private String formatMessage(long extraSleepTime, List<String> gcDiffs) { 103 String ret = "Detected pause in JVM or host machine (eg GC): " + "pause of approximately " 104 + extraSleepTime + "ms\n"; 105 if (gcDiffs.isEmpty()) { 106 ret += "No GCs detected"; 107 } else { 108 ret += Joiner.on("\n").join(gcDiffs); 109 } 110 return ret; 111 } 112 113 private Map<String, GcTimes> getGcTimes() { 114 Map<String, GcTimes> map = Maps.newHashMap(); 115 List<GarbageCollectorMXBean> gcBeans = ManagementFactory.getGarbageCollectorMXBeans(); 116 for (GarbageCollectorMXBean gcBean : gcBeans) { 117 map.put(gcBean.getName(), new GcTimes(gcBean)); 118 } 119 return map; 120 } 121 122 private static class GcTimes { 123 private GcTimes(GarbageCollectorMXBean gcBean) { 124 gcCount = gcBean.getCollectionCount(); 125 gcTimeMillis = gcBean.getCollectionTime(); 126 } 127 128 private GcTimes(long count, long time) { 129 this.gcCount = count; 130 this.gcTimeMillis = time; 131 } 132 133 private GcTimes subtract(GcTimes other) { 134 return new GcTimes(this.gcCount - other.gcCount, this.gcTimeMillis - other.gcTimeMillis); 135 } 136 137 @Override 138 public String toString() { 139 return "count=" + gcCount + " time=" + gcTimeMillis + "ms"; 140 } 141 142 private long gcCount; 143 private long gcTimeMillis; 144 } 145 146 private class Monitor implements Runnable { 147 @Override 148 public void run() { 149 Stopwatch sw = Stopwatch.createUnstarted(); 150 Map<String, GcTimes> gcTimesBeforeSleep = getGcTimes(); 151 while (shouldRun) { 152 sw.reset().start(); 153 try { 154 Thread.sleep(SLEEP_INTERVAL_MS); 155 } catch (InterruptedException ie) { 156 return; 157 } 158 159 long extraSleepTime = sw.elapsed(TimeUnit.MILLISECONDS) - SLEEP_INTERVAL_MS; 160 Map<String, GcTimes> gcTimesAfterSleep = getGcTimes(); 161 162 if (extraSleepTime > infoThresholdMs) { 163 Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(), 164 gcTimesBeforeSleep.keySet()); 165 List<String> gcDiffs = Lists.newArrayList(); 166 for (String name : gcBeanNames) { 167 GcTimes diff = gcTimesAfterSleep.get(name).subtract(gcTimesBeforeSleep.get(name)); 168 if (diff.gcCount != 0) { 169 gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff.toString()); 170 } 171 } 172 173 updateMetrics(extraSleepTime, !gcDiffs.isEmpty()); 174 175 if (extraSleepTime > warnThresholdMs) { 176 LOG.warn(formatMessage(extraSleepTime, gcDiffs)); 177 } else { 178 LOG.info(formatMessage(extraSleepTime, gcDiffs)); 179 } 180 } 181 gcTimesBeforeSleep = gcTimesAfterSleep; 182 } 183 } 184 } 185 186 public void updateMetrics(long sleepTime, boolean gcDetected) { 187 if (metricsSource != null) { 188 if (sleepTime > warnThresholdMs) { 189 metricsSource.incWarnThresholdExceeded(1); 190 } else { 191 metricsSource.incInfoThresholdExceeded(1); 192 } 193 if (gcDetected) { 194 metricsSource.updatePauseTimeWithGc(sleepTime); 195 } else { 196 metricsSource.updatePauseTimeWithoutGc(sleepTime); 197 } 198 } 199 } 200 201 public JvmPauseMonitorSource getMetricsSource() { 202 return metricsSource; 203 } 204 205 public void setMetricsSource(JvmPauseMonitorSource metricsSource) { 206 this.metricsSource = metricsSource; 207 } 208 209 /** 210 * Simple 'main' to facilitate manual testing of the pause monitor. 211 * 212 * This main function just leaks memory into a list. Running this class 213 * with a 1GB heap will very quickly go into "GC hell" and result in 214 * log messages about the GC pauses. 215 */ 216 public static void main(String []args) throws Exception { 217 new JvmPauseMonitor(new Configuration()).start(); 218 List<String> list = Lists.newArrayList(); 219 int i = 0; 220 while (true) { 221 list.add(String.valueOf(i++)); 222 } 223 } 224}