001/**
002 *
003 * Licensed to the Apache Software Foundation (ASF) under one
004 * or more contributor license agreements.  See the NOTICE file
005 * distributed with this work for additional information
006 * regarding copyright ownership.  The ASF licenses this file
007 * to you under the Apache License, Version 2.0 (the
008 * "License"); you may not use this file except in compliance
009 * with the License.  You may obtain a copy of the License at
010 *
011 *     http://www.apache.org/licenses/LICENSE-2.0
012 *
013 * Unless required by applicable law or agreed to in writing, software
014 * distributed under the License is distributed on an "AS IS" BASIS,
015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016 * See the License for the specific language governing permissions and
017 * limitations under the License.
018 */
019package org.apache.hadoop.hbase.regionserver;
020
021import static org.apache.hadoop.hbase.regionserver.HeapMemoryManager.BLOCK_CACHE_SIZE_MAX_RANGE_KEY;
022import static org.apache.hadoop.hbase.regionserver.HeapMemoryManager.BLOCK_CACHE_SIZE_MIN_RANGE_KEY;
023import static org.apache.hadoop.hbase.HConstants.HFILE_BLOCK_CACHE_SIZE_KEY;
024import static org.apache.hadoop.hbase.regionserver.HeapMemoryManager.MEMSTORE_SIZE_MAX_RANGE_KEY;
025import static org.apache.hadoop.hbase.regionserver.HeapMemoryManager.MEMSTORE_SIZE_MIN_RANGE_KEY;
026
027import org.apache.yetus.audience.InterfaceAudience;
028import org.slf4j.Logger;
029import org.slf4j.LoggerFactory;
030import org.apache.hadoop.conf.Configuration;
031import org.apache.hadoop.hbase.HConstants;
032import org.apache.hadoop.hbase.io.util.MemorySizeUtil;
033import org.apache.hadoop.hbase.regionserver.HeapMemoryManager.TunerContext;
034import org.apache.hadoop.hbase.regionserver.HeapMemoryManager.TunerResult;
035import org.apache.hadoop.hbase.util.RollingStatCalculator;
036
037/**
038 * The default implementation for the HeapMemoryTuner. This will do statistical checks on
039 * number of evictions, cache misses and flushes to decide whether there should be changes
040 * in the heap size of memstore/block cache. During each tuner operation tuner takes a step
041 * which can either be INCREASE_BLOCK_CACHE_SIZE (increase block cache size),
042 * INCREASE_MEMSTORE_SIZE (increase memstore size) and by default it is NEUTRAL (no change).
043 * We say block cache is sufficient when there is no block cache eviction at all or major amount of
044 * memory allocated to block cache is empty, similarly we say memory allocated for memstore is
045 * sufficient when there is no memstore flushes because of heap pressure or major amount of
046 * memory allocated to memstore is empty. If both are sufficient we do nothing, if exactly one of
047 * them is found to be sufficient we decrease its size by <i>step</i> and increase the other by
048 * same amount. If none of them is sufficient we do statistical analysis on number of cache misses
049 * and flushes to determine tuner direction. Based on these statistics we decide the tuner
050 * direction. If we are not confident about which step direction to take we do nothing and wait for
051 * next iteration. On expectation we will be tuning for at least 10% tuner calls. The number of
052 * past periods to consider for statistics calculation can be specified in config by
053 * <i>hbase.regionserver.heapmemory.autotuner.lookup.periods</i>. Also these many initial calls to
054 * tuner will be ignored (cache is warming up and we leave the system to reach steady state).
055 * After the tuner takes a step, in next call we insure that last call was indeed helpful and did
056 * not do us any harm. If not then we revert the previous step. The step size is dynamic and it
057 * changes based on current and past few tuning directions and their step sizes. We maintain a
058 * parameter <i>decayingAvgTunerStepSize</i> which is sum of past tuner steps with
059 * sign(positive for increase in memstore and negative for increase in block cache). But rather
060 * than simple sum it is calculated by giving more priority to the recent tuning steps.
061 * When last few tuner steps were NETURAL then we assume we are restarting the tuning process and
062 * step size is updated to maximum allowed size which can be specified  in config by
063 * <i>hbase.regionserver.heapmemory.autotuner.step.max</i>. If in a particular tuning operation
064 * the step direction is opposite to what indicated by <i>decayingTunerStepSizeSum</i>
065 * we decrease the step size by half. Step size does not change in other tuning operations.
066 * When step size gets below a certain threshold then the following tuner operations are
067 * considered to be neutral. The minimum step size can be specified  in config by
068 * <i>hbase.regionserver.heapmemory.autotuner.step.min</i>.
069 */
070@InterfaceAudience.Private
071class DefaultHeapMemoryTuner implements HeapMemoryTuner {
072  public static final String MAX_STEP_KEY = "hbase.regionserver.heapmemory.autotuner.step.max";
073  public static final String MIN_STEP_KEY = "hbase.regionserver.heapmemory.autotuner.step.min";
074  public static final String SUFFICIENT_MEMORY_LEVEL_KEY =
075      "hbase.regionserver.heapmemory.autotuner.sufficient.memory.level";
076  public static final String LOOKUP_PERIODS_KEY =
077      "hbase.regionserver.heapmemory.autotuner.lookup.periods";
078  public static final String NUM_PERIODS_TO_IGNORE =
079      "hbase.regionserver.heapmemory.autotuner.ignored.periods";
080  // Maximum step size that the tuner can take
081  public static final float DEFAULT_MAX_STEP_VALUE = 0.04f; // 4%
082  // Minimum step size that the tuner can take
083  public static final float DEFAULT_MIN_STEP_VALUE = 0.00125f; // 0.125%
084  // If current block cache size or memstore size in use is below this level relative to memory
085  // provided to it then corresponding component will be considered to have sufficient memory
086  public static final float DEFAULT_SUFFICIENT_MEMORY_LEVEL_VALUE = 0.5f; // 50%
087  // Number of tuner periods that will be considered while calculating mean and deviation
088  // If set to zero, all stats will be calculated from the start
089  public static final int DEFAULT_LOOKUP_PERIODS = 60;
090  public static final int DEFAULT_NUM_PERIODS_IGNORED = 60;
091  private static final TunerResult NO_OP_TUNER_RESULT = new TunerResult(false);
092  // If deviation of tuner step size gets below this value then it means past few periods were
093  // NEUTRAL(given that last tuner period was also NEUTRAL).
094  private static final double TUNER_STEP_EPS = 1e-6;
095
096  private Logger LOG = LoggerFactory.getLogger(DefaultHeapMemoryTuner.class);
097  private TunerResult TUNER_RESULT = new TunerResult(true);
098  private Configuration conf;
099  private float sufficientMemoryLevel = DEFAULT_SUFFICIENT_MEMORY_LEVEL_VALUE;
100  private float maximumStepSize = DEFAULT_MAX_STEP_VALUE;
101  private float minimumStepSize = DEFAULT_MIN_STEP_VALUE;
102  private int tunerLookupPeriods = DEFAULT_LOOKUP_PERIODS;
103  private int numPeriodsToIgnore = DEFAULT_NUM_PERIODS_IGNORED;
104  // Counter to ignore few initial periods while cache is still warming up
105  // Memory tuner will do no operation for the first "tunerLookupPeriods"
106  private int ignoreInitialPeriods = 0;
107
108  private float globalMemStorePercentMinRange;
109  private float globalMemStorePercentMaxRange;
110  private float blockCachePercentMinRange;
111  private float blockCachePercentMaxRange;
112
113  private float globalMemStoreLimitLowMarkPercent;
114
115  // Store statistics about the corresponding parameters for memory tuning
116  private RollingStatCalculator rollingStatsForCacheMisses;
117  private RollingStatCalculator rollingStatsForFlushes;
118  private RollingStatCalculator rollingStatsForEvictions;
119  private RollingStatCalculator rollingStatsForTunerSteps;
120  // Set step size to max value for tuning, this step size will adjust dynamically while tuning
121  private float step = DEFAULT_MAX_STEP_VALUE;
122  private StepDirection prevTuneDirection = StepDirection.NEUTRAL;
123  //positive means memstore's size was increased
124  //It is not just arithmetic sum of past tuner periods. More priority is given to recent
125  //tuning steps.
126  private double decayingTunerStepSizeSum = 0;
127
128  @Override
129  public TunerResult tune(TunerContext context) {
130    float curMemstoreSize = context.getCurMemStoreSize();
131    float curBlockCacheSize = context.getCurBlockCacheSize();
132    addToRollingStats(context);
133
134    if (ignoreInitialPeriods < numPeriodsToIgnore) {
135      // Ignoring the first few tuner periods
136      ignoreInitialPeriods++;
137      rollingStatsForTunerSteps.insertDataValue(0);
138      return NO_OP_TUNER_RESULT;
139    }
140    StepDirection newTuneDirection = getTuneDirection(context);
141
142    long blockedFlushCount = context.getBlockedFlushCount();
143    long unblockedFlushCount = context.getUnblockedFlushCount();
144    long totalOnheapFlushCount = blockedFlushCount + unblockedFlushCount;
145    boolean offheapMemstore = context.isOffheapMemStore();
146    float newMemstoreSize;
147    float newBlockCacheSize;
148
149    // Adjusting step size for tuning to get to steady state or restart from steady state.
150    // Even if the step size was 4% and 32 GB memory size, we will be shifting 1 GB back and forth
151    // per tuner operation and it can affect the performance of cluster so we keep on decreasing
152    // step size until everything settles.
153    if (prevTuneDirection == StepDirection.NEUTRAL
154        && newTuneDirection != StepDirection.NEUTRAL
155        && rollingStatsForTunerSteps.getDeviation() < TUNER_STEP_EPS) {
156      // Restarting the tuning from steady state and setting step size to maximum.
157      // The deviation cannot be that low if last period was neutral and some recent periods were
158      // not neutral.
159      step = maximumStepSize;
160    } else if ((newTuneDirection == StepDirection.INCREASE_MEMSTORE_SIZE
161        && decayingTunerStepSizeSum < 0) ||
162        (newTuneDirection == StepDirection.INCREASE_BLOCK_CACHE_SIZE
163        && decayingTunerStepSizeSum > 0)) {
164      // Current step is opposite of past tuner actions so decrease the step size to reach steady
165      // state.
166      if (!offheapMemstore && step != minimumStepSize) {
167        // we leave the step to be at minimumStepSize for offheap memstore
168        step = step / 2.00f;
169      }
170    }
171    if (step < minimumStepSize) {
172      // If step size is too small then we do nothing.
173      LOG.debug("Tuner step size is too low; we will not perform any tuning this time.");
174      step = 0.0f;
175      newTuneDirection = StepDirection.NEUTRAL;
176    }
177    // There are no flushes due to onheap pressure and
178    // we have an offheap memstore and we are in need of more block_cache size.
179    if (totalOnheapFlushCount == 0 && offheapMemstore
180        && newTuneDirection == StepDirection.INCREASE_BLOCK_CACHE_SIZE) {
181      // we are sure that there are flushes only due to offheap pressure
182      // So don't do the memstore decrease equal to the step size. Instead do minimum stepSize
183      // decrease. But even if we have some flushes due to heap then it is better we tune
184      // the existing way.
185      step = minimumStepSize;
186    }
187    // Increase / decrease the memstore / block cache sizes depending on new tuner step.
188    // We don't want to exert immediate pressure on memstore. So, we decrease its size gracefully;
189    // we set a minimum bar in the middle of the total memstore size and the lower limit.
190    float minMemstoreSize = ((globalMemStoreLimitLowMarkPercent + 1) * curMemstoreSize) / 2.00f;
191
192    switch (newTuneDirection) {
193    case INCREASE_BLOCK_CACHE_SIZE:
194        if (curMemstoreSize - step < minMemstoreSize) {
195          step = curMemstoreSize - minMemstoreSize;
196        }
197        newMemstoreSize = curMemstoreSize - step;
198        newBlockCacheSize = curBlockCacheSize + step;
199        rollingStatsForTunerSteps.insertDataValue(-(int)(step*100000));
200        decayingTunerStepSizeSum = (decayingTunerStepSizeSum - step)/2.00f;
201        break;
202    case INCREASE_MEMSTORE_SIZE:
203        newBlockCacheSize = curBlockCacheSize - step;
204        newMemstoreSize = curMemstoreSize + step;
205        rollingStatsForTunerSteps.insertDataValue((int)(step*100000));
206        decayingTunerStepSizeSum = (decayingTunerStepSizeSum + step)/2.00f;
207        break;
208    default:
209        prevTuneDirection = StepDirection.NEUTRAL;
210        rollingStatsForTunerSteps.insertDataValue(0);
211        decayingTunerStepSizeSum = (decayingTunerStepSizeSum)/2.00f;
212        return NO_OP_TUNER_RESULT;
213    }
214    // Check we are within max/min bounds.
215    if (newMemstoreSize > globalMemStorePercentMaxRange) {
216      newMemstoreSize = globalMemStorePercentMaxRange;
217    } else if (newMemstoreSize < globalMemStorePercentMinRange) {
218      newMemstoreSize = globalMemStorePercentMinRange;
219    }
220    if (newBlockCacheSize > blockCachePercentMaxRange) {
221      newBlockCacheSize = blockCachePercentMaxRange;
222    } else if (newBlockCacheSize < blockCachePercentMinRange) {
223      newBlockCacheSize = blockCachePercentMinRange;
224    }
225    TUNER_RESULT.setBlockCacheSize(newBlockCacheSize);
226    TUNER_RESULT.setMemStoreSize(newMemstoreSize);
227    prevTuneDirection = newTuneDirection;
228    return TUNER_RESULT;
229  }
230
231  /**
232   * Determine best direction of tuning base on given context.
233   * @param context The tuner context.
234   * @return tuning direction.
235   */
236  private StepDirection getTuneDirection(TunerContext context) {
237    StepDirection newTuneDirection = StepDirection.NEUTRAL;
238    long blockedFlushCount = context.getBlockedFlushCount();
239    long unblockedFlushCount = context.getUnblockedFlushCount();
240    long evictCount = context.getEvictCount();
241    long cacheMissCount = context.getCacheMissCount();
242    long totalFlushCount = blockedFlushCount + unblockedFlushCount;
243    float curMemstoreSize = context.getCurMemStoreSize();
244    float curBlockCacheSize = context.getCurBlockCacheSize();
245    StringBuilder tunerLog = new StringBuilder();
246    // We can consider memstore or block cache to be sufficient if
247    // we are using only a minor fraction of what have been already provided to it.
248    boolean earlyMemstoreSufficientCheck = totalFlushCount == 0
249        || context.getCurMemStoreUsed() < curMemstoreSize * sufficientMemoryLevel;
250    boolean earlyBlockCacheSufficientCheck = evictCount == 0 ||
251        context.getCurBlockCacheUsed() < curBlockCacheSize * sufficientMemoryLevel;
252    if (earlyMemstoreSufficientCheck && earlyBlockCacheSufficientCheck) {
253      // Both memstore and block cache memory seems to be sufficient. No operation required.
254      newTuneDirection = StepDirection.NEUTRAL;
255    } else if (earlyMemstoreSufficientCheck) {
256      // Increase the block cache size and corresponding decrease in memstore size.
257      newTuneDirection = StepDirection.INCREASE_BLOCK_CACHE_SIZE;
258    } else if (earlyBlockCacheSufficientCheck) {
259      // Increase the memstore size and corresponding decrease in block cache size.
260      newTuneDirection = StepDirection.INCREASE_MEMSTORE_SIZE;
261    } else {
262      // Early checks for sufficient memory failed. Tuning memory based on past statistics.
263      // Boolean indicator to show if we need to revert previous step or not.
264      boolean isReverting = false;
265      switch (prevTuneDirection) {
266      // Here we are using number of evictions rather than cache misses because it is more
267      // strong indicator for deficient cache size. Improving caching is what we
268      // would like to optimize for in steady state.
269      case INCREASE_BLOCK_CACHE_SIZE:
270        if ((double)evictCount > rollingStatsForEvictions.getMean() ||
271            (double)totalFlushCount > rollingStatsForFlushes.getMean() +
272                rollingStatsForFlushes.getDeviation()/2.00) {
273          // Reverting previous step as it was not useful.
274          // Tuning failed to decrease evictions or tuning resulted in large number of flushes.
275          newTuneDirection = StepDirection.INCREASE_MEMSTORE_SIZE;
276          tunerLog.append("We will revert previous tuning");
277          if ((double)evictCount > rollingStatsForEvictions.getMean()) {
278            tunerLog.append(" because we could not decrease evictions sufficiently.");
279          } else {
280            tunerLog.append(" because the number of flushes rose significantly.");
281          }
282          isReverting = true;
283        }
284        break;
285      case INCREASE_MEMSTORE_SIZE:
286        if ((double)totalFlushCount > rollingStatsForFlushes.getMean() ||
287            (double)evictCount > rollingStatsForEvictions.getMean() +
288                rollingStatsForEvictions.getDeviation()/2.00) {
289          // Reverting previous step as it was not useful.
290          // Tuning failed to decrease flushes or tuning resulted in large number of evictions.
291          newTuneDirection = StepDirection.INCREASE_BLOCK_CACHE_SIZE;
292          tunerLog.append("We will revert previous tuning");
293          if ((double)totalFlushCount > rollingStatsForFlushes.getMean()) {
294            tunerLog.append(" because we could not decrease flushes sufficiently.");
295          } else {
296            tunerLog.append(" because number of evictions rose significantly.");
297          }
298          isReverting = true;
299        }
300        break;
301      default:
302        // Last step was neutral, revert doesn't not apply here.
303        break;
304      }
305      // If we are not reverting. We try to tune memory sizes by looking at cache misses / flushes.
306      if (!isReverting){
307        // mean +- deviation*0.8 is considered to be normal
308        // below it its consider low and above it is considered high.
309        // We can safely assume that the number cache misses, flushes are normally distributed over
310        // past periods and hence on all the above mentioned classes (normal, high and low)
311        // are likely to occur with probability 56%, 22%, 22% respectively. Hence there is at
312        // least ~10% probability that we will not fall in NEUTRAL step.
313        // This optimization solution is feedback based and we revert when we
314        // dont find our steps helpful. Hence we want to do tuning only when we have clear
315        // indications because too many unnecessary tuning may affect the performance of cluster.
316        if ((double)cacheMissCount < rollingStatsForCacheMisses.getMean() -
317            rollingStatsForCacheMisses.getDeviation()*0.80 &&
318            (double)totalFlushCount < rollingStatsForFlushes.getMean() -
319                rollingStatsForFlushes.getDeviation()*0.80) {
320          // Everything is fine no tuning required
321          newTuneDirection = StepDirection.NEUTRAL;
322        } else if ((double)cacheMissCount > rollingStatsForCacheMisses.getMean() +
323            rollingStatsForCacheMisses.getDeviation()*0.80 &&
324            (double)totalFlushCount < rollingStatsForFlushes.getMean() -
325                rollingStatsForFlushes.getDeviation()*0.80) {
326          // more misses , increasing cache size
327          newTuneDirection = StepDirection.INCREASE_BLOCK_CACHE_SIZE;
328          tunerLog.append(
329              "Going to increase block cache size due to increase in number of cache misses.");
330        } else if ((double)cacheMissCount < rollingStatsForCacheMisses.getMean() -
331            rollingStatsForCacheMisses.getDeviation()*0.80 &&
332            (double)totalFlushCount > rollingStatsForFlushes.getMean() +
333                rollingStatsForFlushes.getDeviation()*0.80) {
334          // more flushes , increasing memstore size
335          newTuneDirection = StepDirection.INCREASE_MEMSTORE_SIZE;
336          tunerLog.append("Going to increase memstore size due to increase in number of flushes.");
337        } else if (blockedFlushCount > 0 && prevTuneDirection == StepDirection.NEUTRAL) {
338          // we do not want blocked flushes
339          newTuneDirection = StepDirection.INCREASE_MEMSTORE_SIZE;
340          tunerLog.append("Going to increase memstore size due to"
341              + blockedFlushCount + " blocked flushes.");
342        } else {
343          // Default. Not enough facts to do tuning.
344          tunerLog.append("Going to do nothing because we "
345              + "could not determine best tuning direction");
346          newTuneDirection = StepDirection.NEUTRAL;
347        }
348      }
349    }
350    if (LOG.isDebugEnabled()) {
351      LOG.debug(tunerLog.toString());
352    }
353    return newTuneDirection;
354  }
355
356  /**
357   * Add the given context to the rolling tuner stats.
358   * @param context The tuner context.
359   */
360  private void addToRollingStats(TunerContext context) {
361    rollingStatsForCacheMisses.insertDataValue(context.getCacheMissCount());
362    rollingStatsForFlushes
363        .insertDataValue(context.getBlockedFlushCount() + context.getUnblockedFlushCount());
364    rollingStatsForEvictions.insertDataValue(context.getEvictCount());
365  }
366
367  @Override
368  public Configuration getConf() {
369    return this.conf;
370  }
371
372  @Override
373  public void setConf(Configuration conf) {
374    this.conf = conf;
375    this.maximumStepSize = conf.getFloat(MAX_STEP_KEY, DEFAULT_MAX_STEP_VALUE);
376    this.minimumStepSize = conf.getFloat(MIN_STEP_KEY, DEFAULT_MIN_STEP_VALUE);
377    this.step = this.maximumStepSize;
378    this.sufficientMemoryLevel = conf.getFloat(SUFFICIENT_MEMORY_LEVEL_KEY,
379        DEFAULT_SUFFICIENT_MEMORY_LEVEL_VALUE);
380    this.tunerLookupPeriods = conf.getInt(LOOKUP_PERIODS_KEY, DEFAULT_LOOKUP_PERIODS);
381    this.blockCachePercentMinRange = conf.getFloat(BLOCK_CACHE_SIZE_MIN_RANGE_KEY,
382        conf.getFloat(HFILE_BLOCK_CACHE_SIZE_KEY, HConstants.HFILE_BLOCK_CACHE_SIZE_DEFAULT));
383    this.blockCachePercentMaxRange = conf.getFloat(BLOCK_CACHE_SIZE_MAX_RANGE_KEY,
384        conf.getFloat(HFILE_BLOCK_CACHE_SIZE_KEY, HConstants.HFILE_BLOCK_CACHE_SIZE_DEFAULT));
385    this.globalMemStorePercentMinRange = conf.getFloat(MEMSTORE_SIZE_MIN_RANGE_KEY,
386        MemorySizeUtil.getGlobalMemStoreHeapPercent(conf, false));
387    this.globalMemStorePercentMaxRange = conf.getFloat(MEMSTORE_SIZE_MAX_RANGE_KEY,
388        MemorySizeUtil.getGlobalMemStoreHeapPercent(conf, false));
389    this.globalMemStoreLimitLowMarkPercent = MemorySizeUtil.getGlobalMemStoreHeapLowerMark(conf,
390        true);
391    // Default value of periods to ignore is number of lookup periods
392    this.numPeriodsToIgnore = conf.getInt(NUM_PERIODS_TO_IGNORE, this.tunerLookupPeriods);
393    this.rollingStatsForCacheMisses = new RollingStatCalculator(this.tunerLookupPeriods);
394    this.rollingStatsForFlushes = new RollingStatCalculator(this.tunerLookupPeriods);
395    this.rollingStatsForEvictions = new RollingStatCalculator(this.tunerLookupPeriods);
396    this.rollingStatsForTunerSteps = new RollingStatCalculator(this.tunerLookupPeriods);
397  }
398
399  private enum StepDirection{
400    // block cache size was increased
401    INCREASE_BLOCK_CACHE_SIZE,
402    // memstore size was increased
403    INCREASE_MEMSTORE_SIZE,
404    // no operation was performed
405    NEUTRAL
406  }
407}