001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.regionserver;
019
020import static org.apache.hadoop.hbase.HConstants.HFILE_BLOCK_CACHE_SIZE_KEY;
021import static org.apache.hadoop.hbase.regionserver.HeapMemoryManager.BLOCK_CACHE_SIZE_MAX_RANGE_KEY;
022import static org.apache.hadoop.hbase.regionserver.HeapMemoryManager.BLOCK_CACHE_SIZE_MIN_RANGE_KEY;
023import static org.apache.hadoop.hbase.regionserver.HeapMemoryManager.MEMSTORE_SIZE_MAX_RANGE_KEY;
024import static org.apache.hadoop.hbase.regionserver.HeapMemoryManager.MEMSTORE_SIZE_MIN_RANGE_KEY;
025
026import org.apache.hadoop.conf.Configuration;
027import org.apache.hadoop.hbase.HConstants;
028import org.apache.hadoop.hbase.io.util.MemorySizeUtil;
029import org.apache.hadoop.hbase.regionserver.HeapMemoryManager.TunerContext;
030import org.apache.hadoop.hbase.regionserver.HeapMemoryManager.TunerResult;
031import org.apache.hadoop.hbase.util.RollingStatCalculator;
032import org.apache.yetus.audience.InterfaceAudience;
033import org.slf4j.Logger;
034import org.slf4j.LoggerFactory;
035
036/**
037 * The default implementation for the HeapMemoryTuner. This will do statistical checks on number of
038 * evictions, cache misses and flushes to decide whether there should be changes in the heap size of
039 * memstore/block cache. During each tuner operation tuner takes a step which can either be
040 * INCREASE_BLOCK_CACHE_SIZE (increase block cache size), INCREASE_MEMSTORE_SIZE (increase memstore
041 * size) and by default it is NEUTRAL (no change). We say block cache is sufficient when there is no
042 * block cache eviction at all or major amount of memory allocated to block cache is empty,
043 * similarly we say memory allocated for memstore is sufficient when there is no memstore flushes
044 * because of heap pressure or major amount of memory allocated to memstore is empty. If both are
045 * sufficient we do nothing, if exactly one of them is found to be sufficient we decrease its size
046 * by <i>step</i> and increase the other by same amount. If none of them is sufficient we do
047 * statistical analysis on number of cache misses and flushes to determine tuner direction. Based on
048 * these statistics we decide the tuner direction. If we are not confident about which step
049 * direction to take we do nothing and wait for next iteration. On expectation we will be tuning for
050 * at least 10% tuner calls. The number of past periods to consider for statistics calculation can
051 * be specified in config by <i>hbase.regionserver.heapmemory.autotuner.lookup.periods</i>. Also
052 * these many initial calls to tuner will be ignored (cache is warming up and we leave the system to
053 * reach steady state). After the tuner takes a step, in next call we insure that last call was
054 * indeed helpful and did not do us any harm. If not then we revert the previous step. The step size
055 * is dynamic and it changes based on current and past few tuning directions and their step sizes.
056 * We maintain a parameter <i>decayingAvgTunerStepSize</i> which is sum of past tuner steps with
057 * sign(positive for increase in memstore and negative for increase in block cache). But rather than
058 * simple sum it is calculated by giving more priority to the recent tuning steps. When last few
059 * tuner steps were NETURAL then we assume we are restarting the tuning process and step size is
060 * updated to maximum allowed size which can be specified in config by
061 * <i>hbase.regionserver.heapmemory.autotuner.step.max</i>. If in a particular tuning operation the
062 * step direction is opposite to what indicated by <i>decayingTunerStepSizeSum</i> we decrease the
063 * step size by half. Step size does not change in other tuning operations. When step size gets
064 * below a certain threshold then the following tuner operations are considered to be neutral. The
065 * minimum step size can be specified in config by
066 * <i>hbase.regionserver.heapmemory.autotuner.step.min</i>.
067 */
068@InterfaceAudience.Private
069class DefaultHeapMemoryTuner implements HeapMemoryTuner {
070  public static final String MAX_STEP_KEY = "hbase.regionserver.heapmemory.autotuner.step.max";
071  public static final String MIN_STEP_KEY = "hbase.regionserver.heapmemory.autotuner.step.min";
072  public static final String SUFFICIENT_MEMORY_LEVEL_KEY =
073    "hbase.regionserver.heapmemory.autotuner.sufficient.memory.level";
074  public static final String LOOKUP_PERIODS_KEY =
075    "hbase.regionserver.heapmemory.autotuner.lookup.periods";
076  public static final String NUM_PERIODS_TO_IGNORE =
077    "hbase.regionserver.heapmemory.autotuner.ignored.periods";
078  // Maximum step size that the tuner can take
079  public static final float DEFAULT_MAX_STEP_VALUE = 0.04f; // 4%
080  // Minimum step size that the tuner can take
081  public static final float DEFAULT_MIN_STEP_VALUE = 0.00125f; // 0.125%
082  // If current block cache size or memstore size in use is below this level relative to memory
083  // provided to it then corresponding component will be considered to have sufficient memory
084  public static final float DEFAULT_SUFFICIENT_MEMORY_LEVEL_VALUE = 0.5f; // 50%
085  // Number of tuner periods that will be considered while calculating mean and deviation
086  // If set to zero, all stats will be calculated from the start
087  public static final int DEFAULT_LOOKUP_PERIODS = 60;
088  public static final int DEFAULT_NUM_PERIODS_IGNORED = 60;
089  private static final TunerResult NO_OP_TUNER_RESULT = new TunerResult(false);
090  // If deviation of tuner step size gets below this value then it means past few periods were
091  // NEUTRAL(given that last tuner period was also NEUTRAL).
092  private static final double TUNER_STEP_EPS = 1e-6;
093
094  private Logger LOG = LoggerFactory.getLogger(DefaultHeapMemoryTuner.class);
095  private TunerResult TUNER_RESULT = new TunerResult(true);
096  private Configuration conf;
097  private float sufficientMemoryLevel = DEFAULT_SUFFICIENT_MEMORY_LEVEL_VALUE;
098  private float maximumStepSize = DEFAULT_MAX_STEP_VALUE;
099  private float minimumStepSize = DEFAULT_MIN_STEP_VALUE;
100  private int tunerLookupPeriods = DEFAULT_LOOKUP_PERIODS;
101  private int numPeriodsToIgnore = DEFAULT_NUM_PERIODS_IGNORED;
102  // Counter to ignore few initial periods while cache is still warming up
103  // Memory tuner will do no operation for the first "tunerLookupPeriods"
104  private int ignoreInitialPeriods = 0;
105
106  private float globalMemStorePercentMinRange;
107  private float globalMemStorePercentMaxRange;
108  private float blockCachePercentMinRange;
109  private float blockCachePercentMaxRange;
110
111  private float globalMemStoreLimitLowMarkPercent;
112
113  // Store statistics about the corresponding parameters for memory tuning
114  private RollingStatCalculator rollingStatsForCacheMisses;
115  private RollingStatCalculator rollingStatsForFlushes;
116  private RollingStatCalculator rollingStatsForEvictions;
117  private RollingStatCalculator rollingStatsForTunerSteps;
118  // Set step size to max value for tuning, this step size will adjust dynamically while tuning
119  private float step = DEFAULT_MAX_STEP_VALUE;
120  private StepDirection prevTuneDirection = StepDirection.NEUTRAL;
121  // positive means memstore's size was increased
122  // It is not just arithmetic sum of past tuner periods. More priority is given to recent
123  // tuning steps.
124  private double decayingTunerStepSizeSum = 0;
125
126  @Override
127  public TunerResult tune(TunerContext context) {
128    float curMemstoreSize = context.getCurMemStoreSize();
129    float curBlockCacheSize = context.getCurBlockCacheSize();
130    addToRollingStats(context);
131
132    if (ignoreInitialPeriods < numPeriodsToIgnore) {
133      // Ignoring the first few tuner periods
134      ignoreInitialPeriods++;
135      rollingStatsForTunerSteps.insertDataValue(0);
136      LOG.info("Ignoring initial tuning periods: {} so far, {} to ignore", ignoreInitialPeriods,
137        numPeriodsToIgnore);
138      return NO_OP_TUNER_RESULT;
139    }
140    StepDirection newTuneDirection = getTuneDirection(context);
141
142    long blockedFlushCount = context.getBlockedFlushCount();
143    long unblockedFlushCount = context.getUnblockedFlushCount();
144    long totalOnheapFlushCount = blockedFlushCount + unblockedFlushCount;
145    boolean offheapMemstore = context.isOffheapMemStore();
146    float newMemstoreSize;
147    float newBlockCacheSize;
148
149    // Adjusting step size for tuning to get to steady state or restart from steady state.
150    // Even if the step size was 4% and 32 GB memory size, we will be shifting 1 GB back and forth
151    // per tuner operation and it can affect the performance of cluster so we keep on decreasing
152    // step size until everything settles.
153    if (
154      prevTuneDirection == StepDirection.NEUTRAL && newTuneDirection != StepDirection.NEUTRAL
155        && rollingStatsForTunerSteps.getDeviation() < TUNER_STEP_EPS
156    ) {
157      // Restarting the tuning from steady state and setting step size to maximum.
158      // The deviation cannot be that low if last period was neutral and some recent periods were
159      // not neutral.
160      step = maximumStepSize;
161    } else if (
162      (newTuneDirection == StepDirection.INCREASE_MEMSTORE_SIZE && decayingTunerStepSizeSum < 0)
163        || (newTuneDirection == StepDirection.INCREASE_BLOCK_CACHE_SIZE
164          && decayingTunerStepSizeSum > 0)
165    ) {
166      // Current step is opposite of past tuner actions so decrease the step size to reach steady
167      // state.
168      if (!offheapMemstore && step != minimumStepSize) {
169        // we leave the step to be at minimumStepSize for offheap memstore
170        step = step / 2.00f;
171      }
172    }
173    if (step < minimumStepSize) {
174      // If step size is too small then we do nothing.
175      LOG.debug("Tuner step size is too low; we will not perform any tuning this time.");
176      step = 0.0f;
177      newTuneDirection = StepDirection.NEUTRAL;
178    }
179    // There are no flushes due to onheap pressure and
180    // we have an offheap memstore and we are in need of more block_cache size.
181    if (
182      totalOnheapFlushCount == 0 && offheapMemstore
183        && newTuneDirection == StepDirection.INCREASE_BLOCK_CACHE_SIZE
184    ) {
185      // we are sure that there are flushes only due to offheap pressure
186      // So don't do the memstore decrease equal to the step size. Instead do minimum stepSize
187      // decrease. But even if we have some flushes due to heap then it is better we tune
188      // the existing way.
189      step = minimumStepSize;
190    }
191    // Increase / decrease the memstore / block cache sizes depending on new tuner step.
192    // We don't want to exert immediate pressure on memstore. So, we decrease its size gracefully;
193    // we set a minimum bar in the middle of the total memstore size and the lower limit.
194    float minMemstoreSize = ((globalMemStoreLimitLowMarkPercent + 1) * curMemstoreSize) / 2.00f;
195
196    switch (newTuneDirection) {
197      case INCREASE_BLOCK_CACHE_SIZE:
198        if (curMemstoreSize - step < minMemstoreSize) {
199          step = curMemstoreSize - minMemstoreSize;
200        }
201        newMemstoreSize = curMemstoreSize - step;
202        newBlockCacheSize = curBlockCacheSize + step;
203        rollingStatsForTunerSteps.insertDataValue(-(int) (step * 100000));
204        decayingTunerStepSizeSum = (decayingTunerStepSizeSum - step) / 2.00f;
205        break;
206      case INCREASE_MEMSTORE_SIZE:
207        newBlockCacheSize = curBlockCacheSize - step;
208        newMemstoreSize = curMemstoreSize + step;
209        rollingStatsForTunerSteps.insertDataValue((int) (step * 100000));
210        decayingTunerStepSizeSum = (decayingTunerStepSizeSum + step) / 2.00f;
211        break;
212      default:
213        prevTuneDirection = StepDirection.NEUTRAL;
214        rollingStatsForTunerSteps.insertDataValue(0);
215        decayingTunerStepSizeSum = (decayingTunerStepSizeSum) / 2.00f;
216        return NO_OP_TUNER_RESULT;
217    }
218    // Check we are within max/min bounds.
219    if (newMemstoreSize > globalMemStorePercentMaxRange) {
220      newMemstoreSize = globalMemStorePercentMaxRange;
221    } else if (newMemstoreSize < globalMemStorePercentMinRange) {
222      newMemstoreSize = globalMemStorePercentMinRange;
223    }
224    if (newBlockCacheSize > blockCachePercentMaxRange) {
225      newBlockCacheSize = blockCachePercentMaxRange;
226    } else if (newBlockCacheSize < blockCachePercentMinRange) {
227      newBlockCacheSize = blockCachePercentMinRange;
228    }
229    TUNER_RESULT.setBlockCacheSize(newBlockCacheSize);
230    TUNER_RESULT.setMemStoreSize(newMemstoreSize);
231    prevTuneDirection = newTuneDirection;
232    return TUNER_RESULT;
233  }
234
235  /**
236   * Determine best direction of tuning base on given context.
237   * @param context The tuner context.
238   * @return tuning direction.
239   */
240  private StepDirection getTuneDirection(TunerContext context) {
241    StepDirection newTuneDirection = StepDirection.NEUTRAL;
242    long blockedFlushCount = context.getBlockedFlushCount();
243    long unblockedFlushCount = context.getUnblockedFlushCount();
244    long evictCount = context.getEvictCount();
245    long cacheMissCount = context.getCacheMissCount();
246    long totalFlushCount = blockedFlushCount + unblockedFlushCount;
247    float curMemstoreSize = context.getCurMemStoreSize();
248    float curBlockCacheSize = context.getCurBlockCacheSize();
249    StringBuilder tunerLog = new StringBuilder();
250    // We can consider memstore or block cache to be sufficient if
251    // we are using only a minor fraction of what have been already provided to it.
252    boolean earlyMemstoreSufficientCheck = totalFlushCount == 0
253      || context.getCurMemStoreUsed() < curMemstoreSize * sufficientMemoryLevel;
254    boolean earlyBlockCacheSufficientCheck =
255      evictCount == 0 || context.getCurBlockCacheUsed() < curBlockCacheSize * sufficientMemoryLevel;
256    if (earlyMemstoreSufficientCheck && earlyBlockCacheSufficientCheck) {
257      // Both memstore and block cache memory seems to be sufficient. No operation required.
258      newTuneDirection = StepDirection.NEUTRAL;
259      tunerLog.append("Going to do nothing because no changes are needed.");
260    } else if (earlyMemstoreSufficientCheck) {
261      // Increase the block cache size and corresponding decrease in memstore size.
262      newTuneDirection = StepDirection.INCREASE_BLOCK_CACHE_SIZE;
263      tunerLog.append("Going to increase the block cache size.");
264    } else if (earlyBlockCacheSufficientCheck) {
265      // Increase the memstore size and corresponding decrease in block cache size.
266      newTuneDirection = StepDirection.INCREASE_MEMSTORE_SIZE;
267      tunerLog.append("Going to increase the memstore size.");
268    } else {
269      // Early checks for sufficient memory failed. Tuning memory based on past statistics.
270      // Boolean indicator to show if we need to revert previous step or not.
271      boolean isReverting = false;
272      switch (prevTuneDirection) {
273        // Here we are using number of evictions rather than cache misses because it is more
274        // strong indicator for deficient cache size. Improving caching is what we
275        // would like to optimize for in steady state.
276        case INCREASE_BLOCK_CACHE_SIZE:
277          if (
278            (double) evictCount > rollingStatsForEvictions.getMean() || (double) totalFlushCount
279                > rollingStatsForFlushes.getMean() + rollingStatsForFlushes.getDeviation() / 2.00
280          ) {
281            // Reverting previous step as it was not useful.
282            // Tuning failed to decrease evictions or tuning resulted in large number of flushes.
283            newTuneDirection = StepDirection.INCREASE_MEMSTORE_SIZE;
284            tunerLog.append("We will revert previous tuning");
285            if ((double) evictCount > rollingStatsForEvictions.getMean()) {
286              tunerLog.append(" because we could not decrease evictions sufficiently.");
287            } else {
288              tunerLog.append(" because the number of flushes rose significantly.");
289            }
290            isReverting = true;
291          }
292          break;
293        case INCREASE_MEMSTORE_SIZE:
294          if (
295            (double) totalFlushCount > rollingStatsForFlushes.getMean()
296              || (double) evictCount > rollingStatsForEvictions.getMean()
297                + rollingStatsForEvictions.getDeviation() / 2.00
298          ) {
299            // Reverting previous step as it was not useful.
300            // Tuning failed to decrease flushes or tuning resulted in large number of evictions.
301            newTuneDirection = StepDirection.INCREASE_BLOCK_CACHE_SIZE;
302            tunerLog.append("We will revert previous tuning");
303            if ((double) totalFlushCount > rollingStatsForFlushes.getMean()) {
304              tunerLog.append(" because we could not decrease flushes sufficiently.");
305            } else {
306              tunerLog.append(" because number of evictions rose significantly.");
307            }
308            isReverting = true;
309          }
310          break;
311        default:
312          // Last step was neutral, revert doesn't not apply here.
313          break;
314      }
315      // If we are not reverting. We try to tune memory sizes by looking at cache misses / flushes.
316      if (!isReverting) {
317        // mean +- deviation*0.8 is considered to be normal
318        // below it its consider low and above it is considered high.
319        // We can safely assume that the number cache misses, flushes are normally distributed over
320        // past periods and hence on all the above mentioned classes (normal, high and low)
321        // are likely to occur with probability 56%, 22%, 22% respectively. Hence there is at
322        // least ~10% probability that we will not fall in NEUTRAL step.
323        // This optimization solution is feedback based and we revert when we
324        // dont find our steps helpful. Hence we want to do tuning only when we have clear
325        // indications because too many unnecessary tuning may affect the performance of cluster.
326        if (
327          (double) cacheMissCount
328              < rollingStatsForCacheMisses.getMean()
329                - rollingStatsForCacheMisses.getDeviation() * 0.80
330            && (double) totalFlushCount
331                < rollingStatsForFlushes.getMean() - rollingStatsForFlushes.getDeviation() * 0.80
332        ) {
333          // Everything is fine no tuning required
334          newTuneDirection = StepDirection.NEUTRAL;
335        } else if (
336          (double) cacheMissCount
337              > rollingStatsForCacheMisses.getMean()
338                + rollingStatsForCacheMisses.getDeviation() * 0.80
339            && (double) totalFlushCount
340                < rollingStatsForFlushes.getMean() - rollingStatsForFlushes.getDeviation() * 0.80
341        ) {
342          // more misses , increasing cache size
343          newTuneDirection = StepDirection.INCREASE_BLOCK_CACHE_SIZE;
344          tunerLog.append(
345            "Going to increase block cache size due to increase in number of cache misses.");
346        } else if (
347          (double) cacheMissCount
348              < rollingStatsForCacheMisses.getMean()
349                - rollingStatsForCacheMisses.getDeviation() * 0.80
350            && (double) totalFlushCount
351                > rollingStatsForFlushes.getMean() + rollingStatsForFlushes.getDeviation() * 0.80
352        ) {
353          // more flushes , increasing memstore size
354          newTuneDirection = StepDirection.INCREASE_MEMSTORE_SIZE;
355          tunerLog.append("Going to increase memstore size due to increase in number of flushes.");
356        } else if (blockedFlushCount > 0 && prevTuneDirection == StepDirection.NEUTRAL) {
357          // we do not want blocked flushes
358          newTuneDirection = StepDirection.INCREASE_MEMSTORE_SIZE;
359          tunerLog.append(
360            "Going to increase memstore size due to" + blockedFlushCount + " blocked flushes.");
361        } else {
362          // Default. Not enough facts to do tuning.
363          tunerLog.append(
364            "Going to do nothing because we " + "could not determine best tuning direction");
365          newTuneDirection = StepDirection.NEUTRAL;
366        }
367      }
368    }
369    // Log NEUTRAL decisions at DEBUG, because they are the most frequent and not that interesting.
370    // Log other decisions at INFO because they are making meaningful operational changes.
371    switch (newTuneDirection) {
372      case NEUTRAL:
373        if (LOG.isDebugEnabled()) {
374          LOG.debug(tunerLog.toString());
375        }
376        break;
377      default:
378        LOG.info(tunerLog.toString());
379        break;
380    }
381    return newTuneDirection;
382  }
383
384  /**
385   * Add the given context to the rolling tuner stats.
386   * @param context The tuner context.
387   */
388  private void addToRollingStats(TunerContext context) {
389    rollingStatsForCacheMisses.insertDataValue(context.getCacheMissCount());
390    rollingStatsForFlushes
391      .insertDataValue(context.getBlockedFlushCount() + context.getUnblockedFlushCount());
392    rollingStatsForEvictions.insertDataValue(context.getEvictCount());
393  }
394
395  @Override
396  public Configuration getConf() {
397    return this.conf;
398  }
399
400  @Override
401  public void setConf(Configuration conf) {
402    this.conf = conf;
403    this.maximumStepSize = conf.getFloat(MAX_STEP_KEY, DEFAULT_MAX_STEP_VALUE);
404    this.minimumStepSize = conf.getFloat(MIN_STEP_KEY, DEFAULT_MIN_STEP_VALUE);
405    this.step = this.maximumStepSize;
406    this.sufficientMemoryLevel =
407      conf.getFloat(SUFFICIENT_MEMORY_LEVEL_KEY, DEFAULT_SUFFICIENT_MEMORY_LEVEL_VALUE);
408    this.tunerLookupPeriods = conf.getInt(LOOKUP_PERIODS_KEY, DEFAULT_LOOKUP_PERIODS);
409    this.blockCachePercentMinRange = conf.getFloat(BLOCK_CACHE_SIZE_MIN_RANGE_KEY,
410      conf.getFloat(HFILE_BLOCK_CACHE_SIZE_KEY, HConstants.HFILE_BLOCK_CACHE_SIZE_DEFAULT));
411    this.blockCachePercentMaxRange = conf.getFloat(BLOCK_CACHE_SIZE_MAX_RANGE_KEY,
412      conf.getFloat(HFILE_BLOCK_CACHE_SIZE_KEY, HConstants.HFILE_BLOCK_CACHE_SIZE_DEFAULT));
413    this.globalMemStorePercentMinRange = conf.getFloat(MEMSTORE_SIZE_MIN_RANGE_KEY,
414      MemorySizeUtil.getGlobalMemStoreHeapPercent(conf, false));
415    this.globalMemStorePercentMaxRange = conf.getFloat(MEMSTORE_SIZE_MAX_RANGE_KEY,
416      MemorySizeUtil.getGlobalMemStoreHeapPercent(conf, false));
417    this.globalMemStoreLimitLowMarkPercent =
418      MemorySizeUtil.getGlobalMemStoreHeapLowerMark(conf, true);
419    // Default value of periods to ignore is number of lookup periods
420    this.numPeriodsToIgnore = conf.getInt(NUM_PERIODS_TO_IGNORE, this.tunerLookupPeriods);
421    this.rollingStatsForCacheMisses = new RollingStatCalculator(this.tunerLookupPeriods);
422    this.rollingStatsForFlushes = new RollingStatCalculator(this.tunerLookupPeriods);
423    this.rollingStatsForEvictions = new RollingStatCalculator(this.tunerLookupPeriods);
424    this.rollingStatsForTunerSteps = new RollingStatCalculator(this.tunerLookupPeriods);
425  }
426
427  private enum StepDirection {
428    // block cache size was increased
429    INCREASE_BLOCK_CACHE_SIZE,
430    // memstore size was increased
431    INCREASE_MEMSTORE_SIZE,
432    // no operation was performed
433    NEUTRAL
434  }
435}