001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.chaos.actions;
019
020import java.util.ArrayList;
021import java.util.HashSet;
022import java.util.List;
023import java.util.Random;
024import java.util.Set;
025import java.util.concurrent.ThreadLocalRandom;
026import org.apache.hadoop.hbase.ClusterMetrics;
027import org.apache.hadoop.hbase.ServerName;
028import org.junit.Assert;
029import org.slf4j.Logger;
030import org.slf4j.LoggerFactory;
031
032/** This action is too specific to put in ChaosMonkey; put it here */
033public class UnbalanceKillAndRebalanceAction extends Action {
034  private static final Logger LOG = LoggerFactory.getLogger(UnbalanceKillAndRebalanceAction.class);
035  /**
036   * Fractions of servers to get regions and live and die respectively; from all other servers,
037   * HOARD_FRC_OF_REGIONS will be removed to the above randomly
038   */
039  private static final double FRC_SERVERS_THAT_HOARD_AND_LIVE = 0.1;
040  private static final double FRC_SERVERS_THAT_HOARD_AND_DIE = 0.1;
041  private static final double HOARD_FRC_OF_REGIONS = 0.8;
042  /**
043   * Waits between calling unbalance and killing servers, kills and rebalance, and rebalance and
044   * restarting the servers; to make sure these events have time to impact the cluster.
045   */
046  private final long waitForUnbalanceMilliSec;
047  private final long waitForKillsMilliSec;
048  private final long waitAfterBalanceMilliSec;
049  private final boolean killMetaRs;
050
051  public UnbalanceKillAndRebalanceAction(long waitUnbalance, long waitKill, long waitAfterBalance,
052    boolean killMetaRs) {
053    super();
054    waitForUnbalanceMilliSec = waitUnbalance;
055    waitForKillsMilliSec = waitKill;
056    waitAfterBalanceMilliSec = waitAfterBalance;
057    this.killMetaRs = killMetaRs;
058  }
059
060  @Override
061  protected Logger getLogger() {
062    return LOG;
063  }
064
065  @Override
066  public void perform() throws Exception {
067    ClusterMetrics status = this.cluster.getClusterMetrics();
068    List<ServerName> victimServers = new ArrayList<>(status.getLiveServerMetrics().keySet());
069    Set<ServerName> killedServers = new HashSet<>();
070    int liveCount = (int) Math.ceil(FRC_SERVERS_THAT_HOARD_AND_LIVE * victimServers.size());
071    int deadCount = (int) Math.ceil(FRC_SERVERS_THAT_HOARD_AND_DIE * victimServers.size());
072    Assert.assertTrue("There are not enough victim servers: " + victimServers.size(),
073      liveCount + deadCount < victimServers.size());
074    Random rand = ThreadLocalRandom.current();
075    List<ServerName> targetServers = new ArrayList<>(liveCount);
076    for (int i = 0; i < liveCount + deadCount; ++i) {
077      int victimIx = rand.nextInt(victimServers.size());
078      targetServers.add(victimServers.remove(victimIx));
079    }
080    unbalanceRegions(status, victimServers, targetServers, HOARD_FRC_OF_REGIONS);
081    Thread.sleep(waitForUnbalanceMilliSec);
082    ServerName metaServer = cluster.getServerHoldingMeta();
083    for (ServerName targetServer : targetServers) {
084      // Don't keep killing servers if we're
085      // trying to stop the monkey.
086      if (context.isStopping()) {
087        break;
088      }
089      if (killedServers.size() >= liveCount) {
090        break;
091      }
092
093      if (!killMetaRs && targetServer.equals(metaServer)) {
094        getLogger().info("Not killing server because it holds hbase:meta.");
095      } else {
096        killRs(targetServer);
097        killedServers.add(targetServer);
098      }
099    }
100
101    Thread.sleep(waitForKillsMilliSec);
102    forceBalancer();
103    Thread.sleep(waitAfterBalanceMilliSec);
104    for (ServerName server : killedServers) {
105      startRs(server);
106    }
107  }
108}