001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.chaos.actions;
019
020import java.util.ArrayList;
021import java.util.HashSet;
022import java.util.LinkedList;
023import java.util.List;
024import java.util.Random;
025import java.util.Set;
026import java.util.concurrent.ThreadLocalRandom;
027import org.apache.hadoop.hbase.ClusterMetrics;
028import org.apache.hadoop.hbase.ServerName;
029import org.junit.Assert;
030import org.slf4j.Logger;
031import org.slf4j.LoggerFactory;
032
033/** This action is too specific to put in ChaosMonkey; put it here */
034public class UnbalanceKillAndRebalanceAction extends Action {
035  private static final Logger LOG = LoggerFactory.getLogger(UnbalanceKillAndRebalanceAction.class);
036  /**
037   * Fractions of servers to get regions and live and die respectively; from all other servers,
038   * HOARD_FRC_OF_REGIONS will be removed to the above randomly
039   */
040  private static final double FRC_SERVERS_THAT_HOARD_AND_LIVE = 0.1;
041  private static final double FRC_SERVERS_THAT_HOARD_AND_DIE = 0.1;
042  private static final double HOARD_FRC_OF_REGIONS = 0.8;
043  /**
044   * Waits between calling unbalance and killing servers, kills and rebalance, and rebalance and
045   * restarting the servers; to make sure these events have time to impact the cluster.
046   */
047  private final long waitForUnbalanceMilliSec;
048  private final long waitForKillsMilliSec;
049  private final long waitAfterBalanceMilliSec;
050  private final boolean killMetaRs;
051
052  public UnbalanceKillAndRebalanceAction(long waitUnbalance, long waitKill, long waitAfterBalance,
053    boolean killMetaRs) {
054    super();
055    waitForUnbalanceMilliSec = waitUnbalance;
056    waitForKillsMilliSec = waitKill;
057    waitAfterBalanceMilliSec = waitAfterBalance;
058    this.killMetaRs = killMetaRs;
059  }
060
061  @Override
062  protected Logger getLogger() {
063    return LOG;
064  }
065
066  @Override
067  public void perform() throws Exception {
068    ClusterMetrics status = this.cluster.getClusterMetrics();
069    List<ServerName> victimServers = new LinkedList<>(status.getLiveServerMetrics().keySet());
070    Set<ServerName> killedServers = new HashSet<>();
071    int liveCount = (int) Math.ceil(FRC_SERVERS_THAT_HOARD_AND_LIVE * victimServers.size());
072    int deadCount = (int) Math.ceil(FRC_SERVERS_THAT_HOARD_AND_DIE * victimServers.size());
073    Assert.assertTrue("There are not enough victim servers: " + victimServers.size(),
074      liveCount + deadCount < victimServers.size());
075    Random rand = ThreadLocalRandom.current();
076    List<ServerName> targetServers = new ArrayList<>(liveCount);
077    for (int i = 0; i < liveCount + deadCount; ++i) {
078      int victimIx = rand.nextInt(victimServers.size());
079      targetServers.add(victimServers.remove(victimIx));
080    }
081    unbalanceRegions(status, victimServers, targetServers, HOARD_FRC_OF_REGIONS);
082    Thread.sleep(waitForUnbalanceMilliSec);
083    ServerName metaServer = cluster.getServerHoldingMeta();
084    for (ServerName targetServer : targetServers) {
085      // Don't keep killing servers if we're
086      // trying to stop the monkey.
087      if (context.isStopping()) {
088        break;
089      }
090      if (killedServers.size() >= liveCount) {
091        break;
092      }
093
094      if (!killMetaRs && targetServer.equals(metaServer)) {
095        getLogger().info("Not killing server because it holds hbase:meta.");
096      } else {
097        killRs(targetServer);
098        killedServers.add(targetServer);
099      }
100    }
101
102    Thread.sleep(waitForKillsMilliSec);
103    forceBalancer();
104    Thread.sleep(waitAfterBalanceMilliSec);
105    for (ServerName server : killedServers) {
106      startRs(server);
107    }
108  }
109}