001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.hbase.chaos.actions;
020
021import java.util.ArrayList;
022import java.util.HashSet;
023import java.util.LinkedList;
024import java.util.List;
025import java.util.Set;
026import org.apache.commons.lang3.RandomUtils;
027import org.apache.hadoop.hbase.ClusterMetrics;
028import org.apache.hadoop.hbase.ServerName;
029import org.junit.Assert;
030import org.slf4j.Logger;
031import org.slf4j.LoggerFactory;
032
033/** This action is too specific to put in ChaosMonkey; put it here */
034public class UnbalanceKillAndRebalanceAction extends Action {
035  private static final Logger LOG =
036      LoggerFactory.getLogger(UnbalanceKillAndRebalanceAction.class);
037  /** Fractions of servers to get regions and live and die respectively; from all other
038   * servers, HOARD_FRC_OF_REGIONS will be removed to the above randomly */
039  private static final double FRC_SERVERS_THAT_HOARD_AND_LIVE = 0.1;
040  private static final double FRC_SERVERS_THAT_HOARD_AND_DIE = 0.1;
041  private static final double HOARD_FRC_OF_REGIONS = 0.8;
042  /** Waits between calling unbalance and killing servers, kills and rebalance, and rebalance
043   * and restarting the servers; to make sure these events have time to impact the cluster. */
044  private final long waitForUnbalanceMilliSec;
045  private final long waitForKillsMilliSec;
046  private final long waitAfterBalanceMilliSec;
047  private final boolean killMetaRs;
048
049  public UnbalanceKillAndRebalanceAction(long waitUnbalance, long waitKill, long waitAfterBalance,
050      boolean killMetaRs) {
051    super();
052    waitForUnbalanceMilliSec = waitUnbalance;
053    waitForKillsMilliSec = waitKill;
054    waitAfterBalanceMilliSec = waitAfterBalance;
055    this.killMetaRs = killMetaRs;
056  }
057
058  @Override protected Logger getLogger() {
059    return LOG;
060  }
061
062  @Override
063  public void perform() throws Exception {
064    ClusterMetrics status = this.cluster.getClusterMetrics();
065    List<ServerName> victimServers = new LinkedList<>(status.getLiveServerMetrics().keySet());
066    Set<ServerName> killedServers = new HashSet<>();
067
068    int liveCount = (int)Math.ceil(FRC_SERVERS_THAT_HOARD_AND_LIVE * victimServers.size());
069    int deadCount = (int)Math.ceil(FRC_SERVERS_THAT_HOARD_AND_DIE * victimServers.size());
070    Assert.assertTrue(
071        "There are not enough victim servers: " + victimServers.size(),
072        liveCount + deadCount < victimServers.size());
073    List<ServerName> targetServers = new ArrayList<>(liveCount);
074    for (int i = 0; i < liveCount + deadCount; ++i) {
075      int victimIx = RandomUtils.nextInt(0, victimServers.size());
076      targetServers.add(victimServers.remove(victimIx));
077    }
078    unbalanceRegions(status, victimServers, targetServers, HOARD_FRC_OF_REGIONS);
079    Thread.sleep(waitForUnbalanceMilliSec);
080    ServerName metaServer = cluster.getServerHoldingMeta();
081    for (ServerName targetServer: targetServers) {
082      // Don't keep killing servers if we're
083      // trying to stop the monkey.
084      if (context.isStopping()) {
085        break;
086      }
087      if (killedServers.size() >= liveCount) {
088        break;
089      }
090
091      if (!killMetaRs && targetServer.equals(metaServer)) {
092        getLogger().info("Not killing server because it holds hbase:meta.");
093      } else {
094        killRs(targetServer);
095        killedServers.add(targetServer);
096      }
097    }
098
099    Thread.sleep(waitForKillsMilliSec);
100    forceBalancer();
101    Thread.sleep(waitAfterBalanceMilliSec);
102    for (ServerName server:killedServers) {
103      startRs(server);
104    }
105  }
106}