001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.hbase.chaos.actions;
020
021import java.util.ArrayList;
022import java.util.HashSet;
023import java.util.LinkedList;
024import java.util.List;
025import java.util.Set;
026
027import org.apache.commons.lang3.RandomUtils;
028import org.apache.hadoop.hbase.ClusterMetrics;
029import org.apache.hadoop.hbase.ServerName;
030import org.junit.Assert;
031import org.slf4j.Logger;
032import org.slf4j.LoggerFactory;
033
034/** This action is too specific to put in ChaosMonkey; put it here */
035public class UnbalanceKillAndRebalanceAction extends Action {
036  private static final Logger LOG =
037      LoggerFactory.getLogger(UnbalanceKillAndRebalanceAction.class);
038  /** Fractions of servers to get regions and live and die respectively; from all other
039   * servers, HOARD_FRC_OF_REGIONS will be removed to the above randomly */
040  private static final double FRC_SERVERS_THAT_HOARD_AND_LIVE = 0.1;
041  private static final double FRC_SERVERS_THAT_HOARD_AND_DIE = 0.1;
042  private static final double HOARD_FRC_OF_REGIONS = 0.8;
043  /** Waits between calling unbalance and killing servers, kills and rebalance, and rebalance
044   * and restarting the servers; to make sure these events have time to impact the cluster. */
045  private long waitForUnbalanceMilliSec;
046  private long waitForKillsMilliSec;
047  private long waitAfterBalanceMilliSec;
048  private boolean killMetaRs;
049
050  public UnbalanceKillAndRebalanceAction(long waitUnbalance, long waitKill, long waitAfterBalance,
051      boolean killMetaRs) {
052    super();
053    waitForUnbalanceMilliSec = waitUnbalance;
054    waitForKillsMilliSec = waitKill;
055    waitAfterBalanceMilliSec = waitAfterBalance;
056    this.killMetaRs = killMetaRs;
057  }
058
059  @Override
060  public void perform() throws Exception {
061    ClusterMetrics status = this.cluster.getClusterMetrics();
062    List<ServerName> victimServers = new LinkedList<>(status.getLiveServerMetrics().keySet());
063    Set<ServerName> killedServers = new HashSet<>();
064
065    int liveCount = (int)Math.ceil(FRC_SERVERS_THAT_HOARD_AND_LIVE * victimServers.size());
066    int deadCount = (int)Math.ceil(FRC_SERVERS_THAT_HOARD_AND_DIE * victimServers.size());
067    Assert.assertTrue(
068        "There are not enough victim servers: " + victimServers.size(),
069        liveCount + deadCount < victimServers.size());
070    List<ServerName> targetServers = new ArrayList<>(liveCount);
071    for (int i = 0; i < liveCount + deadCount; ++i) {
072      int victimIx = RandomUtils.nextInt(0, victimServers.size());
073      targetServers.add(victimServers.remove(victimIx));
074    }
075    unbalanceRegions(status, victimServers, targetServers, HOARD_FRC_OF_REGIONS);
076    Thread.sleep(waitForUnbalanceMilliSec);
077    ServerName metaServer = cluster.getServerHoldingMeta();
078    for (ServerName targetServer: targetServers) {
079      // Don't keep killing servers if we're
080      // trying to stop the monkey.
081      if (context.isStopping()) {
082        break;
083      }
084      if (killedServers.size() >= liveCount) {
085        break;
086      }
087
088      if (!killMetaRs && targetServer.equals(metaServer)) {
089        LOG.info("Not killing server because it holds hbase:meta.");
090      } else {
091        killRs(targetServer);
092        killedServers.add(targetServer);
093      }
094    }
095
096    Thread.sleep(waitForKillsMilliSec);
097    forceBalancer();
098    Thread.sleep(waitAfterBalanceMilliSec);
099    for (ServerName server:killedServers) {
100      startRs(server);
101    }
102  }
103}