001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.hbase.chaos.actions;
020
021import java.util.ArrayList;
022import java.util.HashSet;
023import java.util.LinkedList;
024import java.util.List;
025import java.util.Set;
026import org.apache.commons.lang3.RandomUtils;
027import org.apache.hadoop.hbase.ClusterMetrics;
028import org.apache.hadoop.hbase.ServerName;
029import org.junit.Assert;
030
031/** This action is too specific to put in ChaosMonkey; put it here */
032public class UnbalanceKillAndRebalanceAction extends Action {
033  /** Fractions of servers to get regions and live and die respectively; from all other
034   * servers, HOARD_FRC_OF_REGIONS will be removed to the above randomly */
035  private static final double FRC_SERVERS_THAT_HOARD_AND_LIVE = 0.1;
036  private static final double FRC_SERVERS_THAT_HOARD_AND_DIE = 0.1;
037  private static final double HOARD_FRC_OF_REGIONS = 0.8;
038  /** Waits between calling unbalance and killing servers, kills and rebalance, and rebalance
039   * and restarting the servers; to make sure these events have time to impact the cluster. */
040  private long waitForUnbalanceMilliSec;
041  private long waitForKillsMilliSec;
042  private long waitAfterBalanceMilliSec;
043  private boolean killMetaRs;
044
045  public UnbalanceKillAndRebalanceAction(long waitUnbalance, long waitKill, long waitAfterBalance,
046      boolean killMetaRs) {
047    super();
048    waitForUnbalanceMilliSec = waitUnbalance;
049    waitForKillsMilliSec = waitKill;
050    waitAfterBalanceMilliSec = waitAfterBalance;
051    this.killMetaRs = killMetaRs;
052  }
053
054  @Override
055  public void perform() throws Exception {
056    ClusterMetrics status = this.cluster.getClusterMetrics();
057    List<ServerName> victimServers = new LinkedList<>(status.getLiveServerMetrics().keySet());
058    Set<ServerName> killedServers = new HashSet<>();
059
060    int liveCount = (int)Math.ceil(FRC_SERVERS_THAT_HOARD_AND_LIVE * victimServers.size());
061    int deadCount = (int)Math.ceil(FRC_SERVERS_THAT_HOARD_AND_DIE * victimServers.size());
062    Assert.assertTrue(
063        "There are not enough victim servers: " + victimServers.size(),
064        liveCount + deadCount < victimServers.size());
065    List<ServerName> targetServers = new ArrayList<>(liveCount);
066    for (int i = 0; i < liveCount + deadCount; ++i) {
067      int victimIx = RandomUtils.nextInt(0, victimServers.size());
068      targetServers.add(victimServers.remove(victimIx));
069    }
070    unbalanceRegions(status, victimServers, targetServers, HOARD_FRC_OF_REGIONS);
071    Thread.sleep(waitForUnbalanceMilliSec);
072    ServerName metaServer = cluster.getServerHoldingMeta();
073    for (ServerName targetServer: targetServers) {
074      // Don't keep killing servers if we're
075      // trying to stop the monkey.
076      if (context.isStopping()) {
077        break;
078      }
079      if (killedServers.size() >= liveCount) {
080        break;
081      }
082
083      if (!killMetaRs && targetServer.equals(metaServer)) {
084        LOG.info("Not killing server because it holds hbase:meta.");
085      } else {
086        killRs(targetServer);
087        killedServers.add(targetServer);
088      }
089    }
090
091    Thread.sleep(waitForKillsMilliSec);
092    forceBalancer();
093    Thread.sleep(waitAfterBalanceMilliSec);
094    for (ServerName server:killedServers) {
095      startRs(server);
096    }
097  }
098}