001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.chaos.actions; 019 020import java.util.ArrayList; 021import java.util.HashSet; 022import java.util.List; 023import java.util.Random; 024import java.util.Set; 025import java.util.concurrent.ThreadLocalRandom; 026import org.apache.hadoop.hbase.ClusterMetrics; 027import org.apache.hadoop.hbase.ServerName; 028import org.junit.Assert; 029import org.slf4j.Logger; 030import org.slf4j.LoggerFactory; 031 032/** This action is too specific to put in ChaosMonkey; put it here */ 033public class UnbalanceKillAndRebalanceAction extends Action { 034 private static final Logger LOG = LoggerFactory.getLogger(UnbalanceKillAndRebalanceAction.class); 035 /** 036 * Fractions of servers to get regions and live and die respectively; from all other servers, 037 * HOARD_FRC_OF_REGIONS will be removed to the above randomly 038 */ 039 private static final double FRC_SERVERS_THAT_HOARD_AND_LIVE = 0.1; 040 private static final double FRC_SERVERS_THAT_HOARD_AND_DIE = 0.1; 041 private static final double HOARD_FRC_OF_REGIONS = 0.8; 042 /** 043 * Waits between calling unbalance and killing servers, kills and rebalance, and rebalance and 044 * restarting the servers; to make sure these events have time to impact the cluster. 045 */ 046 private final long waitForUnbalanceMilliSec; 047 private final long waitForKillsMilliSec; 048 private final long waitAfterBalanceMilliSec; 049 private final boolean killMetaRs; 050 051 public UnbalanceKillAndRebalanceAction(long waitUnbalance, long waitKill, long waitAfterBalance, 052 boolean killMetaRs) { 053 super(); 054 waitForUnbalanceMilliSec = waitUnbalance; 055 waitForKillsMilliSec = waitKill; 056 waitAfterBalanceMilliSec = waitAfterBalance; 057 this.killMetaRs = killMetaRs; 058 } 059 060 @Override 061 protected Logger getLogger() { 062 return LOG; 063 } 064 065 @Override 066 public void perform() throws Exception { 067 ClusterMetrics status = this.cluster.getClusterMetrics(); 068 List<ServerName> victimServers = new ArrayList<>(status.getLiveServerMetrics().keySet()); 069 Set<ServerName> killedServers = new HashSet<>(); 070 int liveCount = (int) Math.ceil(FRC_SERVERS_THAT_HOARD_AND_LIVE * victimServers.size()); 071 int deadCount = (int) Math.ceil(FRC_SERVERS_THAT_HOARD_AND_DIE * victimServers.size()); 072 Assert.assertTrue("There are not enough victim servers: " + victimServers.size(), 073 liveCount + deadCount < victimServers.size()); 074 Random rand = ThreadLocalRandom.current(); 075 List<ServerName> targetServers = new ArrayList<>(liveCount); 076 for (int i = 0; i < liveCount + deadCount; ++i) { 077 int victimIx = rand.nextInt(victimServers.size()); 078 targetServers.add(victimServers.remove(victimIx)); 079 } 080 unbalanceRegions(status, victimServers, targetServers, HOARD_FRC_OF_REGIONS); 081 Thread.sleep(waitForUnbalanceMilliSec); 082 ServerName metaServer = cluster.getServerHoldingMeta(); 083 for (ServerName targetServer : targetServers) { 084 // Don't keep killing servers if we're 085 // trying to stop the monkey. 086 if (context.isStopping()) { 087 break; 088 } 089 if (killedServers.size() >= liveCount) { 090 break; 091 } 092 093 if (!killMetaRs && targetServer.equals(metaServer)) { 094 getLogger().info("Not killing server because it holds hbase:meta."); 095 } else { 096 killRs(targetServer); 097 killedServers.add(targetServer); 098 } 099 } 100 101 Thread.sleep(waitForKillsMilliSec); 102 forceBalancer(); 103 Thread.sleep(waitAfterBalanceMilliSec); 104 for (ServerName server : killedServers) { 105 startRs(server); 106 } 107 } 108}