001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.chaos.actions; 019 020import java.util.ArrayList; 021import java.util.HashSet; 022import java.util.LinkedList; 023import java.util.List; 024import java.util.Random; 025import java.util.Set; 026import java.util.concurrent.ThreadLocalRandom; 027import org.apache.hadoop.hbase.ClusterMetrics; 028import org.apache.hadoop.hbase.ServerName; 029import org.junit.Assert; 030import org.slf4j.Logger; 031import org.slf4j.LoggerFactory; 032 033/** This action is too specific to put in ChaosMonkey; put it here */ 034public class UnbalanceKillAndRebalanceAction extends Action { 035 private static final Logger LOG = LoggerFactory.getLogger(UnbalanceKillAndRebalanceAction.class); 036 /** 037 * Fractions of servers to get regions and live and die respectively; from all other servers, 038 * HOARD_FRC_OF_REGIONS will be removed to the above randomly 039 */ 040 private static final double FRC_SERVERS_THAT_HOARD_AND_LIVE = 0.1; 041 private static final double FRC_SERVERS_THAT_HOARD_AND_DIE = 0.1; 042 private static final double HOARD_FRC_OF_REGIONS = 0.8; 043 /** 044 * Waits between calling unbalance and killing servers, kills and rebalance, and rebalance and 045 * restarting the servers; to make sure these events have time to impact the cluster. 046 */ 047 private final long waitForUnbalanceMilliSec; 048 private final long waitForKillsMilliSec; 049 private final long waitAfterBalanceMilliSec; 050 private final boolean killMetaRs; 051 052 public UnbalanceKillAndRebalanceAction(long waitUnbalance, long waitKill, long waitAfterBalance, 053 boolean killMetaRs) { 054 super(); 055 waitForUnbalanceMilliSec = waitUnbalance; 056 waitForKillsMilliSec = waitKill; 057 waitAfterBalanceMilliSec = waitAfterBalance; 058 this.killMetaRs = killMetaRs; 059 } 060 061 @Override 062 protected Logger getLogger() { 063 return LOG; 064 } 065 066 @Override 067 public void perform() throws Exception { 068 ClusterMetrics status = this.cluster.getClusterMetrics(); 069 List<ServerName> victimServers = new LinkedList<>(status.getLiveServerMetrics().keySet()); 070 Set<ServerName> killedServers = new HashSet<>(); 071 int liveCount = (int) Math.ceil(FRC_SERVERS_THAT_HOARD_AND_LIVE * victimServers.size()); 072 int deadCount = (int) Math.ceil(FRC_SERVERS_THAT_HOARD_AND_DIE * victimServers.size()); 073 Assert.assertTrue("There are not enough victim servers: " + victimServers.size(), 074 liveCount + deadCount < victimServers.size()); 075 Random rand = ThreadLocalRandom.current(); 076 List<ServerName> targetServers = new ArrayList<>(liveCount); 077 for (int i = 0; i < liveCount + deadCount; ++i) { 078 int victimIx = rand.nextInt(victimServers.size()); 079 targetServers.add(victimServers.remove(victimIx)); 080 } 081 unbalanceRegions(status, victimServers, targetServers, HOARD_FRC_OF_REGIONS); 082 Thread.sleep(waitForUnbalanceMilliSec); 083 ServerName metaServer = cluster.getServerHoldingMeta(); 084 for (ServerName targetServer : targetServers) { 085 // Don't keep killing servers if we're 086 // trying to stop the monkey. 087 if (context.isStopping()) { 088 break; 089 } 090 if (killedServers.size() >= liveCount) { 091 break; 092 } 093 094 if (!killMetaRs && targetServer.equals(metaServer)) { 095 getLogger().info("Not killing server because it holds hbase:meta."); 096 } else { 097 killRs(targetServer); 098 killedServers.add(targetServer); 099 } 100 } 101 102 Thread.sleep(waitForKillsMilliSec); 103 forceBalancer(); 104 Thread.sleep(waitAfterBalanceMilliSec); 105 for (ServerName server : killedServers) { 106 startRs(server); 107 } 108 } 109}