/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.chaos.actions;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.apache.commons.lang3.RandomUtils;
import org.apache.hadoop.hbase.ClusterMetrics;
import org.apache.hadoop.hbase.ServerName;
import org.junit.Assert;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Chaos action that picks a random subset of the live region servers as "hoarders", asks for
 * regions to be moved onto them, kills some of the hoarders, forces a balancer run and finally
 * restarts the servers it killed.
 * <p>
 * This action is too specific to put in ChaosMonkey; put it here.
 */
public class UnbalanceKillAndRebalanceAction extends Action {
  private static final Logger LOG =
    LoggerFactory.getLogger(UnbalanceKillAndRebalanceAction.class);

  /**
   * Fractions of servers to get regions and live and die respectively; from all other
   * servers, HOARD_FRC_OF_REGIONS will be removed to the above randomly.
   */
  private static final double FRC_SERVERS_THAT_HOARD_AND_LIVE = 0.1;
  private static final double FRC_SERVERS_THAT_HOARD_AND_DIE = 0.1;
  private static final double HOARD_FRC_OF_REGIONS = 0.8;

  /**
   * Waits between calling unbalance and killing servers, kills and rebalance, and rebalance
   * and restarting the servers; to make sure these events have time to impact the cluster.
   */
  private final long waitForUnbalanceMilliSec;
  private final long waitForKillsMilliSec;
  private final long waitAfterBalanceMilliSec;

  /** When false, the server reported as holding hbase:meta is never killed. */
  private final boolean killMetaRs;

  /**
   * Creates the action.
   *
   * @param waitUnbalance milliseconds to sleep after unbalancing, before killing servers
   * @param waitKill milliseconds to sleep after the kills, before forcing the balancer
   * @param waitAfterBalance milliseconds to sleep after forcing the balancer, before restarting
   *          the killed servers
   * @param killMetaRs whether the server holding hbase:meta may be killed
   */
  public UnbalanceKillAndRebalanceAction(long waitUnbalance, long waitKill, long waitAfterBalance,
      boolean killMetaRs) {
    super();
    waitForUnbalanceMilliSec = waitUnbalance;
    waitForKillsMilliSec = waitKill;
    waitAfterBalanceMilliSec = waitAfterBalance;
    this.killMetaRs = killMetaRs;
  }

  @Override
  protected Logger getLogger() {
    return LOG;
  }

  /**
   * Runs one unbalance / kill / rebalance / restart cycle against the cluster.
   *
   * @throws AssertionError if the hoarding servers would not leave at least one other live server
   * @throws Exception if a wait is interrupted, or propagated from the cluster operations
   */
  @Override
  public void perform() throws Exception {
    ClusterMetrics status = this.cluster.getClusterMetrics();
    // Starts out as every live server; the chosen hoarders are removed from it below, so what
    // remains are the servers that give regions away.
    List<ServerName> victimServers = new LinkedList<>(status.getLiveServerMetrics().keySet());
    Set<ServerName> killedServers = new HashSet<>();

    int liveCount = (int) Math.ceil(FRC_SERVERS_THAT_HOARD_AND_LIVE * victimServers.size());
    int deadCount = (int) Math.ceil(FRC_SERVERS_THAT_HOARD_AND_DIE * victimServers.size());
    int targetCount = liveCount + deadCount;
    Assert.assertTrue(
      "There are not enough victim servers: " + victimServers.size(),
      targetCount < victimServers.size());

    // Sized for every hoarder (those that live and those that die), not just the survivors.
    List<ServerName> targetServers = new ArrayList<>(targetCount);
    for (int i = 0; i < targetCount; ++i) {
      int victimIx = RandomUtils.nextInt(0, victimServers.size());
      targetServers.add(victimServers.remove(victimIx));
    }

    unbalanceRegions(status, victimServers, targetServers, HOARD_FRC_OF_REGIONS);
    Thread.sleep(waitForUnbalanceMilliSec);

    ServerName metaServer = cluster.getServerHoldingMeta();
    for (ServerName targetServer : targetServers) {
      // Don't keep killing servers if we're
      // trying to stop the monkey.
      if (context.isStopping()) {
        break;
      }
      // Kill only the "hoard and die" share of the hoarders. This used to be bounded by
      // liveCount, which is only correct while both fractions happen to be equal.
      if (killedServers.size() >= deadCount) {
        break;
      }

      if (!killMetaRs && targetServer.equals(metaServer)) {
        getLogger().info("Not killing server because it holds hbase:meta.");
      } else {
        killRs(targetServer);
        killedServers.add(targetServer);
      }
    }

    Thread.sleep(waitForKillsMilliSec);
    forceBalancer();
    Thread.sleep(waitAfterBalanceMilliSec);
    // Bring back exactly the servers this action took down.
    for (ServerName server : killedServers) {
      startRs(server);
    }
  }
}