/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.chaos.actions;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import org.apache.commons.lang3.RandomUtils;
import org.apache.hadoop.hbase.ClusterMetrics;
import org.apache.hadoop.hbase.ServerName;
import org.junit.Assert;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Chaos action that skews region placement onto a few randomly chosen servers, kills some of
 * those servers, forces a balancer run, and finally restarts the servers it killed.
 * This action is too specific to put in ChaosMonkey; put it here.
 */
public class UnbalanceKillAndRebalanceAction extends Action {
  private static final Logger LOG =
      LoggerFactory.getLogger(UnbalanceKillAndRebalanceAction.class);

  /**
   * Fractions of the live servers that are picked to hoard regions: the first fraction is meant
   * to hoard and stay alive, the second to hoard and then be killed. From every other server,
   * HOARD_FRC_OF_REGIONS of the regions are moved onto the hoarding servers at random.
   */
  private static final double FRC_SERVERS_THAT_HOARD_AND_LIVE = 0.1;
  private static final double FRC_SERVERS_THAT_HOARD_AND_DIE = 0.1;
  private static final double HOARD_FRC_OF_REGIONS = 0.8;

  /**
   * Waits between unbalancing and killing servers, between the kills and the rebalance, and
   * between the rebalance and restarting the servers; they give each event time to impact the
   * cluster.
   */
  private final long waitForUnbalanceMilliSec;
  private final long waitForKillsMilliSec;
  private final long waitAfterBalanceMilliSec;
  /** Whether the server currently holding hbase:meta may be killed. */
  private final boolean killMetaRs;

  /**
   * @param waitUnbalance milliseconds to sleep after unbalancing, before killing servers
   * @param waitKill milliseconds to sleep after the kills, before forcing the balancer
   * @param waitAfterBalance milliseconds to sleep after the balancer call, before restarting
   *          the killed servers
   * @param killMetaRs if false, the server holding hbase:meta is never killed
   */
  public UnbalanceKillAndRebalanceAction(long waitUnbalance, long waitKill, long waitAfterBalance,
      boolean killMetaRs) {
    super();
    waitForUnbalanceMilliSec = waitUnbalance;
    waitForKillsMilliSec = waitKill;
    waitAfterBalanceMilliSec = waitAfterBalance;
    this.killMetaRs = killMetaRs;
  }

  /**
   * Runs one unbalance / kill / rebalance / restart cycle against the cluster.
   *
   * @throws Exception if any cluster operation fails or a sleep is interrupted
   */
  @Override
  public void perform() throws Exception {
    ClusterMetrics status = this.cluster.getClusterMetrics();
    List<ServerName> victimServers = new LinkedList<>(status.getLiveServerMetrics().keySet());
    Set<ServerName> killedServers = new HashSet<>();

    int liveCount = (int)Math.ceil(FRC_SERVERS_THAT_HOARD_AND_LIVE * victimServers.size());
    int deadCount = (int)Math.ceil(FRC_SERVERS_THAT_HOARD_AND_DIE * victimServers.size());
    Assert.assertTrue(
        "There are not enough victim servers: " + victimServers.size(),
        liveCount + deadCount < victimServers.size());

    // Randomly move liveCount + deadCount servers out of the victim list; these become the
    // hoarding targets. The list is sized for everything it will actually hold.
    int targetCount = liveCount + deadCount;
    List<ServerName> targetServers = new ArrayList<>(targetCount);
    for (int i = 0; i < targetCount; ++i) {
      int victimIx = RandomUtils.nextInt(0, victimServers.size());
      targetServers.add(victimServers.remove(victimIx));
    }
    unbalanceRegions(status, victimServers, targetServers, HOARD_FRC_OF_REGIONS);
    Thread.sleep(waitForUnbalanceMilliSec);

    ServerName metaServer = cluster.getServerHoldingMeta();
    for (ServerName targetServer: targetServers) {
      // Don't keep killing servers if we're
      // trying to stop the monkey.
      if (context.isStopping()) {
        break;
      }
      // Only the "hoard and die" share of the targets gets killed; the cap is therefore
      // deadCount, not liveCount (the two only coincide while both fractions are equal).
      if (killedServers.size() >= deadCount) {
        break;
      }

      if (!killMetaRs && targetServer.equals(metaServer)) {
        // Skipped servers do not count towards the cap, so a later target may be killed instead.
        LOG.info("Not killing server because it holds hbase:meta.");
      } else {
        killRs(targetServer);
        killedServers.add(targetServer);
      }
    }

    Thread.sleep(waitForKillsMilliSec);
    forceBalancer();
    Thread.sleep(waitAfterBalanceMilliSec);
    // Bring back exactly the servers this action killed.
    for (ServerName server: killedServers) {
      startRs(server);
    }
  }
}