001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019package org.apache.hadoop.hbase.chaos.actions; 020 021import java.util.ArrayList; 022import java.util.HashSet; 023import java.util.LinkedList; 024import java.util.List; 025import java.util.Set; 026import org.apache.commons.lang3.RandomUtils; 027import org.apache.hadoop.hbase.ClusterMetrics; 028import org.apache.hadoop.hbase.ServerName; 029import org.junit.Assert; 030 031/** This action is too specific to put in ChaosMonkey; put it here */ 032public class UnbalanceKillAndRebalanceAction extends Action { 033 /** Fractions of servers to get regions and live and die respectively; from all other 034 * servers, HOARD_FRC_OF_REGIONS will be removed to the above randomly */ 035 private static final double FRC_SERVERS_THAT_HOARD_AND_LIVE = 0.1; 036 private static final double FRC_SERVERS_THAT_HOARD_AND_DIE = 0.1; 037 private static final double HOARD_FRC_OF_REGIONS = 0.8; 038 /** Waits between calling unbalance and killing servers, kills and rebalance, and rebalance 039 * and restarting the servers; to make sure these events have time to impact the cluster. */ 040 private long waitForUnbalanceMilliSec; 041 private long waitForKillsMilliSec; 042 private long waitAfterBalanceMilliSec; 043 private boolean killMetaRs; 044 045 public UnbalanceKillAndRebalanceAction(long waitUnbalance, long waitKill, long waitAfterBalance, 046 boolean killMetaRs) { 047 super(); 048 waitForUnbalanceMilliSec = waitUnbalance; 049 waitForKillsMilliSec = waitKill; 050 waitAfterBalanceMilliSec = waitAfterBalance; 051 this.killMetaRs = killMetaRs; 052 } 053 054 @Override 055 public void perform() throws Exception { 056 ClusterMetrics status = this.cluster.getClusterMetrics(); 057 List<ServerName> victimServers = new LinkedList<>(status.getLiveServerMetrics().keySet()); 058 Set<ServerName> killedServers = new HashSet<>(); 059 060 int liveCount = (int)Math.ceil(FRC_SERVERS_THAT_HOARD_AND_LIVE * victimServers.size()); 061 int deadCount = (int)Math.ceil(FRC_SERVERS_THAT_HOARD_AND_DIE * victimServers.size()); 062 Assert.assertTrue( 063 "There are not enough victim servers: " + victimServers.size(), 064 liveCount + deadCount < victimServers.size()); 065 List<ServerName> targetServers = new ArrayList<>(liveCount); 066 for (int i = 0; i < liveCount + deadCount; ++i) { 067 int victimIx = RandomUtils.nextInt(0, victimServers.size()); 068 targetServers.add(victimServers.remove(victimIx)); 069 } 070 unbalanceRegions(status, victimServers, targetServers, HOARD_FRC_OF_REGIONS); 071 Thread.sleep(waitForUnbalanceMilliSec); 072 ServerName metaServer = cluster.getServerHoldingMeta(); 073 for (ServerName targetServer: targetServers) { 074 // Don't keep killing servers if we're 075 // trying to stop the monkey. 076 if (context.isStopping()) { 077 break; 078 } 079 if (killedServers.size() >= liveCount) { 080 break; 081 } 082 083 if (!killMetaRs && targetServer.equals(metaServer)) { 084 LOG.info("Not killing server because it holds hbase:meta."); 085 } else { 086 killRs(targetServer); 087 killedServers.add(targetServer); 088 } 089 } 090 091 Thread.sleep(waitForKillsMilliSec); 092 forceBalancer(); 093 Thread.sleep(waitAfterBalanceMilliSec); 094 for (ServerName server:killedServers) { 095 startRs(server); 096 } 097 } 098}