/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.chaos.actions;

import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Restarts a ratio of the running regionservers at the same time.
 * <p>
 * The selected servers are all killed first, then (after a pause) all started again, so the
 * cluster runs without the whole batch for the duration of the pause.
 */
public class BatchRestartRsAction extends RestartActionBaseAction {
  private static final Logger LOG = LoggerFactory.getLogger(BatchRestartRsAction.class);

  /**
   * Ratio of regionservers to restart. Multiplied by 100 for the log message, so presumably a
   * fraction in [0, 1] (not validated here). Visibility is intentionally left package-private,
   * as in the original declaration, since code outside this file may read it.
   */
  float ratio;

  /**
   * Creates the action.
   * @param sleepTime value handed to the superclass constructor; {@link #perform()} pauses for
   *                  {@code sleepTime} between killing and restarting the servers
   * @param ratio     ratio of the current regionservers to restart in one batch
   */
  public BatchRestartRsAction(long sleepTime, float ratio) {
    super(sleepTime);
    this.ratio = ratio;
  }

  @Override
  protected Logger getLogger() {
    return LOG;
  }

  /**
   * Kills a random selection of the current regionservers, waits for them to stop, sleeps, then
   * starts the killed servers again and waits for them to come back.
   * <p>
   * If the chaos monkey is stopping, no further servers are killed, but every server that was
   * already killed still goes through the wait / sleep / restart sequence below.
   * @throws Exception if any of the cluster operations fails or times out
   */
  @Override
  public void perform() throws Exception {
    getLogger().info("Performing action: Batch restarting {}% of region servers",
      (int) (ratio * 100));
    List<ServerName> selectedServers =
      PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);

    // Only servers that were actually killed are tracked, so a stop request in the middle of
    // the batch never causes a "restart" of a server this action did not take down.
    Set<ServerName> killedServers = new HashSet<>();

    for (ServerName server : selectedServers) {
      // Don't keep killing servers if we're
      // trying to stop the monkey.
      if (context.isStopping()) {
        break;
      }
      getLogger().info("Killing region server:{}", server);
      cluster.killRegionServer(server);
      killedServers.add(server);
    }

    // Issue all kills before waiting on any of them, so the servers go down together.
    for (ServerName server : killedServers) {
      cluster.waitForRegionServerToStop(server, PolicyBasedChaosMonkey.TIMEOUT);
    }

    getLogger().info("Killed {} region servers. Reported num of rs:{}", killedServers.size(),
      cluster.getClusterMetrics().getLiveServerMetrics().size());

    sleep(sleepTime);

    // Same two-phase shape for the restart: request every start, then wait for each one.
    for (ServerName server : killedServers) {
      getLogger().info("Starting region server:{}", server.getHostname());
      cluster.startRegionServer(server.getHostname(), server.getPort());
    }
    for (ServerName server : killedServers) {
      cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(),
        PolicyBasedChaosMonkey.TIMEOUT);
    }
    getLogger().info("Started {} region servers. Reported num of rs:{}", killedServers.size(),
      cluster.getClusterMetrics().getLiveServerMetrics().size());
  }
}