001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019package org.apache.hadoop.hbase.chaos.actions; 020 021import java.util.HashSet; 022import java.util.List; 023import java.util.Set; 024 025import org.apache.hadoop.hbase.ServerName; 026import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; 027import org.slf4j.Logger; 028import org.slf4j.LoggerFactory; 029 030/** 031 * Restarts a ratio of the running regionservers at the same time 032 */ 033public class BatchRestartRsAction extends RestartActionBaseAction { 034 float ratio; //ratio of regionservers to restart 035 private static final Logger LOG = 036 LoggerFactory.getLogger(BatchRestartRsAction.class); 037 038 public BatchRestartRsAction(long sleepTime, float ratio) { 039 super(sleepTime); 040 this.ratio = ratio; 041 } 042 043 @Override 044 public void perform() throws Exception { 045 LOG.info(String.format("Performing action: Batch restarting %d%% of region servers", 046 (int)(ratio * 100))); 047 List<ServerName> selectedServers = PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), 048 ratio); 049 050 Set<ServerName> killedServers = new HashSet<>(); 051 052 for (ServerName server : selectedServers) { 053 // Don't keep killing servers if we're 054 // trying to stop the monkey. 055 if (context.isStopping()) { 056 break; 057 } 058 LOG.info("Killing region server:" + server); 059 cluster.killRegionServer(server); 060 killedServers.add(server); 061 } 062 063 for (ServerName server : killedServers) { 064 cluster.waitForRegionServerToStop(server, PolicyBasedChaosMonkey.TIMEOUT); 065 } 066 067 LOG.info("Killed " + killedServers.size() + " region servers. Reported num of rs:" 068 + cluster.getClusterMetrics().getLiveServerMetrics().size()); 069 070 sleep(sleepTime); 071 072 for (ServerName server : killedServers) { 073 LOG.info("Starting region server:" + server.getHostname()); 074 cluster.startRegionServer(server.getHostname(), server.getPort()); 075 076 } 077 for (ServerName server : killedServers) { 078 cluster.waitForRegionServerToStart(server.getHostname(), 079 server.getPort(), 080 PolicyBasedChaosMonkey.TIMEOUT); 081 } 082 LOG.info("Started " + killedServers.size() +" region servers. Reported num of rs:" 083 + cluster.getClusterMetrics().getLiveServerMetrics().size()); 084 } 085}