001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.hbase.chaos.actions;
020
021import java.util.HashSet;
022import java.util.List;
023import java.util.Set;
024import org.apache.hadoop.hbase.ServerName;
025import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
026
027/**
028 * Restarts a ratio of the running regionservers at the same time
029 */
030public class BatchRestartRsAction extends RestartActionBaseAction {
031  float ratio; //ratio of regionservers to restart
032
033  public BatchRestartRsAction(long sleepTime, float ratio) {
034    super(sleepTime);
035    this.ratio = ratio;
036  }
037
038  @Override
039  public void perform() throws Exception {
040    LOG.info(String.format("Performing action: Batch restarting %d%% of region servers",
041        (int)(ratio * 100)));
042    List<ServerName> selectedServers = PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(),
043        ratio);
044
045    Set<ServerName> killedServers = new HashSet<>();
046
047    for (ServerName server : selectedServers) {
048      // Don't keep killing servers if we're
049      // trying to stop the monkey.
050      if (context.isStopping()) {
051        break;
052      }
053      LOG.info("Killing region server:" + server);
054      cluster.killRegionServer(server);
055      killedServers.add(server);
056    }
057
058    for (ServerName server : killedServers) {
059      cluster.waitForRegionServerToStop(server, PolicyBasedChaosMonkey.TIMEOUT);
060    }
061
062    LOG.info("Killed " + killedServers.size() + " region servers. Reported num of rs:"
063        + cluster.getClusterMetrics().getLiveServerMetrics().size());
064
065    sleep(sleepTime);
066
067    for (ServerName server : killedServers) {
068      LOG.info("Starting region server:" + server.getHostname());
069      cluster.startRegionServer(server.getHostname(), server.getPort());
070
071    }
072    for (ServerName server : killedServers) {
073      cluster.waitForRegionServerToStart(server.getHostname(),
074          server.getPort(),
075          PolicyBasedChaosMonkey.TIMEOUT);
076    }
077    LOG.info("Started " + killedServers.size() +" region servers. Reported num of rs:"
078        + cluster.getClusterMetrics().getLiveServerMetrics().size());
079  }
080}