001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.hbase.chaos.actions;
020
021import java.util.HashSet;
022import java.util.List;
023import java.util.Set;
024import org.apache.hadoop.hbase.ServerName;
025import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
026import org.slf4j.Logger;
027import org.slf4j.LoggerFactory;
028
029/**
030 * Restarts a ratio of the running regionservers at the same time
031 */
032public class BatchRestartRsAction extends RestartActionBaseAction {
033  float ratio; //ratio of regionservers to restart
034  private static final Logger LOG = LoggerFactory.getLogger(BatchRestartRsAction.class);
035
036  public BatchRestartRsAction(long sleepTime, float ratio) {
037    super(sleepTime);
038    this.ratio = ratio;
039  }
040
041  @Override protected Logger getLogger() {
042    return LOG;
043  }
044
045  @Override
046  public void perform() throws Exception {
047    getLogger().info(String.format("Performing action: Batch restarting %d%% of region servers",
048        (int)(ratio * 100)));
049    List<ServerName> selectedServers = PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(),
050        ratio);
051
052    Set<ServerName> killedServers = new HashSet<>();
053
054    for (ServerName server : selectedServers) {
055      // Don't keep killing servers if we're
056      // trying to stop the monkey.
057      if (context.isStopping()) {
058        break;
059      }
060      getLogger().info("Killing region server:" + server);
061      cluster.killRegionServer(server);
062      killedServers.add(server);
063    }
064
065    for (ServerName server : killedServers) {
066      cluster.waitForRegionServerToStop(server, PolicyBasedChaosMonkey.TIMEOUT);
067    }
068
069    getLogger().info("Killed " + killedServers.size() + " region servers. Reported num of rs:"
070        + cluster.getClusterMetrics().getLiveServerMetrics().size());
071
072    sleep(sleepTime);
073
074    for (ServerName server : killedServers) {
075      getLogger().info("Starting region server:" + server.getHostname());
076      cluster.startRegionServer(server.getHostname(), server.getPort());
077
078    }
079    for (ServerName server : killedServers) {
080      cluster.waitForRegionServerToStart(server.getHostname(),
081          server.getPort(),
082          PolicyBasedChaosMonkey.TIMEOUT);
083    }
084    getLogger().info("Started " + killedServers.size() +" region servers. Reported num of rs:"
085        + cluster.getClusterMetrics().getLiveServerMetrics().size());
086  }
087}