001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.hbase.chaos.actions;
020
021import java.util.HashSet;
022import java.util.List;
023import java.util.Set;
024
025import org.apache.hadoop.hbase.ServerName;
026import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
027import org.slf4j.Logger;
028import org.slf4j.LoggerFactory;
029
030/**
031 * Restarts a ratio of the running regionservers at the same time
032 */
033public class BatchRestartRsAction extends RestartActionBaseAction {
034  float ratio; //ratio of regionservers to restart
035  private static final Logger LOG =
036      LoggerFactory.getLogger(BatchRestartRsAction.class);
037
038  public BatchRestartRsAction(long sleepTime, float ratio) {
039    super(sleepTime);
040    this.ratio = ratio;
041  }
042
043  @Override
044  public void perform() throws Exception {
045    LOG.info(String.format("Performing action: Batch restarting %d%% of region servers",
046        (int)(ratio * 100)));
047    List<ServerName> selectedServers = PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(),
048        ratio);
049
050    Set<ServerName> killedServers = new HashSet<>();
051
052    for (ServerName server : selectedServers) {
053      // Don't keep killing servers if we're
054      // trying to stop the monkey.
055      if (context.isStopping()) {
056        break;
057      }
058      LOG.info("Killing region server:" + server);
059      cluster.killRegionServer(server);
060      killedServers.add(server);
061    }
062
063    for (ServerName server : killedServers) {
064      cluster.waitForRegionServerToStop(server, PolicyBasedChaosMonkey.TIMEOUT);
065    }
066
067    LOG.info("Killed " + killedServers.size() + " region servers. Reported num of rs:"
068        + cluster.getClusterMetrics().getLiveServerMetrics().size());
069
070    sleep(sleepTime);
071
072    for (ServerName server : killedServers) {
073      LOG.info("Starting region server:" + server.getHostname());
074      cluster.startRegionServer(server.getHostname(), server.getPort());
075
076    }
077    for (ServerName server : killedServers) {
078      cluster.waitForRegionServerToStart(server.getHostname(),
079          server.getPort(),
080          PolicyBasedChaosMonkey.TIMEOUT);
081    }
082    LOG.info("Started " + killedServers.size() +" region servers. Reported num of rs:"
083        + cluster.getClusterMetrics().getLiveServerMetrics().size());
084  }
085}