/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.chaos.actions;

import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

import org.apache.commons.lang3.RandomUtils;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Restarts a ratio of the region servers in a rolling fashion. At each step it either kills a
 * server or starts one, sleeping a random amount (0-sleepTime) between steps. The
 * maxDeadServers parameter limits the maximum number of servers that can be down at the same
 * time during rolling restarts.
 */
public class RollingBatchRestartRsAction extends BatchRestartRsAction {
  private static final Logger LOG = LoggerFactory.getLogger(RollingBatchRestartRsAction.class);
  protected int maxDeadServers; // maximum number of dead servers at any given time, defaults to 5

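  /**
   * Create the action with the default limit of at most 5 dead servers at a time.
   * @param sleepTime maximum random sleep between steps
   * @param ratio ratio of region servers to restart
   */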
  public RollingBatchRestartRsAction(long sleepTime, float ratio) {
    this(sleepTime, ratio, 5);
  }

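  /**
   * @param sleepTime maximum random sleep between steps
   * @param ratio ratio of region servers to restart
   * @param maxDeadServers maximum number of servers that may be dead at the same time
   */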
  public RollingBatchRestartRsAction(long sleepTime, float ratio, int maxDeadServers) {
    super(sleepTime, ratio);
    this.maxDeadServers = maxDeadServers;
  }

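  /** The two operations the rolling restart alternates between. */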
  enum KillOrStart {
    KILL,
    START
  }

  @Override
  public void perform() throws Exception {
    LOG.info(String.format("Performing action: Rolling batch restarting %d%% of region servers",
        (int)(ratio * 100)));
    List<ServerName> selectedServers = selectServers();

    Queue<ServerName> serversToBeKilled = new LinkedList<>(selectedServers);
    Queue<ServerName> deadServers = new LinkedList<>();

    // loop while there are servers to be killed or dead servers to be restarted
    while ((!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) && !context.isStopping()) {
      KillOrStart action;

      if (serversToBeKilled.isEmpty()) {
        // no more servers to kill; restart the remaining dead ones
        action = KillOrStart.START;
      } else if (deadServers.isEmpty()) {
        // no dead servers to restart yet; keep killing
        action = KillOrStart.KILL;
      } else if (deadServers.size() >= maxDeadServers) {
        // we have too many dead servers; don't kill any more
        action = KillOrStart.START;
      } else {
        // do a coin toss
        action = RandomUtils.nextBoolean() ? KillOrStart.KILL : KillOrStart.START;
      }

      ServerName server;

      switch (action) {
        case KILL:
          server = serversToBeKilled.remove();
          try {
            killRs(server);
          } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
            // We've seen this in test runs where we time out but the kill went through. HBASE-9743
            // So add the server to deadServers even on exception so that the start still gets called.
            LOG.info("Problem killing but presume successful; code=" + e.getExitCode(), e);
          }
          deadServers.add(server);
          break;
        case START:
          try {
            server = deadServers.remove();
            startRs(server);
          } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
            // The start may fail, but it is better to keep going even though we may lose the server.
            LOG.info("Problem starting, moving on; code=" + e.getExitCode(), e);
          }
          break;
      }

      sleep(RandomUtils.nextInt(0, (int)sleepTime));
    }
  }

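  /**
   * Select the region servers to cycle through. The default picks a random subset of the current
   * servers according to the configured ratio; subclasses may override to narrow the selection.
   */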
  protected List<ServerName> selectServers() throws IOException {
    return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
  }

  /**
   * Small test to ensure the class basically works.
   * @param args command line arguments (unused)
   * @throws Exception if the test run fails
   */
  public static void main(final String[] args) throws Exception {
    RollingBatchRestartRsAction action = new RollingBatchRestartRsAction(1, 1.0f) {
      private int invocations = 0;

      @Override
      protected ServerName[] getCurrentServers() throws IOException {
        final int count = 4;
        List<ServerName> serverNames = new ArrayList<>(count);
        for (int i = 0; i < count; i++) {
          serverNames.add(ServerName.valueOf(i + ".example.org", i, i));
        }
        return serverNames.toArray(new ServerName[serverNames.size()]);
      }

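      // Fail roughly every third invocation with an ExitCodeException to exercise the
      // error handling in perform().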
      @Override
      protected void killRs(ServerName server) throws IOException {
        LOG.info("Killed " + server);
        if (this.invocations++ % 3 == 0) {
          throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
        }
      }

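      // Same failure injection for starts, sharing the invocation counter with killRs above.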
      @Override
      protected void startRs(ServerName server) throws IOException {
        LOG.info("Started " + server);
        if (this.invocations++ % 3 == 0) {
          throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
        }
      }
    };

    action.perform();
  }
}