/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.chaos.actions;

import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

import org.apache.commons.lang3.RandomUtils;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Restarts a ratio of the regionservers in a rolling fashion. At each step it either kills a
 * server or starts one, sleeping for a random time between 0 and sleepTime in between steps.
 * The parameter maxDeadServers limits the maximum number of servers that can be down at the
 * same time during rolling restarts.
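 * <p>
 * A minimal construction sketch (the parameter values below are illustrative, not defaults);
 * in practice the action is driven by a chaos monkey policy rather than invoked directly:
 * <pre>{@code
 * // Roll 30% of the regionservers, sleeping up to 5 seconds between steps,
 * // while never having more than 2 servers down at once.
 * Action action = new RollingBatchRestartRsAction(5000, 0.3f, 2);
 * }</pre>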
 */
public class RollingBatchRestartRsAction extends BatchRestartRsAction {
  private static final Logger LOG = LoggerFactory.getLogger(RollingBatchRestartRsAction.class);
  protected int maxDeadServers; // maximum number of dead servers at any given time; defaults to 5

  public RollingBatchRestartRsAction(long sleepTime, float ratio) {
    this(sleepTime, ratio, 5);
  }

  public RollingBatchRestartRsAction(long sleepTime, float ratio, int maxDeadServers) {
    super(sleepTime, ratio);
    this.maxDeadServers = maxDeadServers;
  }

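  /** Whether a step kills a live server or restarts a previously killed one. */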
  enum KillOrStart {
    KILL,
    START
  }

  @Override
  public void perform() throws Exception {
    LOG.info(String.format("Performing action: Rolling batch restarting %d%% of region servers",
        (int)(ratio * 100)));
    List<ServerName> selectedServers = selectServers();

    Queue<ServerName> serversToBeKilled = new LinkedList<>(selectedServers);
    Queue<ServerName> deadServers = new LinkedList<>();

    // loop while there are servers to be killed or dead servers to be restarted
    while ((!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) && !context.isStopping()) {
      KillOrStart action = KillOrStart.KILL;

      if (serversToBeKilled.isEmpty()) { // no more servers to kill
        action = KillOrStart.START;
      } else if (deadServers.isEmpty()) {
        action = KillOrStart.KILL; // no more servers to start
      } else if (deadServers.size() >= maxDeadServers) {
        // we have too many dead servers. Don't kill any more
        action = KillOrStart.START;
      } else {
        // do a coin toss
        action = RandomUtils.nextBoolean() ? KillOrStart.KILL : KillOrStart.START;
      }

      ServerName server;

      switch (action) {
      case KILL:
        server = serversToBeKilled.remove();
        try {
          killRs(server);
        } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
          // We've seen this in test runs where we timeout but the kill went through. HBASE-9743
          // So, add to deadServers even if exception so the start gets called.
          LOG.info("Problem killing but presume successful; code=" + e.getExitCode(), e);
        }
        deadServers.add(server);
        break;
      case START:
        try {
          server = deadServers.remove();
          startRs(server);
        } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
          // The start may fail; better to keep going, though we may lose this server.
          LOG.info("Problem starting, will keep going; code=" + e.getExitCode(), e);
        }
        break;
      }

      sleep(RandomUtils.nextInt(0, (int)sleepTime));
    }
  }

  protected List<ServerName> selectServers() throws IOException {
    return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
  }

  /**
   * Small standalone test to ensure the class basically works.
   * @param args command line arguments; not used
   * @throws Exception if the action fails
   */
  public static void main(final String[] args) throws Exception {
    RollingBatchRestartRsAction action = new RollingBatchRestartRsAction(1, 1.0f) {
      private int invocations = 0;
      @Override
      protected ServerName[] getCurrentServers() throws IOException {
        // Return a fixed set of fake server names so perform() can run without a cluster.
        final int count = 4;
        List<ServerName> serverNames = new ArrayList<>(count);
        for (int i = 0; i < count; i++) {
          serverNames.add(ServerName.valueOf(i + ".example.org", i, i));
        }
        return serverNames.toArray(new ServerName[serverNames.size()]);
      }

      @Override
      protected void killRs(ServerName server) throws IOException {
        LOG.info("Killed " + server);
        // Simulate an occasional kill failure; see the catch block in perform().
        if (this.invocations++ % 3 == 0) {
          throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
        }
      }

      @Override
      protected void startRs(ServerName server) throws IOException {
        LOG.info("Started " + server);
        // Simulate an occasional start failure; see the catch block in perform().
        if (this.invocations++ % 3 == 0) {
          throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
        }
      }
    };

    action.perform();
  }
}