/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.chaos.actions;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Objects;
import java.util.Queue;
import org.apache.commons.lang3.RandomUtils;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Restarts a ratio of the regionservers in a rolling fashion. At each step it either kills a
 * server or starts one, sleeping a random amount (0-sleepTime) between steps. The parameter
 * maxDeadServers caps how many servers may be down at the same time during the rolling restart.
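 * <p>
 * A minimal usage sketch; the sleep time, ratio, and dead-server cap below are illustrative
 * values, not recommendations:
 *
 * <pre>
 * // roll through 30% of the region servers, sleeping up to 10 seconds between steps,
 * // with at most 2 servers down at any one time
 * Action action = new RollingBatchRestartRsAction(10000, 0.3f, 2);
 * </pre>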
 */
public class RollingBatchRestartRsAction extends BatchRestartRsAction {
  private static final Logger LOG = LoggerFactory.getLogger(RollingBatchRestartRsAction.class);
  protected int maxDeadServers; // maximum number of dead servers at any given time; defaults to 5

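  /**
   * Creates the action with the default cap of at most 5 dead servers at any one time.
   */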
  public RollingBatchRestartRsAction(long sleepTime, float ratio) {
    this(sleepTime, ratio, 5);
  }

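  /**
   * @param sleepTime      upper bound on the random sleep between steps
   * @param ratio          ratio of the region servers to restart
   * @param maxDeadServers maximum number of servers allowed to be dead at the same time
   */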
  public RollingBatchRestartRsAction(long sleepTime, float ratio, int maxDeadServers) {
    super(sleepTime, ratio);
    this.maxDeadServers = maxDeadServers;
  }

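  /**
   * As above, but when {@code skipMetaRS} is true the region server carrying hbase:meta is left
   * out of the restart.
   */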
  public RollingBatchRestartRsAction(long sleepTime, float ratio, int maxDeadServers,
    boolean skipMetaRS) {
    this(sleepTime, ratio, maxDeadServers);
    this.skipMetaRS = skipMetaRS;
  }

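  /** The two operations each step of the rolling restart chooses between. */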
  enum KillOrStart {
    KILL,
    START
  }

  @Override
  protected Logger getLogger() {
    return LOG;
  }

  @Override
  public void perform() throws Exception {
    getLogger().info("Performing action: Rolling batch restarting {}% of region servers",
        (int)(ratio * 100));
    List<ServerName> selectedServers = selectServers();

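    // serversToBeKilled holds selected servers not yet killed; deadServers holds servers killed
    // but not yet restarted. The loop drains both, never letting deadServers grow past
    // maxDeadServers.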
    Queue<ServerName> serversToBeKilled = new LinkedList<>(selectedServers);
    LinkedList<ServerName> deadServers = new LinkedList<>();

    // loop while there are servers to be killed or dead servers to be restarted
    while ((!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) && !context.isStopping()) {

      final KillOrStart action;
      if (serversToBeKilled.isEmpty()) { // no more servers to kill
        action = KillOrStart.START;
      } else if (deadServers.isEmpty()) { // no more servers to start
        action = KillOrStart.KILL;
      } else if (deadServers.size() >= maxDeadServers) {
        // we have too many dead servers. Don't kill any more
        action = KillOrStart.START;
      } else {
        // do a coin toss
        action = RandomUtils.nextBoolean() ? KillOrStart.KILL : KillOrStart.START;
      }

      ServerName server;

      switch (action) {
        case KILL:
          server = serversToBeKilled.remove();
          try {
            killRs(server);
          } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
            // We've seen this in test runs where we time out but the kill went through
            // (HBASE-9743), so add to deadServers even on exception so the start still gets called.
            getLogger().info("Problem killing but presume successful; code={}", e.getExitCode(), e);
          }
          deadServers.add(server);
          break;
        case START:
          server = Objects.requireNonNull(deadServers.peek());
          try {
            startRs(server);
            // only remove the server from the known dead list if `startRs` succeeds.
            deadServers.remove(server);
          } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
            // The start may fail, but better to just keep going, though we may lose a server.
            // Shuffle the dead list to avoid getting stuck on a single stubborn host.
            Collections.shuffle(deadServers);
            getLogger().info(
              "Problem starting {}, will retry; code={}", server, e.getExitCode(), e);
          }
          break;
      }

      sleep(RandomUtils.nextInt(0, (int)sleepTime));
    }
  }

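  /**
   * Selects the servers to roll through: a random {@code ratio} fraction of the current region
   * servers.
   */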
  protected List<ServerName> selectServers() throws IOException {
    return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
  }

  /**
   * Small test to ensure the class basically works.
   */
  public static void main(final String[] args) throws Exception {
    RollingBatchRestartRsAction action = new RollingBatchRestartRsAction(1, 1.0f) {
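      // Stand-in subclass for a standalone smoke test: serves a fixed list of fake servers and
      // never touches a real cluster.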
      private int invocations = 0;

      @Override
      protected ServerName[] getCurrentServers() {
        final int count = 4;
        List<ServerName> serverNames = new ArrayList<>(count);
        for (int i = 0; i < count; i++) {
          serverNames.add(ServerName.valueOf(i + ".example.org", i, i));
        }
        return serverNames.toArray(new ServerName[0]);
      }

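      // Every third kill/start call (shared counter) throws, exercising the ExitCodeException
      // handling in perform().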
      @Override
      protected void killRs(ServerName server) throws IOException {
        LOG.info("Killed {}", server);
        if (this.invocations++ % 3 == 0) {
          throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
        }
      }

      @Override
      protected void startRs(ServerName server) throws IOException {
        LOG.info("Started {}", server);
        if (this.invocations++ % 3 == 0) {
          throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
        }
      }
    };

    action.perform();
  }
}