/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.chaos.actions;

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Objects;
import java.util.Queue;
import java.util.Random;
import java.util.concurrent.ThreadLocalRandom;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Restarts a ratio of the region servers in a rolling fashion. At each step it either kills a
 * server or starts one back up, sleeping a random amount (0 to sleepTime) between steps. The
 * maxDeadServers parameter caps the number of servers that can be down at the same time during
 * the rolling restart.
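 * <p>
 * For example, to restart roughly half of the region servers while keeping at most two down at a
 * time, sleeping up to a minute between steps (the argument values are purely illustrative):
 *
 * <pre>
 * RollingBatchRestartRsAction action = new RollingBatchRestartRsAction(60_000, 0.5f, 2);
 * </pre>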
 */
public class RollingBatchRestartRsAction extends BatchRestartRsAction {
  private static final Logger LOG = LoggerFactory.getLogger(RollingBatchRestartRsAction.class);
  protected int maxDeadServers; // maximum number of dead servers at any given time; defaults to 5

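  /**
   * @param sleepTime upper bound, in milliseconds, of the random sleep between steps
   * @param ratio     ratio of the current region servers to restart
   */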
  public RollingBatchRestartRsAction(long sleepTime, float ratio) {
    this(sleepTime, ratio, 5);
  }

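  /**
   * @param maxDeadServers maximum number of servers allowed to be down at the same time
   */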
  public RollingBatchRestartRsAction(long sleepTime, float ratio, int maxDeadServers) {
    super(sleepTime, ratio);
    this.maxDeadServers = maxDeadServers;
  }

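  /**
   * @param skipMetaRS whether to skip the region server carrying hbase:meta when choosing servers
   *                   to restart
   */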
  public RollingBatchRestartRsAction(long sleepTime, float ratio, int maxDeadServers,
    boolean skipMetaRS) {
    this(sleepTime, ratio, maxDeadServers);
    this.skipMetaRS = skipMetaRS;
  }

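  /** Whether a step of the rolling restart kills a live server or starts a dead one. */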
  enum KillOrStart {
    KILL,
    START
  }

  @Override
  protected Logger getLogger() {
    return LOG;
  }

  @Override
  // deadServers is both list and queue here, a valid use case for LinkedList
  @SuppressWarnings("JdkObsolete")
  public void perform() throws Exception {
    getLogger().info("Performing action: Rolling batch restarting {}% of region servers",
      (int) (ratio * 100));
    List<ServerName> selectedServers = selectServers();
    Queue<ServerName> serversToBeKilled = new ArrayDeque<>(selectedServers);
    LinkedList<ServerName> deadServers = new LinkedList<>();
    Random rand = ThreadLocalRandom.current();
    // loop while there are servers to be killed or dead servers to be restarted
    while ((!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) && !context.isStopping()) {

      final KillOrStart action;
      if (serversToBeKilled.isEmpty()) { // no more servers to kill
        action = KillOrStart.START;
      } else if (deadServers.isEmpty()) {
        action = KillOrStart.KILL; // no more servers to start
      } else if (deadServers.size() >= maxDeadServers) {
        // we have too many dead servers. Don't kill any more
        action = KillOrStart.START;
      } else {
        // do a coin toss
        action = rand.nextBoolean() ? KillOrStart.KILL : KillOrStart.START;
      }

      ServerName server;

      switch (action) {
        case KILL:
          server = serversToBeKilled.remove();
          try {
            killRs(server);
          } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
            // We've seen this in test runs where we timeout but the kill went through. HBASE-9743
            // So, add to deadServers even if exception so the start gets called.
            getLogger().info("Problem killing but presume successful; code={}", e.getExitCode(), e);
          }
          deadServers.add(server);
          break;
        case START:
          server = Objects.requireNonNull(deadServers.peek());
          try {
            startRs(server);
            // only remove the server from the known dead list if `startRs` succeeds.
            deadServers.remove(server);
          } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
            // The start may fail, but it is better to keep going even though we may lose a server.
            // Shuffle the dead list to avoid getting stuck on a single stubborn host.
            Collections.shuffle(deadServers);
            getLogger().info("Problem starting {}, will retry; code={}", server, e.getExitCode(),
              e);
          }
          break;
      }

      sleep(rand.nextInt((int) sleepTime));
    }
  }

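  /**
   * Selects the servers to restart: a random subset of the current region servers, sized by
   * {@code ratio}.
   */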
  protected List<ServerName> selectServers() throws IOException {
    return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
  }

  /**
   * Small test to ensure the class basically works.
   */
  public static void main(final String[] args) throws Exception {
    RollingBatchRestartRsAction action = new RollingBatchRestartRsAction(1, 1.0f) {
      private int invocations = 0;

      @Override
      protected ServerName[] getCurrentServers() {
        final int count = 4;
        List<ServerName> serverNames = new ArrayList<>(count);
        for (int i = 0; i < count; i++) {
          serverNames.add(ServerName.valueOf(i + ".example.org", i, i));
        }
        return serverNames.toArray(new ServerName[0]);
      }

      @Override
      protected void killRs(ServerName server) throws IOException {
        LOG.info("Killed {}", server);
        if (this.invocations++ % 3 == 0) {
          throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
        }
      }

      @Override
      protected void startRs(ServerName server) throws IOException {
        LOG.info("Started {}", server);
        if (this.invocations++ % 3 == 0) {
          throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
        }
      }
    };

    action.perform();
  }
}