001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.chaos.actions;
019
020import java.io.IOException;
021import java.util.ArrayDeque;
022import java.util.List;
023import java.util.Queue;
024import java.util.Random;
025import java.util.concurrent.ThreadLocalRandom;
026import org.apache.hadoop.hbase.ServerName;
027import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
028import org.apache.hadoop.hbase.util.Threads;
029import org.apache.hadoop.util.Shell;
030import org.slf4j.Logger;
031import org.slf4j.LoggerFactory;
032
/**
 * Suspend then resume a ratio of the regionservers in a rolling fashion. At each step, either
 * suspend a server or resume one, sleeping (sleepTime) in between steps. The parameter
 * maxSuspendedServers limits the maximum number of servers that can be suspended at the same
 * time during the rolling suspend/resume.
 */
039public class RollingBatchSuspendResumeRsAction extends Action {
040  private static final Logger LOG =
041    LoggerFactory.getLogger(RollingBatchSuspendResumeRsAction.class);
042  private final float ratio;
043  private final long sleepTime;
044  private final int maxSuspendedServers; // number of maximum suspended servers at any given time.
045
046  public RollingBatchSuspendResumeRsAction(long sleepTime, float ratio) {
047    this(sleepTime, ratio, 5);
048  }
049
050  public RollingBatchSuspendResumeRsAction(long sleepTime, float ratio, int maxSuspendedServers) {
051    this.ratio = ratio;
052    this.sleepTime = sleepTime;
053    this.maxSuspendedServers = maxSuspendedServers;
054  }
055
056  enum SuspendOrResume {
057    SUSPEND,
058    RESUME
059  }
060
061  @Override
062  protected Logger getLogger() {
063    return LOG;
064  }
065
066  @Override
067  public void perform() throws Exception {
068    getLogger().info("Performing action: Rolling batch restarting {}% of region servers",
069      (int) (ratio * 100));
070    List<ServerName> selectedServers = selectServers();
071    Queue<ServerName> serversToBeSuspended = new ArrayDeque<>(selectedServers);
072    Queue<ServerName> suspendedServers = new ArrayDeque<>();
073    Random rand = ThreadLocalRandom.current();
074    // loop while there are servers to be suspended or suspended servers to be resumed
075    while (
076      (!serversToBeSuspended.isEmpty() || !suspendedServers.isEmpty()) && !context.isStopping()
077    ) {
078
079      final SuspendOrResume action;
080      if (serversToBeSuspended.isEmpty()) { // no more servers to suspend
081        action = SuspendOrResume.RESUME;
082      } else if (suspendedServers.isEmpty()) {
083        action = SuspendOrResume.SUSPEND; // no more servers to resume
084      } else if (suspendedServers.size() >= maxSuspendedServers) {
085        // we have too many suspended servers. Don't suspend any more
086        action = SuspendOrResume.RESUME;
087      } else {
088        // do a coin toss
089        action = rand.nextBoolean() ? SuspendOrResume.SUSPEND : SuspendOrResume.RESUME;
090      }
091
092      ServerName server;
093      switch (action) {
094        case SUSPEND:
095          server = serversToBeSuspended.remove();
096          try {
097            suspendRs(server);
098          } catch (Shell.ExitCodeException e) {
099            LOG.warn("Problem suspending but presume successful; code={}", e.getExitCode(), e);
100          }
101          suspendedServers.add(server);
102          break;
103        case RESUME:
104          server = suspendedServers.remove();
105          try {
106            resumeRs(server);
107          } catch (Shell.ExitCodeException e) {
108            LOG.info("Problem resuming, will retry; code={}", e.getExitCode(), e);
109          }
110          break;
111      }
112
113      getLogger().info("Sleeping for:{}", sleepTime);
114      Threads.sleep(sleepTime);
115    }
116  }
117
118  protected List<ServerName> selectServers() throws IOException {
119    return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
120  }
121
122}