001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.hbase.chaos.actions;
020
import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import org.apache.commons.lang3.RandomUtils;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.util.Shell;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
032
033/**
034 * Suspend then resume a ratio of the regionservers in a rolling fashion. At each step, either
035 * suspend a server, or resume one, sleeping (sleepTime) in between steps. The parameter
036 * maxSuspendedServers limits the maximum number of servers that can be down at the same time
037 * during rolling restarts.
038 */
039public class RollingBatchSuspendResumeRsAction extends Action {
040  private static final Logger LOG =
041      LoggerFactory.getLogger(RollingBatchSuspendResumeRsAction.class);
042  private final float ratio;
043  private final long sleepTime;
044  private final int maxSuspendedServers; // number of maximum suspended servers at any given time.
045
046  public RollingBatchSuspendResumeRsAction(long sleepTime, float ratio) {
047    this(sleepTime, ratio, 5);
048  }
049
050  public RollingBatchSuspendResumeRsAction(long sleepTime, float ratio, int maxSuspendedServers) {
051    this.ratio = ratio;
052    this.sleepTime = sleepTime;
053    this.maxSuspendedServers = maxSuspendedServers;
054  }
055
056  enum SuspendOrResume {
057    SUSPEND, RESUME
058  }
059
060  @Override protected Logger getLogger() {
061    return LOG;
062  }
063
064  @Override
065  public void perform() throws Exception {
066    getLogger().info("Performing action: Rolling batch restarting {}% of region servers",
067      (int) (ratio * 100));
068    List<ServerName> selectedServers = selectServers();
069
070    Queue<ServerName> serversToBeSuspended = new LinkedList<>(selectedServers);
071    Queue<ServerName> suspendedServers = new LinkedList<>();
072
073    // loop while there are servers to be suspended or suspended servers to be resumed
074    while ((!serversToBeSuspended.isEmpty() || !suspendedServers.isEmpty()) && !context
075        .isStopping()) {
076
077      final SuspendOrResume action;
078      if (serversToBeSuspended.isEmpty()) { // no more servers to suspend
079        action = SuspendOrResume.RESUME;
080      } else if (suspendedServers.isEmpty()) {
081        action = SuspendOrResume.SUSPEND; // no more servers to resume
082      } else if (suspendedServers.size() >= maxSuspendedServers) {
083        // we have too many suspended servers. Don't suspend any more
084        action = SuspendOrResume.RESUME;
085      } else {
086        // do a coin toss
087        action = RandomUtils.nextBoolean() ? SuspendOrResume.SUSPEND : SuspendOrResume.RESUME;
088      }
089
090      ServerName server;
091      switch (action) {
092        case SUSPEND:
093          server = serversToBeSuspended.remove();
094          try {
095            suspendRs(server);
096          } catch (Shell.ExitCodeException e) {
097            LOG.warn("Problem suspending but presume successful; code={}", e.getExitCode(), e);
098          }
099          suspendedServers.add(server);
100          break;
101        case RESUME:
102          server = suspendedServers.remove();
103          try {
104            resumeRs(server);
105          } catch (Shell.ExitCodeException e) {
106            LOG.info("Problem resuming, will retry; code={}", e.getExitCode(), e);
107          }
108          break;
109      }
110
111      getLogger().info("Sleeping for:{}", sleepTime);
112      Threads.sleep(sleepTime);
113    }
114  }
115
116  protected List<ServerName> selectServers() throws IOException {
117    return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
118  }
119
120}