001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019package org.apache.hadoop.hbase.chaos.actions; 020 021import java.io.IOException; 022import java.util.LinkedList; 023import java.util.List; 024import java.util.Queue; 025import org.apache.commons.lang3.RandomUtils; 026import org.apache.hadoop.hbase.ServerName; 027import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; 028import org.apache.hadoop.hbase.util.Threads; 029import org.apache.hadoop.util.Shell; 030import org.slf4j.Logger; 031import org.slf4j.LoggerFactory; 032 033/** 034 * Suspend then resume a ratio of the regionservers in a rolling fashion. At each step, either 035 * suspend a server, or resume one, sleeping (sleepTime) in between steps. The parameter 036 * maxSuspendedServers limits the maximum number of servers that can be down at the same time 037 * during rolling restarts. 038 */ 039public class RollingBatchSuspendResumeRsAction extends Action { 040 private static final Logger LOG = 041 LoggerFactory.getLogger(RollingBatchSuspendResumeRsAction.class); 042 private final float ratio; 043 private final long sleepTime; 044 private final int maxSuspendedServers; // number of maximum suspended servers at any given time. 045 046 public RollingBatchSuspendResumeRsAction(long sleepTime, float ratio) { 047 this(sleepTime, ratio, 5); 048 } 049 050 public RollingBatchSuspendResumeRsAction(long sleepTime, float ratio, int maxSuspendedServers) { 051 this.ratio = ratio; 052 this.sleepTime = sleepTime; 053 this.maxSuspendedServers = maxSuspendedServers; 054 } 055 056 enum SuspendOrResume { 057 SUSPEND, RESUME 058 } 059 060 @Override protected Logger getLogger() { 061 return LOG; 062 } 063 064 @Override 065 public void perform() throws Exception { 066 getLogger().info("Performing action: Rolling batch restarting {}% of region servers", 067 (int) (ratio * 100)); 068 List<ServerName> selectedServers = selectServers(); 069 070 Queue<ServerName> serversToBeSuspended = new LinkedList<>(selectedServers); 071 Queue<ServerName> suspendedServers = new LinkedList<>(); 072 073 // loop while there are servers to be suspended or suspended servers to be resumed 074 while ((!serversToBeSuspended.isEmpty() || !suspendedServers.isEmpty()) && !context 075 .isStopping()) { 076 077 final SuspendOrResume action; 078 if (serversToBeSuspended.isEmpty()) { // no more servers to suspend 079 action = SuspendOrResume.RESUME; 080 } else if (suspendedServers.isEmpty()) { 081 action = SuspendOrResume.SUSPEND; // no more servers to resume 082 } else if (suspendedServers.size() >= maxSuspendedServers) { 083 // we have too many suspended servers. Don't suspend any more 084 action = SuspendOrResume.RESUME; 085 } else { 086 // do a coin toss 087 action = RandomUtils.nextBoolean() ? SuspendOrResume.SUSPEND : SuspendOrResume.RESUME; 088 } 089 090 ServerName server; 091 switch (action) { 092 case SUSPEND: 093 server = serversToBeSuspended.remove(); 094 try { 095 suspendRs(server); 096 } catch (Shell.ExitCodeException e) { 097 LOG.warn("Problem suspending but presume successful; code={}", e.getExitCode(), e); 098 } 099 suspendedServers.add(server); 100 break; 101 case RESUME: 102 server = suspendedServers.remove(); 103 try { 104 resumeRs(server); 105 } catch (Shell.ExitCodeException e) { 106 LOG.info("Problem resuming, will retry; code={}", e.getExitCode(), e); 107 } 108 break; 109 } 110 111 getLogger().info("Sleeping for:{}", sleepTime); 112 Threads.sleep(sleepTime); 113 } 114 } 115 116 protected List<ServerName> selectServers() throws IOException { 117 return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio); 118 } 119 120}