/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.chaos.actions;

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Random;
import java.util.concurrent.ThreadLocalRandom;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.util.Shell;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Suspend then resume a ratio of the regionservers in a rolling fashion. At each step, either
 * suspend a server, or resume one, sleeping (sleepTime) in between steps. The parameter
 * maxSuspendedServers limits the maximum number of servers that can be suspended at the same time
 * during the rolling suspend/resume.
 * <p>
 * A resume that fails with a {@link Shell.ExitCodeException} is re-queued and retried on a later
 * step, up to {@link #MAX_RESUME_ATTEMPTS} attempts per server, after which the server is dropped
 * from the queue with a warning.
 */
public class RollingBatchSuspendResumeRsAction extends Action {
  private static final Logger LOG =
    LoggerFactory.getLogger(RollingBatchSuspendResumeRsAction.class);

  /** Cap on concurrently suspended servers used by the two-argument constructor. */
  private static final int DEFAULT_MAX_SUSPENDED_SERVERS = 5;

  /**
   * Total number of resume attempts made for one server before giving up. Bounded so that a server
   * which can never be resumed does not keep {@link #perform()} looping indefinitely.
   */
  private static final int MAX_RESUME_ATTEMPTS = 3;

  private final float ratio;
  private final long sleepTime;
  private final int maxSuspendedServers; // number of maximum suspended servers at any given time.

  /**
   * @param sleepTime time passed to {@link Threads#sleep(long)} after every suspend/resume step
   * @param ratio     fraction of the current servers to select; logged as a percentage
   */
  public RollingBatchSuspendResumeRsAction(long sleepTime, float ratio) {
    this(sleepTime, ratio, DEFAULT_MAX_SUSPENDED_SERVERS);
  }

  /**
   * @param sleepTime           time passed to {@link Threads#sleep(long)} after every step
   * @param ratio               fraction of the current servers to select; logged as a percentage
   * @param maxSuspendedServers once this many servers are suspended, the next step is a resume
   */
  public RollingBatchSuspendResumeRsAction(long sleepTime, float ratio, int maxSuspendedServers) {
    this.ratio = ratio;
    this.sleepTime = sleepTime;
    this.maxSuspendedServers = maxSuspendedServers;
  }

  /** The two kinds of step the rolling loop can take. */
  enum SuspendOrResume {
    SUSPEND,
    RESUME
  }

  @Override
  protected Logger getLogger() {
    return LOG;
  }

  /**
   * Runs the rolling suspend/resume over the servers returned by {@link #selectServers()}. Loops
   * until every selected server has been suspended and taken off the resume queue, or until the
   * action context reports that it is stopping.
   * <p>
   * NOTE(review): when the loop exits because the context is stopping, servers still in the
   * suspended queue are not resumed here - confirm that cleanup happens elsewhere.
   * @throws Exception anything thrown by server selection or by the suspend/resume calls other
   *                   than {@link Shell.ExitCodeException}, which is handled per step
   */
  @Override
  public void perform() throws Exception {
    getLogger().info("Performing action: Rolling batch suspending {}% of region servers",
      (int) (ratio * 100));
    List<ServerName> selectedServers = selectServers();
    Queue<ServerName> serversToBeSuspended = new ArrayDeque<>(selectedServers);
    Queue<ServerName> suspendedServers = new ArrayDeque<>();
    // Failed resume attempts per server; an entry exists only while that server awaits a retry.
    Map<ServerName, Integer> resumeAttempts = new HashMap<>();
    Random rand = ThreadLocalRandom.current();
    // loop while there are servers to be suspended or suspended servers to be resumed
    while (
      (!serversToBeSuspended.isEmpty() || !suspendedServers.isEmpty()) && !context.isStopping()
    ) {
      ServerName server;
      switch (chooseAction(serversToBeSuspended, suspendedServers, rand)) {
        case SUSPEND:
          server = serversToBeSuspended.remove();
          try {
            suspendRs(server);
          } catch (Shell.ExitCodeException e) {
            getLogger().warn("Problem suspending but presume successful; code={}",
              e.getExitCode(), e);
          }
          // Queued for resume even after a failed suspend: the suspend is presumed successful.
          suspendedServers.add(server);
          break;
        case RESUME:
          server = suspendedServers.remove();
          try {
            resumeRs(server);
            resumeAttempts.remove(server);
          } catch (Shell.ExitCodeException e) {
            int attempts = resumeAttempts.merge(server, 1, Integer::sum);
            if (attempts < MAX_RESUME_ATTEMPTS) {
              getLogger().info("Problem resuming, will retry; code={}", e.getExitCode(), e);
              // Back of the queue, so other suspended servers get their turn first.
              suspendedServers.add(server);
            } else {
              getLogger().warn("Giving up resuming {} after {} attempts; code={}", server,
                attempts, e.getExitCode(), e);
              resumeAttempts.remove(server);
            }
          }
          break;
      }

      getLogger().info("Sleeping for:{}", sleepTime);
      Threads.sleep(sleepTime);
    }
  }

  /**
   * Decides whether the next step suspends or resumes a server. The caller guarantees that at
   * least one of the two queues is non-empty.
   */
  private SuspendOrResume chooseAction(Queue<ServerName> serversToBeSuspended,
    Queue<ServerName> suspendedServers, Random rand) {
    if (serversToBeSuspended.isEmpty()) {
      // no more servers to suspend
      return SuspendOrResume.RESUME;
    }
    if (suspendedServers.isEmpty()) {
      // no more servers to resume
      return SuspendOrResume.SUSPEND;
    }
    if (suspendedServers.size() >= maxSuspendedServers) {
      // we have too many suspended servers. Don't suspend any more
      return SuspendOrResume.RESUME;
    }
    // do a coin toss
    return rand.nextBoolean() ? SuspendOrResume.SUSPEND : SuspendOrResume.RESUME;
  }

  /**
   * Picks the servers this run operates on: a random selection from the current servers, sized by
   * {@code ratio}. Protected so that subclasses can supply a different selection.
   * @throws IOException if the current server list cannot be obtained
   */
  protected List<ServerName> selectServers() throws IOException {
    return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
  }

}