/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.chaos.actions;

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.List;
import java.util.Queue;

import org.apache.commons.lang3.RandomUtils;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Restarts a ratio of the regionservers in a rolling fashion. At each step, either kills a
 * server, or starts one, sleeping randomly (0-sleepTime) in between steps.
 * The parameter maxDeadServers limits the maximum number of servers that
 * can be down at the same time during rolling restarts.
 */
public class RollingBatchRestartRsAction extends BatchRestartRsAction {
  private static final Logger LOG = LoggerFactory.getLogger(RollingBatchRestartRsAction.class);

  /** Maximum number of dead servers allowed at any given time. Defaults to 5. */
  protected int maxDeadServers;

  public RollingBatchRestartRsAction(long sleepTime, float ratio) {
    this(sleepTime, ratio, 5);
  }

  public RollingBatchRestartRsAction(long sleepTime, float ratio, int maxDeadServers) {
    super(sleepTime, ratio);
    this.maxDeadServers = maxDeadServers;
  }

  /** The two operations the rolling-restart loop can take at each step. */
  enum KillOrStart {
    KILL,
    START
  }

  @Override
  public void perform() throws Exception {
    // Parameterized logging: no String.format work when INFO is disabled.
    LOG.info("Performing action: Rolling batch restarting {}% of region servers",
      (int)(ratio * 100));
    List<ServerName> selectedServers = selectServers();

    // ArrayDeque is the preferred FIFO Queue implementation (cheaper than LinkedList).
    Queue<ServerName> serversToBeKilled = new ArrayDeque<>(selectedServers);
    Queue<ServerName> deadServers = new ArrayDeque<>();

    // loop while there are servers to be killed or dead servers to be restarted
    while ((!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) && !context.isStopping()) {
      final KillOrStart action;

      if (serversToBeKilled.isEmpty()) { // no more servers to kill
        action = KillOrStart.START;
      } else if (deadServers.isEmpty()) {
        action = KillOrStart.KILL; // no more servers to start
      } else if (deadServers.size() >= maxDeadServers) {
        // we have too many dead servers. Don't kill any more
        action = KillOrStart.START;
      } else {
        // do a coin toss
        action = RandomUtils.nextBoolean() ? KillOrStart.KILL : KillOrStart.START;
      }

      ServerName server;

      switch (action) {
        case KILL:
          server = serversToBeKilled.remove();
          try {
            killRs(server);
          } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
            // We've seen this in test runs where we timeout but the kill went through. HBASE-9743
            // So, add to deadServers even if exception so the start gets called.
            LOG.info("Problem killing but presume successful; code={}", e.getExitCode(), e);
          }
          deadServers.add(server);
          break;
        case START:
          try {
            server = deadServers.remove();
            startRs(server);
          } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
            // The start may fail but better to just keep going though we may lose server.
            //
            LOG.info("Problem starting, will retry; code={}", e.getExitCode(), e);
          }
          break;
      }

      sleep(RandomUtils.nextInt(0, (int)sleepTime));
    }
  }

  /**
   * Picks the servers to restart: a random {@code ratio} fraction of the currently
   * running region servers.
   */
  protected List<ServerName> selectServers() throws IOException {
    return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
  }

  /**
   * Small test to ensure the class basically works.
   * @param args command-line arguments; unused
   * @throws Exception if the rolling restart fails
   */
  public static void main(final String[] args) throws Exception {
    RollingBatchRestartRsAction action = new RollingBatchRestartRsAction(1, 1.0f) {
      private int invocations = 0;

      @Override
      protected ServerName[] getCurrentServers() throws IOException {
        final int count = 4;
        List<ServerName> serverNames = new ArrayList<>(count);
        for (int i = 0; i < 4; i++) {
          serverNames.add(ServerName.valueOf(i + ".example.org", i, i));
        }
        return serverNames.toArray(new ServerName[serverNames.size()]);
      }

      @Override
      protected void killRs(ServerName server) throws IOException {
        LOG.info("Killed " + server);
        // Fail every third invocation to exercise the HBASE-9743 recovery path.
        if (this.invocations++ % 3 == 0) {
          throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
        }
      }

      @Override
      protected void startRs(ServerName server) throws IOException {
        LOG.info("Started " + server);
        if (this.invocations++ % 3 == 0) {
          throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
        }
      }
    };

    action.perform();
  }
}