/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.chaos.actions;

import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

import org.apache.commons.lang3.RandomUtils;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Restarts a ratio of the regionservers in a rolling fashion. At each step, either kills a
 * server, or starts one, sleeping randomly (0-sleepTime) in between steps. The parameter
 * maxDeadServers limits the maximum number of servers that can be down at the same time
 * during rolling restarts.
 */
public class RollingBatchRestartRsAction extends BatchRestartRsAction {
  private static final Logger LOG = LoggerFactory.getLogger(RollingBatchRestartRsAction.class);

  /** Maximum number of dead (killed but not yet restarted) servers at any given time. */
  protected int maxDeadServers;

  /**
   * Creates an action with a default cap of 5 concurrently dead servers.
   * @param sleepTime upper bound (exclusive) of the random sleep between steps, in ms
   * @param ratio fraction of the current region servers to restart, in [0, 1]
   */
  public RollingBatchRestartRsAction(long sleepTime, float ratio) {
    this(sleepTime, ratio, 5);
  }

  /**
   * @param sleepTime upper bound (exclusive) of the random sleep between steps, in ms
   * @param ratio fraction of the current region servers to restart, in [0, 1]
   * @param maxDeadServers maximum number of servers allowed to be down at the same time
   */
  public RollingBatchRestartRsAction(long sleepTime, float ratio, int maxDeadServers) {
    super(sleepTime, ratio);
    this.maxDeadServers = maxDeadServers;
  }

  /** The two possible moves at each step of the rolling restart. */
  enum KillOrStart {
    KILL,
    START
  }

  @Override
  public void perform() throws Exception {
    LOG.info("Performing action: Rolling batch restarting {}% of region servers",
        (int) (ratio * 100));
    List<ServerName> selectedServers = selectServers();

    Queue<ServerName> serversToBeKilled = new LinkedList<>(selectedServers);
    Queue<ServerName> deadServers = new LinkedList<>();

    // loop while there are servers to be killed or dead servers to be restarted
    while ((!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) && !context.isStopping()) {
      final KillOrStart action;
      if (serversToBeKilled.isEmpty()) {
        action = KillOrStart.START; // no more servers to kill
      } else if (deadServers.isEmpty()) {
        action = KillOrStart.KILL; // no dead servers to start
      } else if (deadServers.size() >= maxDeadServers) {
        // we have too many dead servers. Don't kill any more
        action = KillOrStart.START;
      } else {
        // do a coin toss
        action = RandomUtils.nextBoolean() ? KillOrStart.KILL : KillOrStart.START;
      }

      ServerName server;

      switch (action) {
        case KILL:
          server = serversToBeKilled.remove();
          try {
            killRs(server);
          } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
            // We've seen this in test runs where we timeout but the kill went through. HBASE-9743
            // So, add to deadServers even if exception so the start gets called.
            LOG.info("Problem killing but presume successful; code={}", e.getExitCode(), e);
          }
          deadServers.add(server);
          break;
        case START:
          try {
            server = deadServers.remove();
            startRs(server);
          } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
            // The start may fail but better to just keep going though we may lose server.
            LOG.info("Problem starting, will retry; code={}", e.getExitCode(), e);
          }
          break;
      }

      sleep(RandomUtils.nextInt(0, (int) sleepTime));
    }
  }

  /** Selects the servers to restart; overridable for testing. */
  protected List<ServerName> selectServers() throws IOException {
    return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
  }

  /**
   * Small test to ensure the class basically works.
   */
  public static void main(final String[] args) throws Exception {
    RollingBatchRestartRsAction action = new RollingBatchRestartRsAction(1, 1.0f) {
      private int invocations = 0;

      @Override
      protected ServerName[] getCurrentServers() throws IOException {
        final int count = 4;
        List<ServerName> serverNames = new ArrayList<>(count);
        // Fixed: loop bound was a hard-coded 4, silently duplicating 'count'.
        for (int i = 0; i < count; i++) {
          serverNames.add(ServerName.valueOf(i + ".example.org", i, i));
        }
        return serverNames.toArray(new ServerName[0]);
      }

      @Override
      protected void killRs(ServerName server) throws IOException {
        LOG.info("Killed {}", server);
        // Simulate intermittent shell failures to exercise the exception paths.
        if (this.invocations++ % 3 == 0) {
          throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
        }
      }

      @Override
      protected void startRs(ServerName server) throws IOException {
        LOG.info("Started {}", server);
        if (this.invocations++ % 3 == 0) {
          throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
        }
      }
    };

    action.perform();
  }
}