/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.chaos.actions;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Objects;
import java.util.Queue;
import org.apache.commons.lang3.RandomUtils;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Restarts a ratio of the regionservers in a rolling fashion. At each step, either kills a
 * server or starts one, sleeping a random amount (0-sleepTime) between steps. The parameter
 * maxDeadServers limits the maximum number of servers that can be down at the same time
 * during rolling restarts.
 */
public class RollingBatchRestartRsAction extends BatchRestartRsAction {
  private static final Logger LOG = LoggerFactory.getLogger(RollingBatchRestartRsAction.class);
  protected int maxDeadServers; // maximum number of dead servers at any given time; defaults to 5

  public RollingBatchRestartRsAction(long sleepTime, float ratio) {
    this(sleepTime, ratio, 5);
  }

  public RollingBatchRestartRsAction(long sleepTime, float ratio, int maxDeadServers) {
    super(sleepTime, ratio);
    this.maxDeadServers = maxDeadServers;
  }

  public RollingBatchRestartRsAction(long sleepTime, float ratio, int maxDeadServers,
      boolean skipMetaRS) {
    this(sleepTime, ratio, maxDeadServers);
    this.skipMetaRS = skipMetaRS;
  }

  enum KillOrStart {
    KILL,
    START
  }

  @Override
  protected Logger getLogger() {
    return LOG;
  }

  /**
   * Rolls through the selected servers: kills servers until maxDeadServers are down, restarts
   * them as it goes, choosing randomly between killing and starting while both are possible.
   */
  @Override
  public void perform() throws Exception {
    getLogger().info("Performing action: Rolling batch restarting {}% of region servers",
      (int) (ratio * 100));
    List<ServerName> selectedServers = selectServers();

    Queue<ServerName> serversToBeKilled = new LinkedList<>(selectedServers);
    LinkedList<ServerName> deadServers = new LinkedList<>();

    // loop while there are servers to be killed or dead servers to be restarted
    while ((!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) && !context.isStopping()) {

      final KillOrStart action;
      if (serversToBeKilled.isEmpty()) { // no more servers to kill
        action = KillOrStart.START;
      } else if (deadServers.isEmpty()) {
        action = KillOrStart.KILL; // no more servers to start
      } else if (deadServers.size() >= maxDeadServers) {
        // we have too many dead servers. Don't kill any more.
        action = KillOrStart.START;
      } else {
        // do a coin toss
        action = RandomUtils.nextBoolean() ? KillOrStart.KILL : KillOrStart.START;
      }

      ServerName server;

      switch (action) {
        case KILL:
          server = serversToBeKilled.remove();
          try {
            killRs(server);
          } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
            // We've seen this in test runs where we time out but the kill went through. HBASE-9743
            // So, add to deadServers even on exception so the start still gets called.
            getLogger().info("Problem killing but presume successful; code={}", e.getExitCode(), e);
          }
          deadServers.add(server);
          break;
        case START:
          server = Objects.requireNonNull(deadServers.peek());
          try {
            startRs(server);
            // only remove the server from the known dead list if `startRs` succeeds.
            deadServers.remove(server);
          } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
            // The start may fail, but it is better to keep going, even though we may lose the
            // server. Shuffle the dead list to avoid getting stuck on a single stubborn host.
            Collections.shuffle(deadServers);
            getLogger().info(
              "Problem starting {}, will retry; code={}", server, e.getExitCode(), e);
          }
          break;
      }

      sleep(RandomUtils.nextInt(0, (int) sleepTime));
    }
  }

  /** Selects a random subset of the current region servers, sized by {@code ratio}. */
  protected List<ServerName> selectServers() throws IOException {
    return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
  }

  /**
   * Small test to ensure the class basically works.
   */
  public static void main(final String[] args) throws Exception {
    RollingBatchRestartRsAction action = new RollingBatchRestartRsAction(1, 1.0f) {
      private int invocations = 0;

      @Override
      protected ServerName[] getCurrentServers() {
        final int count = 4;
        List<ServerName> serverNames = new ArrayList<>(count);
        for (int i = 0; i < count; i++) {
          serverNames.add(ServerName.valueOf(i + ".example.org", i, i));
        }
        return serverNames.toArray(new ServerName[0]);
      }

      @Override
      protected void killRs(ServerName server) throws IOException {
        LOG.info("Killed {}", server);
        if (this.invocations++ % 3 == 0) {
          throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
        }
      }

      @Override
      protected void startRs(ServerName server) throws IOException {
        LOG.info("Started {}", server);
        if (this.invocations++ % 3 == 0) {
          throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
        }
      }
    };

    action.perform();
  }
}