001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.chaos.actions; 019 020import java.io.IOException; 021import java.util.ArrayDeque; 022import java.util.ArrayList; 023import java.util.Collections; 024import java.util.LinkedList; 025import java.util.List; 026import java.util.Objects; 027import java.util.Queue; 028import java.util.Random; 029import java.util.concurrent.ThreadLocalRandom; 030import org.apache.hadoop.hbase.ServerName; 031import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; 032import org.slf4j.Logger; 033import org.slf4j.LoggerFactory; 034 035/** 036 * Restarts a ratio of the regionservers in a rolling fashion. At each step, either kills a server, 037 * or starts one, sleeping randomly (0-sleepTime) in between steps. The parameter maxDeadServers 038 * limits the maximum number of servers that can be down at the same time during rolling restarts. 039 */ 040public class RollingBatchRestartRsAction extends BatchRestartRsAction { 041 private static final Logger LOG = LoggerFactory.getLogger(RollingBatchRestartRsAction.class); 042 protected int maxDeadServers; // number of maximum dead servers at any given time. Defaults to 5 043 044 public RollingBatchRestartRsAction(long sleepTime, float ratio) { 045 this(sleepTime, ratio, 5); 046 } 047 048 public RollingBatchRestartRsAction(long sleepTime, float ratio, int maxDeadServers) { 049 super(sleepTime, ratio); 050 this.maxDeadServers = maxDeadServers; 051 } 052 053 public RollingBatchRestartRsAction(long sleepTime, float ratio, int maxDeadServers, 054 boolean skipMetaRS) { 055 this(sleepTime, ratio, maxDeadServers); 056 this.skipMetaRS = skipMetaRS; 057 } 058 059 enum KillOrStart { 060 KILL, 061 START 062 } 063 064 @Override 065 protected Logger getLogger() { 066 return LOG; 067 } 068 069 @Override 070 // deadServers is both list and queue here, a valid use case for LinkedList 071 @SuppressWarnings("JdkObsolete") 072 public void perform() throws Exception { 073 getLogger().info("Performing action: Rolling batch restarting {}% of region servers", 074 (int) (ratio * 100)); 075 List<ServerName> selectedServers = selectServers(); 076 Queue<ServerName> serversToBeKilled = new ArrayDeque<>(selectedServers); 077 LinkedList<ServerName> deadServers = new LinkedList<>(); 078 Random rand = ThreadLocalRandom.current(); 079 // loop while there are servers to be killed or dead servers to be restarted 080 while ((!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) && !context.isStopping()) { 081 082 final KillOrStart action; 083 if (serversToBeKilled.isEmpty()) { // no more servers to kill 084 action = KillOrStart.START; 085 } else if (deadServers.isEmpty()) { 086 action = KillOrStart.KILL; // no more servers to start 087 } else if (deadServers.size() >= maxDeadServers) { 088 // we have too many dead servers. Don't kill any more 089 action = KillOrStart.START; 090 } else { 091 // do a coin toss 092 action = rand.nextBoolean() ? KillOrStart.KILL : KillOrStart.START; 093 } 094 095 ServerName server; 096 097 switch (action) { 098 case KILL: 099 server = serversToBeKilled.remove(); 100 try { 101 killRs(server); 102 } catch (org.apache.hadoop.util.Shell.ExitCodeException e) { 103 // We've seen this in test runs where we timeout but the kill went through. HBASE-9743 104 // So, add to deadServers even if exception so the start gets called. 105 getLogger().info("Problem killing but presume successful; code={}", e.getExitCode(), e); 106 } 107 deadServers.add(server); 108 break; 109 case START: 110 server = Objects.requireNonNull(deadServers.peek()); 111 try { 112 startRs(server); 113 // only remove the server from the known dead list if `startRs` succeeds. 114 deadServers.remove(server); 115 } catch (org.apache.hadoop.util.Shell.ExitCodeException e) { 116 // The start may fail but better to just keep going though we may lose server. 117 // Shuffle the dead list to avoid getting stuck on a single stubborn host. 118 Collections.shuffle(deadServers); 119 getLogger().info("Problem starting {}, will retry; code={}", server, e.getExitCode(), 120 e); 121 } 122 break; 123 } 124 125 sleep(rand.nextInt((int) sleepTime)); 126 } 127 } 128 129 protected List<ServerName> selectServers() throws IOException { 130 return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio); 131 } 132 133 /** 134 * Small test to ensure the class basically works. 135 */ 136 public static void main(final String[] args) throws Exception { 137 RollingBatchRestartRsAction action = new RollingBatchRestartRsAction(1, 1.0f) { 138 private int invocations = 0; 139 140 @Override 141 protected ServerName[] getCurrentServers() { 142 final int count = 4; 143 List<ServerName> serverNames = new ArrayList<>(count); 144 for (int i = 0; i < 4; i++) { 145 serverNames.add(ServerName.valueOf(i + ".example.org", i, i)); 146 } 147 return serverNames.toArray(new ServerName[0]); 148 } 149 150 @Override 151 protected void killRs(ServerName server) throws IOException { 152 LOG.info("Killed {}", server); 153 if (this.invocations++ % 3 == 0) { 154 throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed"); 155 } 156 } 157 158 @Override 159 protected void startRs(ServerName server) throws IOException { 160 LOG.info("Started {}", server); 161 if (this.invocations++ % 3 == 0) { 162 throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed"); 163 } 164 } 165 }; 166 167 action.perform(); 168 } 169}