/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.chaos.actions;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClusterMetrics;
import org.apache.hadoop.hbase.HBaseCluster;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.IntegrationTestBase;
import org.apache.hadoop.hbase.IntegrationTestingUtility;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.ServerMetrics;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.chaos.factories.MonkeyConstants;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.util.Bytes;
import org.slf4j.Logger;

/**
 * A (possibly mischievous) action that the ChaosMonkey can perform.
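 * <p>
 * Subclasses supply a {@link Logger} and override {@link #perform()}. A minimal sketch of a
 * custom action (hypothetical example, for illustration only):
 * </p>
 *
 * <pre>
 * public class LogLiveServersAction extends Action {
 *   private static final Logger LOG = LoggerFactory.getLogger(LogLiveServersAction.class);
 *
 *   protected Logger getLogger() {
 *     return LOG;
 *   }
 *
 *   public void perform() throws Exception {
 *     getLogger().info("Live region servers: {}",
 *       cluster.getClusterMetrics().getLiveServerMetrics().size());
 *   }
 * }
 * </pre>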
 */
public abstract class Action {

  public static final String KILL_MASTER_TIMEOUT_KEY = "hbase.chaosmonkey.action.killmastertimeout";
  public static final String START_MASTER_TIMEOUT_KEY =
    "hbase.chaosmonkey.action.startmastertimeout";
  public static final String KILL_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.killrstimeout";
  public static final String START_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.startrstimeout";
  public static final String KILL_ZK_NODE_TIMEOUT_KEY =
    "hbase.chaosmonkey.action.killzknodetimeout";
  public static final String START_ZK_NODE_TIMEOUT_KEY =
    "hbase.chaosmonkey.action.startzknodetimeout";
  public static final String KILL_DATANODE_TIMEOUT_KEY =
    "hbase.chaosmonkey.action.killdatanodetimeout";
  public static final String START_DATANODE_TIMEOUT_KEY =
    "hbase.chaosmonkey.action.startdatanodetimeout";
  public static final String KILL_NAMENODE_TIMEOUT_KEY =
    "hbase.chaosmonkey.action.killnamenodetimeout";
  public static final String START_NAMENODE_TIMEOUT_KEY =
    "hbase.chaosmonkey.action.startnamenodetimeout";

  protected static final long KILL_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
  protected static final long START_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
  protected static final long KILL_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
  protected static final long START_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
  protected static final long KILL_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
  protected static final long START_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
  protected static final long KILL_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
  protected static final long START_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
  protected static final long KILL_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
  protected static final long START_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;

  protected ActionContext context;
  protected HBaseCluster cluster;
  protected ClusterMetrics initialStatus;
  protected ServerName[] initialServers;
  protected Properties monkeyProps;

  protected long killMasterTimeout;
  protected long startMasterTimeout;
  protected long killRsTimeout;
  protected long startRsTimeout;
  protected long killZkNodeTimeout;
  protected long startZkNodeTimeout;
  protected long killDataNodeTimeout;
  protected long startDataNodeTimeout;
  protected long killNameNodeTimeout;
  protected long startNameNodeTimeout;
  protected boolean skipMetaRS;
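
  // The timeouts above can be overridden through the chaos monkey properties; values are in
  // milliseconds. An illustrative properties snippet (the keys are the constants above, the
  // values are examples only):
  //
  //   hbase.chaosmonkey.action.killrstimeout=180000
  //   hbase.chaosmonkey.action.startrstimeout=180000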

  /**
   * Retrieve the instance's {@link Logger}, for use throughout the class hierarchy.
   */
  protected abstract Logger getLogger();

  public void init(ActionContext context) throws IOException {
    this.context = context;
    cluster = context.getHBaseCluster();
    initialStatus = cluster.getInitialClusterMetrics();
    Collection<ServerName> regionServers = initialStatus.getLiveServerMetrics().keySet();
    initialServers = regionServers.toArray(new ServerName[0]);

    monkeyProps = context.getMonkeyProps();
    if (monkeyProps == null) {
      monkeyProps = new Properties();
      IntegrationTestBase.loadMonkeyProperties(monkeyProps, cluster.getConf());
    }

    killMasterTimeout = Long.parseLong(
      monkeyProps.getProperty(KILL_MASTER_TIMEOUT_KEY, KILL_MASTER_TIMEOUT_DEFAULT + ""));
    startMasterTimeout = Long.parseLong(
      monkeyProps.getProperty(START_MASTER_TIMEOUT_KEY, START_MASTER_TIMEOUT_DEFAULT + ""));
    killRsTimeout =
      Long.parseLong(monkeyProps.getProperty(KILL_RS_TIMEOUT_KEY, KILL_RS_TIMEOUT_DEFAULT + ""));
    startRsTimeout =
      Long.parseLong(monkeyProps.getProperty(START_RS_TIMEOUT_KEY, START_RS_TIMEOUT_DEFAULT + ""));
    killZkNodeTimeout = Long.parseLong(
      monkeyProps.getProperty(KILL_ZK_NODE_TIMEOUT_KEY, KILL_ZK_NODE_TIMEOUT_DEFAULT + ""));
    startZkNodeTimeout = Long.parseLong(
      monkeyProps.getProperty(START_ZK_NODE_TIMEOUT_KEY, START_ZK_NODE_TIMEOUT_DEFAULT + ""));
    killDataNodeTimeout = Long.parseLong(
      monkeyProps.getProperty(KILL_DATANODE_TIMEOUT_KEY, KILL_DATANODE_TIMEOUT_DEFAULT + ""));
    startDataNodeTimeout = Long.parseLong(
      monkeyProps.getProperty(START_DATANODE_TIMEOUT_KEY, START_DATANODE_TIMEOUT_DEFAULT + ""));
    killNameNodeTimeout = Long.parseLong(
      monkeyProps.getProperty(KILL_NAMENODE_TIMEOUT_KEY, KILL_NAMENODE_TIMEOUT_DEFAULT + ""));
    startNameNodeTimeout = Long.parseLong(
      monkeyProps.getProperty(START_NAMENODE_TIMEOUT_KEY, START_NAMENODE_TIMEOUT_DEFAULT + ""));
    skipMetaRS = Boolean.parseBoolean(monkeyProps.getProperty(MonkeyConstants.SKIP_META_RS,
      MonkeyConstants.DEFAULT_SKIP_META_RS + ""));
  }

  public void perform() throws Exception {
  }

  /**
   * Returns the current live region servers, minus the active and backup masters (and, if
   * {@code skipMetaRS} is set, minus the server holding hbase:meta).
   */
  protected ServerName[] getCurrentServers() throws IOException {
    ClusterMetrics clusterStatus = cluster.getClusterMetrics();
    Collection<ServerName> regionServers = clusterStatus.getLiveServerMetrics().keySet();
    int count = regionServers.size();
    if (count <= 0) {
      return new ServerName[] {};
    }
    ServerName master = clusterStatus.getMasterName();
    Set<ServerName> masters = new HashSet<>();
    masters.add(master);
    masters.addAll(clusterStatus.getBackupMasterNames());
    ArrayList<ServerName> tmp = new ArrayList<>(count);
    tmp.addAll(regionServers);
    tmp.removeAll(masters);

    if (skipMetaRS) {
      ServerName metaServer = cluster.getServerHoldingMeta();
      tmp.remove(metaServer);
    }

    return tmp.toArray(new ServerName[0]);
  }
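
  // The kill/start and stop/resume primitives below are the building blocks that concrete
  // actions compose. A typical restart-style action looks like (illustrative sketch, not a
  // method of this class; Threads is org.apache.hadoop.hbase.util.Threads):
  //
  //   killRs(server);
  //   Threads.sleep(sleepTime);
  //   startRs(server);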

  protected void killMaster(ServerName server) throws IOException {
    getLogger().info("Killing master {}", server);
    cluster.killMaster(server);
    cluster.waitForMasterToStop(server, killMasterTimeout);
    getLogger().info("Killed master {}", server);
  }

  protected void startMaster(ServerName server) throws IOException {
    getLogger().info("Starting master {}", server.getHostname());
    cluster.startMaster(server.getHostname(), server.getPort());
    cluster.waitForActiveAndReadyMaster(startMasterTimeout);
    getLogger().info("Started master {}", server.getHostname());
  }

  protected void stopRs(ServerName server) throws IOException {
    getLogger().info("Stopping regionserver {}", server);
    cluster.stopRegionServer(server);
    cluster.waitForRegionServerToStop(server, killRsTimeout);
    getLogger().info("Stopped regionserver {}. Reported num of rs:{}", server,
      cluster.getClusterMetrics().getLiveServerMetrics().size());
  }

  protected void suspendRs(ServerName server) throws IOException {
    getLogger().info("Suspending regionserver {}", server);
    cluster.suspendRegionServer(server);
    if (!(cluster instanceof MiniHBaseCluster)) {
      cluster.waitForRegionServerToStop(server, killRsTimeout);
    }
    getLogger().info("Suspended regionserver {}. Reported num of rs:{}", server,
      cluster.getClusterMetrics().getLiveServerMetrics().size());
  }

  protected void resumeRs(ServerName server) throws IOException {
    getLogger().info("Resuming regionserver {}", server);
    cluster.resumeRegionServer(server);
    if (!(cluster instanceof MiniHBaseCluster)) {
      cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(), startRsTimeout);
    }
    getLogger().info("Resumed regionserver {}. Reported num of rs:{}", server,
      cluster.getClusterMetrics().getLiveServerMetrics().size());
  }

  protected void killRs(ServerName server) throws IOException {
    getLogger().info("Killing regionserver {}", server);
    cluster.killRegionServer(server);
    cluster.waitForRegionServerToStop(server, killRsTimeout);
    getLogger().info("Killed regionserver {}. Reported num of rs:{}", server,
      cluster.getClusterMetrics().getLiveServerMetrics().size());
  }

  protected void startRs(ServerName server) throws IOException {
    getLogger().info("Starting regionserver {}", server.getAddress());
    cluster.startRegionServer(server.getHostname(), server.getPort());
    cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(), startRsTimeout);
    getLogger().info("Started regionserver {}. Reported num of rs:{}", server.getAddress(),
      cluster.getClusterMetrics().getLiveServerMetrics().size());
  }

  protected void killZKNode(ServerName server) throws IOException {
    getLogger().info("Killing zookeeper node {}", server);
    cluster.killZkNode(server);
    cluster.waitForZkNodeToStop(server, killZkNodeTimeout);
    getLogger().info("Killed zookeeper node {}. Reported num of rs:{}", server,
      cluster.getClusterMetrics().getLiveServerMetrics().size());
  }

  protected void startZKNode(ServerName server) throws IOException {
    getLogger().info("Starting zookeeper node {}", server.getHostname());
    cluster.startZkNode(server.getHostname(), server.getPort());
    cluster.waitForZkNodeToStart(server, startZkNodeTimeout);
    getLogger().info("Started zookeeper node {}", server);
  }

  protected void killDataNode(ServerName server) throws IOException {
    getLogger().info("Killing datanode {}", server);
    cluster.killDataNode(server);
    cluster.waitForDataNodeToStop(server, killDataNodeTimeout);
    getLogger().info("Killed datanode {}. Reported num of rs:{}", server,
      cluster.getClusterMetrics().getLiveServerMetrics().size());
  }

  protected void startDataNode(ServerName server) throws IOException {
    getLogger().info("Starting datanode {}", server.getHostname());
    cluster.startDataNode(server);
    cluster.waitForDataNodeToStart(server, startDataNodeTimeout);
    getLogger().info("Started datanode {}", server);
  }

  protected void killNameNode(ServerName server) throws IOException {
    getLogger().info("Killing namenode {}", server.getHostname());
    cluster.killNameNode(server);
    cluster.waitForNameNodeToStop(server, killNameNodeTimeout);
    getLogger().info("Killed namenode {}. Reported num of rs:{}", server,
      cluster.getClusterMetrics().getLiveServerMetrics().size());
  }

  protected void startNameNode(ServerName server) throws IOException {
    getLogger().info("Starting namenode {}", server.getHostname());
    cluster.startNameNode(server);
    cluster.waitForNameNodeToStart(server, startNameNodeTimeout);
    getLogger().info("Started namenode {}", server);
  }
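
  /**
   * For each live server in {@code clusterStatus}, sample a random {@code fractionOfRegions} of
   * its regions and move each sampled region to a randomly chosen server from {@code toServers}.
   * Stops early if the monkey is stopping. The {@code fromServers} list is used only for logging.
   */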
  protected void unbalanceRegions(ClusterMetrics clusterStatus, List<ServerName> fromServers,
    List<ServerName> toServers, double fractionOfRegions) throws Exception {
    List<byte[]> victimRegions = new ArrayList<>();
    for (Map.Entry<ServerName, ServerMetrics> entry : clusterStatus.getLiveServerMetrics()
      .entrySet()) {
      ServerName sn = entry.getKey();
      ServerMetrics serverLoad = entry.getValue();
      // Ugh. Copy the region name set so sampled victims can be removed as we go.
      List<byte[]> regions = new ArrayList<>(serverLoad.getRegionMetrics().keySet());
      int victimRegionCount = (int) Math.ceil(fractionOfRegions * regions.size());
      getLogger().debug("Removing {} regions from {}", victimRegionCount, sn);
      Random rand = ThreadLocalRandom.current();
      for (int i = 0; i < victimRegionCount; ++i) {
        int victimIx = rand.nextInt(regions.size());
        String regionId = RegionInfo.encodeRegionName(regions.remove(victimIx));
        victimRegions.add(Bytes.toBytes(regionId));
      }
    }

    getLogger().info("Moving {} regions from {} servers to {} different servers",
      victimRegions.size(), fromServers.size(), toServers.size());
    Admin admin = this.context.getHBaseIntegrationTestingUtility().getAdmin();
    Random rand = ThreadLocalRandom.current();
    for (byte[] victimRegion : victimRegions) {
      // Don't keep moving regions if we're trying to stop the monkey.
      if (context.isStopping()) {
        break;
      }
      int targetIx = rand.nextInt(toServers.size());
      admin.move(victimRegion, toServers.get(targetIx));
    }
  }

  protected void forceBalancer() throws Exception {
    Admin admin = this.context.getHBaseIntegrationTestingUtility().getAdmin();
    boolean result = false;
    try {
      result = admin.balancer();
    } catch (Exception e) {
      getLogger().warn("Got exception while running balancer", e);
    }
    if (!result) {
      getLogger().error("Balancer didn't succeed");
    }
  }

  protected void setBalancer(boolean onOrOff, boolean synchronous) throws Exception {
    Admin admin = this.context.getHBaseIntegrationTestingUtility().getAdmin();
    try {
      admin.balancerSwitch(onOrOff, synchronous);
    } catch (Exception e) {
      getLogger().warn("Got exception while switching balancer", e);
    }
  }

  public Configuration getConf() {
    return cluster.getConf();
  }

  /**
   * Apply a transform to every column family of the given table. If the table has no column
   * families, or if the context is stopping, this does nothing.
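   * <p>
   * For example, a caller could switch every family to GZ compression (illustrative usage only;
   * {@code Compression} is {@code org.apache.hadoop.hbase.io.compress.Compression}):
   * </p>
   *
   * <pre>
   * modifyAllTableColumns(tableName,
   *   (name, builder) -&gt; builder.setCompressionType(Compression.Algorithm.GZ));
   * </pre>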
   * @param tableName the table to modify
   * @param transform the modification to perform. Callers will have the column family name as a
   *                  string and a column family descriptor builder available to them
   */
  protected void modifyAllTableColumns(TableName tableName,
    BiConsumer<String, ColumnFamilyDescriptorBuilder> transform) throws IOException {
    HBaseTestingUtility util = this.context.getHBaseIntegrationTestingUtility();
    Admin admin = util.getAdmin();

    TableDescriptor tableDescriptor = admin.getDescriptor(tableName);
    ColumnFamilyDescriptor[] columnDescriptors = tableDescriptor.getColumnFamilies();

    if (columnDescriptors == null || columnDescriptors.length == 0) {
      return;
    }

    TableDescriptorBuilder builder = TableDescriptorBuilder.newBuilder(tableDescriptor);
    for (ColumnFamilyDescriptor descriptor : columnDescriptors) {
      ColumnFamilyDescriptorBuilder cfd = ColumnFamilyDescriptorBuilder.newBuilder(descriptor);
      transform.accept(descriptor.getNameAsString(), cfd);
      builder.modifyColumnFamily(cfd.build());
    }

    // Don't try the modify if we're stopping
    if (this.context.isStopping()) {
      return;
    }
    admin.modifyTable(builder.build());
  }

  /**
   * Apply a transform to every column family of the given table. If the table has no column
   * families, or if the context is stopping, this does nothing.
   * @param tableName the table to modify
   * @param transform the modification to perform on each column family descriptor builder
   */
  protected void modifyAllTableColumns(TableName tableName,
    Consumer<ColumnFamilyDescriptorBuilder> transform) throws IOException {
    modifyAllTableColumns(tableName, (name, cfd) -> transform.accept(cfd));
  }

  /**
   * Context for Actions.
   */
  public static class ActionContext {
    private IntegrationTestingUtility util;
    private Properties monkeyProps = null;

    public ActionContext(IntegrationTestingUtility util) {
      this.util = util;
    }

    public ActionContext(Properties monkeyProps, IntegrationTestingUtility util) {
      this.util = util;
      this.monkeyProps = monkeyProps;
    }

    public Properties getMonkeyProps() {
      return monkeyProps;
    }

    public IntegrationTestingUtility getHBaseIntegrationTestingUtility() {
      return util;
    }

    public HBaseCluster getHBaseCluster() {
      return util.getHBaseClusterInterface();
    }

    public boolean isStopping() {
      return false;
    }
  }
}