/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.chaos.actions;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClusterMetrics;
import org.apache.hadoop.hbase.HBaseCluster;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.IntegrationTestBase;
import org.apache.hadoop.hbase.IntegrationTestingUtility;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.ServerMetrics;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.chaos.factories.MonkeyConstants;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.util.Bytes;
import org.slf4j.Logger;

/**
 * A (possibly mischievous) action that the ChaosMonkey can perform.
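 * <p>
 * Subclasses supply a {@link Logger} and override {@link #perform()}. A minimal sketch of a
 * hypothetical subclass (illustrative only; not an action shipped with the chaos monkey):
 *
 * <pre>
 * public class LogClusterSizeAction extends Action {
 *   private static final Logger LOG = LoggerFactory.getLogger(LogClusterSizeAction.class);
 *
 *   &#64;Override
 *   protected Logger getLogger() {
 *     return LOG;
 *   }
 *
 *   &#64;Override
 *   public void perform() throws Exception {
 *     getLogger().info("Live region servers: {}",
 *       cluster.getClusterMetrics().getLiveServerMetrics().size());
 *   }
 * }
 *
 * // Driving the action by hand:
 * // action.init(new Action.ActionContext(util));
 * // action.perform();
 * </pre>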
 */
public abstract class Action {

  public static final String KILL_MASTER_TIMEOUT_KEY = "hbase.chaosmonkey.action.killmastertimeout";
  public static final String START_MASTER_TIMEOUT_KEY =
    "hbase.chaosmonkey.action.startmastertimeout";
  public static final String KILL_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.killrstimeout";
  public static final String START_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.startrstimeout";
  public static final String KILL_ZK_NODE_TIMEOUT_KEY =
    "hbase.chaosmonkey.action.killzknodetimeout";
  public static final String START_ZK_NODE_TIMEOUT_KEY =
    "hbase.chaosmonkey.action.startzknodetimeout";
  public static final String KILL_DATANODE_TIMEOUT_KEY =
    "hbase.chaosmonkey.action.killdatanodetimeout";
  public static final String START_DATANODE_TIMEOUT_KEY =
    "hbase.chaosmonkey.action.startdatanodetimeout";
  public static final String KILL_NAMENODE_TIMEOUT_KEY =
    "hbase.chaosmonkey.action.killnamenodetimeout";
  public static final String START_NAMENODE_TIMEOUT_KEY =
    "hbase.chaosmonkey.action.startnamenodetimeout";

  protected static final long KILL_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
  protected static final long START_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
  protected static final long KILL_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
  protected static final long START_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
  protected static final long KILL_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
  protected static final long START_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
  protected static final long KILL_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
  protected static final long START_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
  protected static final long KILL_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
  protected static final long START_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;

  protected ActionContext context;
  protected HBaseCluster cluster;
  protected ClusterMetrics initialStatus;
  protected ServerName[] initialServers;
  protected Properties monkeyProps;

  protected long killMasterTimeout;
  protected long startMasterTimeout;
  protected long killRsTimeout;
  protected long startRsTimeout;
  protected long killZkNodeTimeout;
  protected long startZkNodeTimeout;
  protected long killDataNodeTimeout;
  protected long startDataNodeTimeout;
  protected long killNameNodeTimeout;
  protected long startNameNodeTimeout;
  protected boolean skipMetaRS;

  /**
   * Retrieve the instance's {@link Logger}, for use throughout the class hierarchy.
   */
  protected abstract Logger getLogger();

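  // The timeouts below are read from the monkey properties when provided, otherwise loaded from
  // the cluster configuration. For example, a monkey properties file could override the region
  // server kill timeout like so (the 120000 ms value is illustrative only):
  //
  //   hbase.chaosmonkey.action.killrstimeout=120000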
  public void init(ActionContext context) throws IOException {
    this.context = context;
    cluster = context.getHBaseCluster();
    initialStatus = cluster.getInitialClusterMetrics();
    Collection<ServerName> regionServers = initialStatus.getLiveServerMetrics().keySet();
    initialServers = regionServers.toArray(new ServerName[0]);

    monkeyProps = context.getMonkeyProps();
    if (monkeyProps == null) {
      monkeyProps = new Properties();
      IntegrationTestBase.loadMonkeyProperties(monkeyProps, cluster.getConf());
    }

    killMasterTimeout = Long.parseLong(
      monkeyProps.getProperty(KILL_MASTER_TIMEOUT_KEY, KILL_MASTER_TIMEOUT_DEFAULT + ""));
    startMasterTimeout = Long.parseLong(
      monkeyProps.getProperty(START_MASTER_TIMEOUT_KEY, START_MASTER_TIMEOUT_DEFAULT + ""));
    killRsTimeout =
      Long.parseLong(monkeyProps.getProperty(KILL_RS_TIMEOUT_KEY, KILL_RS_TIMEOUT_DEFAULT + ""));
    startRsTimeout =
      Long.parseLong(monkeyProps.getProperty(START_RS_TIMEOUT_KEY, START_RS_TIMEOUT_DEFAULT + ""));
    killZkNodeTimeout = Long.parseLong(
      monkeyProps.getProperty(KILL_ZK_NODE_TIMEOUT_KEY, KILL_ZK_NODE_TIMEOUT_DEFAULT + ""));
    startZkNodeTimeout = Long.parseLong(
      monkeyProps.getProperty(START_ZK_NODE_TIMEOUT_KEY, START_ZK_NODE_TIMEOUT_DEFAULT + ""));
    killDataNodeTimeout = Long.parseLong(
      monkeyProps.getProperty(KILL_DATANODE_TIMEOUT_KEY, KILL_DATANODE_TIMEOUT_DEFAULT + ""));
    startDataNodeTimeout = Long.parseLong(
      monkeyProps.getProperty(START_DATANODE_TIMEOUT_KEY, START_DATANODE_TIMEOUT_DEFAULT + ""));
    killNameNodeTimeout = Long.parseLong(
      monkeyProps.getProperty(KILL_NAMENODE_TIMEOUT_KEY, KILL_NAMENODE_TIMEOUT_DEFAULT + ""));
    startNameNodeTimeout = Long.parseLong(
      monkeyProps.getProperty(START_NAMENODE_TIMEOUT_KEY, START_NAMENODE_TIMEOUT_DEFAULT + ""));
    skipMetaRS = Boolean.parseBoolean(monkeyProps.getProperty(MonkeyConstants.SKIP_META_RS,
      MonkeyConstants.DEFAULT_SKIP_META_RS + ""));
  }

  public void perform() throws Exception {
  }

  /**
   * Returns the current region servers, excluding the active and backup masters and, when
   * {@code skipMetaRS} is set, the server hosting meta.
   */
  protected ServerName[] getCurrentServers() throws IOException {
    ClusterMetrics clusterStatus = cluster.getClusterMetrics();
    Collection<ServerName> regionServers = clusterStatus.getLiveServerMetrics().keySet();
    int count = regionServers.size();
    if (count <= 0) {
      return new ServerName[] {};
    }
    ServerName master = clusterStatus.getMasterName();
    Set<ServerName> masters = new HashSet<>();
    masters.add(master);
    masters.addAll(clusterStatus.getBackupMasterNames());
    ArrayList<ServerName> tmp = new ArrayList<>(count);
    tmp.addAll(regionServers);
    tmp.removeAll(masters);

    if (skipMetaRS) {
      ServerName metaServer = cluster.getServerHoldingMeta();
      tmp.remove(metaServer);
    }

    return tmp.toArray(new ServerName[0]);
  }

  protected void killMaster(ServerName server) throws IOException {
    getLogger().info("Killing master {}", server);
    cluster.killMaster(server);
    cluster.waitForMasterToStop(server, killMasterTimeout);
    getLogger().info("Killed master {}", server);
  }

  protected void startMaster(ServerName server) throws IOException {
    getLogger().info("Starting master {}", server.getHostname());
    cluster.startMaster(server.getHostname(), server.getPort());
    cluster.waitForActiveAndReadyMaster(startMasterTimeout);
    getLogger().info("Started master {}", server.getHostname());
  }

  protected void stopRs(ServerName server) throws IOException {
    getLogger().info("Stopping regionserver {}", server);
    cluster.stopRegionServer(server);
    cluster.waitForRegionServerToStop(server, killRsTimeout);
    getLogger().info("Stopped regionserver {}. Reported num of rs:{}", server,
      cluster.getClusterMetrics().getLiveServerMetrics().size());
  }

  protected void suspendRs(ServerName server) throws IOException {
    getLogger().info("Suspending regionserver {}", server);
    cluster.suspendRegionServer(server);
    if (!(cluster instanceof MiniHBaseCluster)) {
      cluster.waitForRegionServerToStop(server, killRsTimeout);
    }
    getLogger().info("Suspended regionserver {}. Reported num of rs:{}", server,
      cluster.getClusterMetrics().getLiveServerMetrics().size());
  }

  protected void resumeRs(ServerName server) throws IOException {
    getLogger().info("Resuming regionserver {}", server);
    cluster.resumeRegionServer(server);
    if (!(cluster instanceof MiniHBaseCluster)) {
      cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(), startRsTimeout);
    }
    getLogger().info("Resumed regionserver {}. Reported num of rs:{}", server,
      cluster.getClusterMetrics().getLiveServerMetrics().size());
  }

  protected void killRs(ServerName server) throws IOException {
    getLogger().info("Killing regionserver {}", server);
    cluster.killRegionServer(server);
    cluster.waitForRegionServerToStop(server, killRsTimeout);
    getLogger().info("Killed regionserver {}. Reported num of rs:{}", server,
      cluster.getClusterMetrics().getLiveServerMetrics().size());
  }

  protected void startRs(ServerName server) throws IOException {
    getLogger().info("Starting regionserver {}", server.getAddress());
    cluster.startRegionServer(server.getHostname(), server.getPort());
    cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(), startRsTimeout);
    getLogger().info("Started regionserver {}. Reported num of rs:{}", server.getAddress(),
      cluster.getClusterMetrics().getLiveServerMetrics().size());
  }

  protected void killZKNode(ServerName server) throws IOException {
    getLogger().info("Killing zookeeper node {}", server);
    cluster.killZkNode(server);
    cluster.waitForZkNodeToStop(server, killZkNodeTimeout);
    getLogger().info("Killed zookeeper node {}. Reported num of rs:{}", server,
      cluster.getClusterMetrics().getLiveServerMetrics().size());
  }

  protected void startZKNode(ServerName server) throws IOException {
    getLogger().info("Starting zookeeper node {}", server.getHostname());
    cluster.startZkNode(server.getHostname(), server.getPort());
    cluster.waitForZkNodeToStart(server, startZkNodeTimeout);
    getLogger().info("Started zookeeper node {}", server);
  }

  protected void killDataNode(ServerName server) throws IOException {
    getLogger().info("Killing datanode {}", server);
    cluster.killDataNode(server);
    cluster.waitForDataNodeToStop(server, killDataNodeTimeout);
    getLogger().info("Killed datanode {}. Reported num of rs:{}", server,
      cluster.getClusterMetrics().getLiveServerMetrics().size());
  }

  protected void startDataNode(ServerName server) throws IOException {
    getLogger().info("Starting datanode {}", server.getHostname());
    cluster.startDataNode(server);
    cluster.waitForDataNodeToStart(server, startDataNodeTimeout);
    getLogger().info("Started datanode {}", server);
  }

  protected void killNameNode(ServerName server) throws IOException {
    getLogger().info("Killing namenode {}", server.getHostname());
    cluster.killNameNode(server);
    cluster.waitForNameNodeToStop(server, killNameNodeTimeout);
    getLogger().info("Killed namenode {}. Reported num of rs:{}", server,
      cluster.getClusterMetrics().getLiveServerMetrics().size());
  }

  protected void startNameNode(ServerName server) throws IOException {
    getLogger().info("Starting namenode {}", server.getHostname());
    cluster.startNameNode(server);
    cluster.waitForNameNodeToStart(server, startNameNodeTimeout);
    getLogger().info("Started namenode {}", server);
  }

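  /**
   * Moves a random {@code fractionOfRegions} share of the regions hosted by each live server in
   * {@code clusterStatus} onto randomly chosen servers from {@code toServers}, stopping early if
   * the monkey is stopping.
   */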
  protected void unbalanceRegions(ClusterMetrics clusterStatus, List<ServerName> fromServers,
    List<ServerName> toServers, double fractionOfRegions) throws Exception {
    List<byte[]> victimRegions = new ArrayList<>();
    for (Map.Entry<ServerName, ServerMetrics> entry : clusterStatus.getLiveServerMetrics()
      .entrySet()) {
      ServerName sn = entry.getKey();
      ServerMetrics serverLoad = entry.getValue();
      // Ugh. Copy the region names into a mutable list so picked victims can be removed.
      List<byte[]> regions = new ArrayList<>(serverLoad.getRegionMetrics().keySet());
      int victimRegionCount = (int) Math.ceil(fractionOfRegions * regions.size());
      getLogger().debug("Removing {} regions from {}", victimRegionCount, sn);
      Random rand = ThreadLocalRandom.current();
      for (int i = 0; i < victimRegionCount; ++i) {
        int victimIx = rand.nextInt(regions.size());
        String regionId = RegionInfo.encodeRegionName(regions.remove(victimIx));
        victimRegions.add(Bytes.toBytes(regionId));
      }
    }

    getLogger().info("Moving {} regions from {} servers to {} different servers",
      victimRegions.size(), fromServers.size(), toServers.size());
    Admin admin = this.context.getHBaseIntegrationTestingUtility().getAdmin();
    Random rand = ThreadLocalRandom.current();
    for (byte[] victimRegion : victimRegions) {
      // Don't keep moving regions if we're trying to stop the monkey.
      if (context.isStopping()) {
        break;
      }
      int targetIx = rand.nextInt(toServers.size());
      admin.move(victimRegion, toServers.get(targetIx));
    }
  }

  protected void forceBalancer() throws Exception {
    Admin admin = this.context.getHBaseIntegrationTestingUtility().getAdmin();
    boolean result = false;
    try {
      result = admin.balancer();
    } catch (Exception e) {
      getLogger().warn("Got exception while running balancer", e);
    }
    if (!result) {
      getLogger().error("Balancer didn't succeed");
    }
  }

  protected void setBalancer(boolean onOrOff, boolean synchronous) throws Exception {
    Admin admin = this.context.getHBaseIntegrationTestingUtility().getAdmin();
    try {
      admin.balancerSwitch(onOrOff, synchronous);
    } catch (Exception e) {
      getLogger().warn("Got exception while switching balancer", e);
    }
  }

  public Configuration getConf() {
    return cluster.getConf();
  }

  /**
   * Applies a transform to every column family in the given table. Does nothing if the table has
   * no column families or if the context is stopping.
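   * <p>
   * A minimal usage sketch (the table name and compression codec here are illustrative only, and
   * assume {@code org.apache.hadoop.hbase.io.compress.Compression} is imported):
   *
   * <pre>
   * modifyAllTableColumns(TableName.valueOf("test_table"),
   *   (name, builder) -> builder.setCompressionType(Compression.Algorithm.GZ));
   * </pre>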
   * @param tableName the table to modify
   * @param transform the modification to perform; invoked with each column family's name and a
   *                  builder for that family
   */
  protected void modifyAllTableColumns(TableName tableName,
    BiConsumer<String, ColumnFamilyDescriptorBuilder> transform) throws IOException {
    HBaseTestingUtility util = this.context.getHBaseIntegrationTestingUtility();
    Admin admin = util.getAdmin();

    TableDescriptor tableDescriptor = admin.getDescriptor(tableName);
    ColumnFamilyDescriptor[] columnDescriptors = tableDescriptor.getColumnFamilies();

    if (columnDescriptors == null || columnDescriptors.length == 0) {
      return;
    }

    TableDescriptorBuilder builder = TableDescriptorBuilder.newBuilder(tableDescriptor);
    for (ColumnFamilyDescriptor descriptor : columnDescriptors) {
      ColumnFamilyDescriptorBuilder cfd = ColumnFamilyDescriptorBuilder.newBuilder(descriptor);
      transform.accept(descriptor.getNameAsString(), cfd);
      builder.modifyColumnFamily(cfd.build());
    }

    // Don't try the modify if we're stopping
    if (this.context.isStopping()) {
      return;
    }
    admin.modifyTable(builder.build());
  }

  /**
   * Applies a transform to every column family in the given table. Does nothing if the table has
   * no column families or if the context is stopping.
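   * <p>
   * A minimal usage sketch (the block size value is illustrative only):
   *
   * <pre>
   * modifyAllTableColumns(tableName, builder -> builder.setBlocksize(65536));
   * </pre>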
   * @param tableName the table to modify
   * @param transform the modification to perform on each column family descriptor builder
   */
  protected void modifyAllTableColumns(TableName tableName,
    Consumer<ColumnFamilyDescriptorBuilder> transform) throws IOException {
    modifyAllTableColumns(tableName, (name, cfd) -> transform.accept(cfd));
  }

  /**
   * Context for Actions.
   */
  public static class ActionContext {
    private IntegrationTestingUtility util;
    private Properties monkeyProps = null;

    public ActionContext(IntegrationTestingUtility util) {
      this.util = util;
    }

    public ActionContext(Properties monkeyProps, IntegrationTestingUtility util) {
      this.util = util;
      this.monkeyProps = monkeyProps;
    }

    public Properties getMonkeyProps() {
      return monkeyProps;
    }

    public IntegrationTestingUtility getHBaseIntegrationTestingUtility() {
      return util;
    }

    public HBaseCluster getHBaseCluster() {
      return util.getHBaseClusterInterface();
    }

    public boolean isStopping() {
      return false;
    }
  }
}