001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.hbase.tool;
020
021import static org.apache.hadoop.hbase.HConstants.DEFAULT_ZOOKEEPER_ZNODE_PARENT;
022import static org.apache.hadoop.hbase.HConstants.ZOOKEEPER_ZNODE_PARENT;
023import java.io.Closeable;
024import java.io.IOException;
025import java.net.BindException;
026import java.net.InetSocketAddress;
027import java.util.ArrayList;
028import java.util.Arrays;
029import java.util.Collections;
030import java.util.EnumSet;
031import java.util.HashMap;
032import java.util.HashSet;
033import java.util.LinkedList;
034import java.util.List;
035import java.util.Map;
036import java.util.Random;
037import java.util.Set;
038import java.util.TreeSet;
039import java.util.concurrent.Callable;
040import java.util.concurrent.ConcurrentHashMap;
041import java.util.concurrent.ConcurrentMap;
042import java.util.concurrent.ExecutionException;
043import java.util.concurrent.ExecutorService;
044import java.util.concurrent.Future;
045import java.util.concurrent.ScheduledThreadPoolExecutor;
046import java.util.concurrent.ThreadLocalRandom;
047import java.util.concurrent.atomic.AtomicLong;
048import java.util.concurrent.atomic.LongAdder;
049import java.util.regex.Matcher;
050import java.util.regex.Pattern;
051import org.apache.commons.lang3.time.StopWatch;
052import org.apache.hadoop.conf.Configuration;
053import org.apache.hadoop.hbase.AuthUtil;
054import org.apache.hadoop.hbase.ChoreService;
055import org.apache.hadoop.hbase.ClusterMetrics;
056import org.apache.hadoop.hbase.ClusterMetrics.Option;
057import org.apache.hadoop.hbase.DoNotRetryIOException;
058import org.apache.hadoop.hbase.HBaseConfiguration;
059import org.apache.hadoop.hbase.HBaseInterfaceAudience;
060import org.apache.hadoop.hbase.HColumnDescriptor;
061import org.apache.hadoop.hbase.HConstants;
062import org.apache.hadoop.hbase.HRegionLocation;
063import org.apache.hadoop.hbase.HTableDescriptor;
064import org.apache.hadoop.hbase.MetaTableAccessor;
065import org.apache.hadoop.hbase.NamespaceDescriptor;
066import org.apache.hadoop.hbase.ScheduledChore;
067import org.apache.hadoop.hbase.ServerName;
068import org.apache.hadoop.hbase.TableName;
069import org.apache.hadoop.hbase.TableNotEnabledException;
070import org.apache.hadoop.hbase.TableNotFoundException;
071import org.apache.hadoop.hbase.client.Admin;
072import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
073import org.apache.hadoop.hbase.client.Connection;
074import org.apache.hadoop.hbase.client.ConnectionFactory;
075import org.apache.hadoop.hbase.client.Get;
076import org.apache.hadoop.hbase.client.Put;
077import org.apache.hadoop.hbase.client.RegionInfo;
078import org.apache.hadoop.hbase.client.RegionLocator;
079import org.apache.hadoop.hbase.client.ResultScanner;
080import org.apache.hadoop.hbase.client.Scan;
081import org.apache.hadoop.hbase.client.Table;
082import org.apache.hadoop.hbase.client.TableDescriptor;
083import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
084import org.apache.hadoop.hbase.http.InfoServer;
085import org.apache.hadoop.hbase.tool.CanaryTool.RegionTask.TaskType;
086import org.apache.hadoop.hbase.util.Bytes;
087import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
088import org.apache.hadoop.hbase.util.Pair;
089import org.apache.hadoop.hbase.util.ReflectionUtils;
090import org.apache.hadoop.hbase.util.RegionSplitter;
091import org.apache.hadoop.hbase.zookeeper.EmptyWatcher;
092import org.apache.hadoop.hbase.zookeeper.ZKConfig;
093import org.apache.hadoop.util.Tool;
094import org.apache.hadoop.util.ToolRunner;
095import org.apache.yetus.audience.InterfaceAudience;
096import org.apache.zookeeper.KeeperException;
097import org.apache.zookeeper.ZooKeeper;
098import org.apache.zookeeper.client.ConnectStringParser;
099import org.apache.zookeeper.data.Stat;
100import org.slf4j.Logger;
101import org.slf4j.LoggerFactory;
102
103import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
104
/**
 * HBase Canary Tool for "canary monitoring" of a running HBase cluster.
 *
 * There are three modes:
 * <ol>
 * <li>region mode (default): For each region, try to get one row per column family, outputting
 * information on failure (ERROR) or else the latency.
 * </li>
 *
 * <li>regionserver mode: For each regionserver, try to get one row from one randomly selected
 * table, outputting information on failure (ERROR) or else the latency.
 * </li>
 *
 * <li>zookeeper mode: For each zookeeper instance, read a znode, outputting information on
 * failure (ERROR) or else the latency.
 * </li>
 * </ol>
 * Example invocations are listed in a comment at the top of the class body.
 */
123@InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.TOOLS)
124public class CanaryTool implements Tool, Canary {
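  // Example invocations (illustrative; see printUsageAndExit() below for the full option list):
  //   hbase canary                          - region mode: sniff every region of every table
  //   hbase canary myTable1 myTable2        - region mode: sniff only the named tables
  //   hbase canary -regionserver            - regionserver mode: one random region per server
  //   hbase canary -zookeeper               - zookeeper mode: read a znode on each ensemble member
  //   hbase canary -daemon -interval 60     - repeat region checks every 60 seconds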
125  public static final String HBASE_CANARY_INFO_PORT = "hbase.canary.info.port";
126  public static final String HBASE_CANARY_INFO_BINDADDRESS = "hbase.canary.info.bindAddress";
127
128  private void putUpWebUI() throws IOException {
129    int port = conf.getInt(HBASE_CANARY_INFO_PORT, -1);
130    // -1 is for disabling info server
131    if (port < 0) {
132      return;
133    }
134    if (zookeeperMode) {
135      LOG.info("WebUI is not supported in Zookeeper mode");
136    } else if (regionServerMode) {
137      LOG.info("WebUI is not supported in RegionServer mode");
138    } else {
139      String addr = conf.get(HBASE_CANARY_INFO_BINDADDRESS, "0.0.0.0");
140      try {
141        InfoServer infoServer = new InfoServer("canary", addr, port, false, conf);
142        infoServer.addUnprivilegedServlet("canary", "/canary-status", CanaryStatusServlet.class);
143        infoServer.setAttribute("sink", getSink(conf, RegionStdOutSink.class));
144        infoServer.start();
        LOG.info("Bound Canary HTTP info server to {}:{}", addr, port);
146      } catch (BindException e) {
147        LOG.warn("Failed binding Canary http info server to {}:{}", addr, port, e);
148      }
149    }
150  }
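  // The info server is only started in daemon mode, since run() calls putUpWebUI() only when an
  // interval is set. Illustrative example (the port number here is arbitrary):
  //   hbase canary -daemon -Dhbase.canary.info.port=16050
  // The status page is then served from the "/canary-status" servlet on the configured bind
  // address.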
151
152  @Override
153  public int checkRegions(String[] targets) throws Exception {
154    String configuredReadTableTimeoutsStr = conf.get(HBASE_CANARY_REGION_READ_TABLE_TIMEOUT);
155    try {
156      if (configuredReadTableTimeoutsStr != null) {
157        populateReadTableTimeoutsMap(configuredReadTableTimeoutsStr);
158      }
159    } catch (IllegalArgumentException e) {
160      LOG.error("Constructing read table timeouts map failed ", e);
161      return USAGE_EXIT_CODE;
162    }
163    return runMonitor(targets);
164  }
165
166  @Override
167  public int checkRegionServers(String[] targets) throws Exception {
168    regionServerMode = true;
169    return runMonitor(targets);
170  }
171
172  @Override
173  public int checkZooKeeper() throws Exception {
174    zookeeperMode = true;
175    return runMonitor(null);
176  }
177
178  /**
179   * Sink interface used by the canary to output information
180   */
181  public interface Sink {
182    long getReadFailureCount();
183    long incReadFailureCount();
184    Map<String,String> getReadFailures();
185    void updateReadFailures(String regionName, String serverName);
186    long getWriteFailureCount();
187    long incWriteFailureCount();
188    Map<String,String> getWriteFailures();
189    void updateWriteFailures(String regionName, String serverName);
190    long getReadSuccessCount();
191    long incReadSuccessCount();
192    long getWriteSuccessCount();
193    long incWriteSuccessCount();
194  }
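  // A different Sink implementation can be plugged in via configuration; see
  // getSink(Configuration, Class) below. A minimal sketch, assuming a hypothetical
  // MyAlertingSink that extends StdOutSink:
  //   Configuration conf = HBaseConfiguration.create();
  //   conf.setClass("hbase.canary.sink.class", MyAlertingSink.class, Sink.class);
  //   ToolRunner.run(conf, new CanaryTool(), args);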
195
  /**
   * Simple implementation of canary sink that writes to a file or to standard output.
   */
199  public static class StdOutSink implements Sink {
200    private AtomicLong readFailureCount = new AtomicLong(0),
201        writeFailureCount = new AtomicLong(0),
202        readSuccessCount = new AtomicLong(0),
203        writeSuccessCount = new AtomicLong(0);
204    private Map<String, String> readFailures = new ConcurrentHashMap<>();
205    private Map<String, String> writeFailures = new ConcurrentHashMap<>();
206
207    @Override
208    public long getReadFailureCount() {
209      return readFailureCount.get();
210    }
211
212    @Override
213    public long incReadFailureCount() {
214      return readFailureCount.incrementAndGet();
215    }
216
217    @Override
218    public Map<String, String> getReadFailures() {
219      return readFailures;
220    }
221
222    @Override
223    public void updateReadFailures(String regionName, String serverName) {
224      readFailures.put(regionName, serverName);
225    }
226
227    @Override
228    public long getWriteFailureCount() {
229      return writeFailureCount.get();
230    }
231
232    @Override
233    public long incWriteFailureCount() {
234      return writeFailureCount.incrementAndGet();
235    }
236
237    @Override
238    public Map<String, String> getWriteFailures() {
239      return writeFailures;
240    }
241
242    @Override
243    public void updateWriteFailures(String regionName, String serverName) {
244      writeFailures.put(regionName, serverName);
245    }
246
247    @Override
248    public long getReadSuccessCount() {
249      return readSuccessCount.get();
250    }
251
252    @Override
253    public long incReadSuccessCount() {
254      return readSuccessCount.incrementAndGet();
255    }
256
257    @Override
258    public long getWriteSuccessCount() {
259      return writeSuccessCount.get();
260    }
261
262    @Override
263    public long incWriteSuccessCount() {
264      return writeSuccessCount.incrementAndGet();
265    }
266  }
267
  /**
   * Sink used in 'regionserver' mode; output is grouped by RegionServer.
   */
271  public static class RegionServerStdOutSink extends StdOutSink {
272    public void publishReadFailure(String table, String server) {
273      incReadFailureCount();
      LOG.error("Read from {} on {} failed", table, server);
275    }
276
277    public void publishReadTiming(String table, String server, long msTime) {
278      LOG.info("Read from {} on {} in {}ms", table, server, msTime);
279    }
280  }
281
282  /**
283   * Output for 'zookeeper' mode.
284   */
285  public static class ZookeeperStdOutSink extends StdOutSink {
286    public void publishReadFailure(String znode, String server) {
287      incReadFailureCount();
      LOG.error("Read from {} on {} failed", znode, server);
289    }
290
291    public void publishReadTiming(String znode, String server, long msTime) {
292      LOG.info("Read from {} on {} in {}ms", znode, server, msTime);
293    }
294  }
295
  /**
   * Sink used in 'region' mode (the default); output is grouped by region.
   */
299  public static class RegionStdOutSink extends StdOutSink {
300    private Map<String, LongAdder> perTableReadLatency = new HashMap<>();
301    private LongAdder writeLatency = new LongAdder();
302    private final ConcurrentMap<String, List<RegionTaskResult>> regionMap =
303      new ConcurrentHashMap<>();
304    private ConcurrentMap<ServerName, LongAdder> perServerFailuresCount =
305      new ConcurrentHashMap<>();
306    private ConcurrentMap<String, LongAdder> perTableFailuresCount = new ConcurrentHashMap<>();
307
308    public ConcurrentMap<ServerName, LongAdder> getPerServerFailuresCount() {
309      return perServerFailuresCount;
310    }
311
312    public ConcurrentMap<String, LongAdder> getPerTableFailuresCount() {
313      return perTableFailuresCount;
314    }
315
316    public void resetFailuresCountDetails() {
317      perServerFailuresCount.clear();
318      perTableFailuresCount.clear();
319    }
320
321    private void incFailuresCountDetails(ServerName serverName, RegionInfo region) {
322      perServerFailuresCount.compute(serverName, (server, count) -> {
323        if (count == null) {
324          count = new LongAdder();
325        }
326        count.increment();
327        return count;
328      });
329      perTableFailuresCount.compute(region.getTable().getNameAsString(), (tableName, count) -> {
330        if (count == null) {
331          count = new LongAdder();
332        }
333        count.increment();
334        return count;
335      });
336    }
337
338    public void publishReadFailure(ServerName serverName, RegionInfo region, Exception e) {
339      incReadFailureCount();
340      incFailuresCountDetails(serverName, region);
341      LOG.error("Read from {} on serverName={} failed",
342          region.getRegionNameAsString(), serverName, e);
343    }
344
345    public void publishReadFailure(ServerName serverName, RegionInfo region,
346        ColumnFamilyDescriptor column, Exception e) {
347      incReadFailureCount();
348      incFailuresCountDetails(serverName, region);
349      LOG.error("Read from {} on serverName={}, columnFamily={} failed",
350          region.getRegionNameAsString(), serverName,
351          column.getNameAsString(), e);
352    }
353
354    public void publishReadTiming(ServerName serverName, RegionInfo region,
355        ColumnFamilyDescriptor column, long msTime) {
356      RegionTaskResult rtr = new RegionTaskResult(region, region.getTable(), serverName, column);
357      rtr.setReadSuccess();
358      rtr.setReadLatency(msTime);
359      List<RegionTaskResult> rtrs = regionMap.get(region.getRegionNameAsString());
360      rtrs.add(rtr);
361      // Note that read success count will be equal to total column family read successes.
362      incReadSuccessCount();
363      LOG.info("Read from {} on {} {} in {}ms", region.getRegionNameAsString(), serverName,
364          column.getNameAsString(), msTime);
365    }
366
367    public void publishWriteFailure(ServerName serverName, RegionInfo region, Exception e) {
368      incWriteFailureCount();
369      incFailuresCountDetails(serverName, region);
370      LOG.error("Write to {} on {} failed", region.getRegionNameAsString(), serverName, e);
371    }
372
373    public void publishWriteFailure(ServerName serverName, RegionInfo region,
374        ColumnFamilyDescriptor column, Exception e) {
375      incWriteFailureCount();
376      incFailuresCountDetails(serverName, region);
377      LOG.error("Write to {} on {} {} failed", region.getRegionNameAsString(), serverName,
378          column.getNameAsString(), e);
379    }
380
381    public void publishWriteTiming(ServerName serverName, RegionInfo region,
382        ColumnFamilyDescriptor column, long msTime) {
383      RegionTaskResult rtr = new RegionTaskResult(region, region.getTable(), serverName, column);
384      rtr.setWriteSuccess();
385      rtr.setWriteLatency(msTime);
386      List<RegionTaskResult> rtrs = regionMap.get(region.getRegionNameAsString());
387      rtrs.add(rtr);
388      // Note that write success count will be equal to total column family write successes.
389      incWriteSuccessCount();
390      LOG.info("Write to {} on {} {} in {}ms",
391        region.getRegionNameAsString(), serverName, column.getNameAsString(), msTime);
392    }
393
394    public Map<String, LongAdder> getReadLatencyMap() {
395      return this.perTableReadLatency;
396    }
397
398    public LongAdder initializeAndGetReadLatencyForTable(String tableName) {
399      LongAdder initLatency = new LongAdder();
400      this.perTableReadLatency.put(tableName, initLatency);
401      return initLatency;
402    }
403
404    public void initializeWriteLatency() {
405      this.writeLatency.reset();
406    }
407
408    public LongAdder getWriteLatency() {
409      return this.writeLatency;
410    }
411
412    public ConcurrentMap<String, List<RegionTaskResult>> getRegionMap() {
413      return this.regionMap;
414    }
415
416    public int getTotalExpectedRegions() {
417      return this.regionMap.size();
418    }
419  }
420
421  /**
422   * Run a single zookeeper Task and then exit.
423   */
424  static class ZookeeperTask implements Callable<Void> {
425    private final Connection connection;
426    private final String host;
427    private String znode;
428    private final int timeout;
429    private ZookeeperStdOutSink sink;
430
431    public ZookeeperTask(Connection connection, String host, String znode, int timeout,
432        ZookeeperStdOutSink sink) {
433      this.connection = connection;
434      this.host = host;
435      this.znode = znode;
436      this.timeout = timeout;
437      this.sink = sink;
438    }
439
440    @Override public Void call() throws Exception {
441      ZooKeeper zooKeeper = null;
442      try {
443        zooKeeper = new ZooKeeper(host, timeout, EmptyWatcher.instance);
444        Stat exists = zooKeeper.exists(znode, false);
445        StopWatch stopwatch = new StopWatch();
446        stopwatch.start();
447        zooKeeper.getData(znode, false, exists);
448        stopwatch.stop();
449        sink.publishReadTiming(znode, host, stopwatch.getTime());
450      } catch (KeeperException | InterruptedException e) {
451        sink.publishReadFailure(znode, host);
452      } finally {
453        if (zooKeeper != null) {
454          zooKeeper.close();
455        }
456      }
457      return null;
458    }
459  }
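  // In zookeeper mode one ZookeeperTask is submitted per ensemble host, reading the
  // zookeeper.znode.parent znode (default "/hbase"). Illustrative invocation:
  //   hbase canary -zookeeper -permittedZookeeperFailures 1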
460
461  /**
462   * Run a single Region Task and then exit. For each column family of the Region, get one row and
463   * output latency or failure.
464   */
465  static class RegionTask implements Callable<Void> {
    public enum TaskType {
467      READ, WRITE
468    }
469    private Connection connection;
470    private RegionInfo region;
471    private RegionStdOutSink sink;
472    private TaskType taskType;
473    private boolean rawScanEnabled;
474    private ServerName serverName;
475    private LongAdder readWriteLatency;
476    private boolean readAllCF;
477
478    RegionTask(Connection connection, RegionInfo region, ServerName serverName,
479        RegionStdOutSink sink, TaskType taskType, boolean rawScanEnabled, LongAdder rwLatency,
480        boolean readAllCF) {
481      this.connection = connection;
482      this.region = region;
483      this.serverName = serverName;
484      this.sink = sink;
485      this.taskType = taskType;
486      this.rawScanEnabled = rawScanEnabled;
487      this.readWriteLatency = rwLatency;
488      this.readAllCF = readAllCF;
489    }
490
491    @Override
492    public Void call() {
493      switch (taskType) {
494        case READ:
495          return read();
496        case WRITE:
497          return write();
498        default:
499          return read();
500      }
501    }
502
503    private Void readColumnFamily(Table table, ColumnFamilyDescriptor column) {
504      byte[] startKey = null;
505      Get get = null;
506      Scan scan = null;
507      ResultScanner rs = null;
508      StopWatch stopWatch = new StopWatch();
509      startKey = region.getStartKey();
510      // Can't do a get on empty start row so do a Scan of first element if any instead.
511      if (startKey.length > 0) {
512        get = new Get(startKey);
513        get.setCacheBlocks(false);
514        get.setFilter(new FirstKeyOnlyFilter());
515        get.addFamily(column.getName());
516      } else {
517        scan = new Scan();
518        LOG.debug("rawScan {} for {}", rawScanEnabled, region.getTable());
519        scan.setRaw(rawScanEnabled);
520        scan.setCaching(1);
521        scan.setCacheBlocks(false);
522        scan.setFilter(new FirstKeyOnlyFilter());
523        scan.addFamily(column.getName());
524        scan.setMaxResultSize(1L);
525        scan.setOneRowLimit();
526      }
527      LOG.debug("Reading from {} {} {} {}", region.getTable(), region.getRegionNameAsString(),
528        column.getNameAsString(), Bytes.toStringBinary(startKey));
529      try {
530        stopWatch.start();
531        if (startKey.length > 0) {
532          table.get(get);
533        } else {
534          rs = table.getScanner(scan);
535          rs.next();
536        }
537        stopWatch.stop();
538        this.readWriteLatency.add(stopWatch.getTime());
539        sink.publishReadTiming(serverName, region, column, stopWatch.getTime());
540      } catch (Exception e) {
541        sink.publishReadFailure(serverName, region, column, e);
542        sink.updateReadFailures(region.getRegionNameAsString(),
543          serverName == null ? "NULL" : serverName.getHostname());
544      } finally {
545        if (rs != null) {
546          rs.close();
547        }
548      }
549      return null;
550    }
551
552    private ColumnFamilyDescriptor randomPickOneColumnFamily(ColumnFamilyDescriptor[] cfs) {
553      int size = cfs.length;
554      return cfs[ThreadLocalRandom.current().nextInt(size)];
555
556    }
557
558    public Void read() {
559      Table table = null;
560      TableDescriptor tableDesc = null;
561      try {
562        LOG.debug("Reading table descriptor for table {}", region.getTable());
563        table = connection.getTable(region.getTable());
564        tableDesc = table.getDescriptor();
565      } catch (IOException e) {
        LOG.debug("sniffRegion {} of {} failed", region.getEncodedName(), region.getTable(), e);
        sink.publishReadFailure(serverName, region, e);
        if (table != null) {
          try {
            table.close();
          } catch (IOException ioe) {
            LOG.error("Close table failed", ioe);
573          }
574        }
575        return null;
576      }
577
578      if (readAllCF) {
579        for (ColumnFamilyDescriptor column : tableDesc.getColumnFamilies()) {
580          readColumnFamily(table, column);
581        }
582      } else {
583        readColumnFamily(table, randomPickOneColumnFamily(tableDesc.getColumnFamilies()));
584      }
585      try {
586        table.close();
587      } catch (IOException e) {
588        LOG.error("Close table failed", e);
589      }
590      return null;
591    }
592
593    /**
594     * Check writes for the canary table
595     */
596    private Void write() {
597      Table table = null;
598      TableDescriptor tableDesc = null;
599      try {
600        table = connection.getTable(region.getTable());
601        tableDesc = table.getDescriptor();
602        byte[] rowToCheck = region.getStartKey();
603        if (rowToCheck.length == 0) {
604          rowToCheck = new byte[]{0x0};
605        }
606        int writeValueSize =
607            connection.getConfiguration().getInt(HConstants.HBASE_CANARY_WRITE_VALUE_SIZE_KEY, 10);
608        for (ColumnFamilyDescriptor column : tableDesc.getColumnFamilies()) {
609          Put put = new Put(rowToCheck);
610          byte[] value = new byte[writeValueSize];
611          Bytes.random(value);
612          put.addColumn(column.getName(), HConstants.EMPTY_BYTE_ARRAY, value);
613
614          LOG.debug("Writing to {} {} {} {}",
615            tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(),
616            Bytes.toStringBinary(rowToCheck));
617          try {
618            long startTime = System.currentTimeMillis();
619            table.put(put);
620            long time = System.currentTimeMillis() - startTime;
621            this.readWriteLatency.add(time);
622            sink.publishWriteTiming(serverName, region, column, time);
623          } catch (Exception e) {
624            sink.publishWriteFailure(serverName, region, column, e);
625          }
626        }
627        table.close();
628      } catch (IOException e) {
629        sink.publishWriteFailure(serverName, region, e);
630        sink.updateWriteFailures(region.getRegionNameAsString(), serverName.getHostname());
631      }
632      return null;
633    }
634  }
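  // Write sniffing for region mode is enabled with -writeSniffing; WRITE tasks then put a small
  // random value into every column family of each region of the write table
  // (DEFAULT_WRITE_TABLE_NAME, "hbase:canary", unless overridden with -writeTable). Illustrative:
  //   hbase canary -writeSniffing -writeTableTimeout 60000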
635
636  /**
637   * Run a single RegionServer Task and then exit.
638   * Get one row from a region on the regionserver and output latency or the failure.
639   */
640  static class RegionServerTask implements Callable<Void> {
641    private Connection connection;
642    private String serverName;
643    private RegionInfo region;
644    private RegionServerStdOutSink sink;
645    private AtomicLong successes;
646
647    RegionServerTask(Connection connection, String serverName, RegionInfo region,
648        RegionServerStdOutSink sink, AtomicLong successes) {
649      this.connection = connection;
650      this.serverName = serverName;
651      this.region = region;
652      this.sink = sink;
653      this.successes = successes;
654    }
655
656    @Override
657    public Void call() {
658      TableName tableName = null;
659      Table table = null;
660      Get get = null;
661      byte[] startKey = null;
662      Scan scan = null;
663      StopWatch stopWatch = new StopWatch();
664      // monitor one region on every region server
665      stopWatch.reset();
666      try {
667        tableName = region.getTable();
668        table = connection.getTable(tableName);
669        startKey = region.getStartKey();
670        // Can't do a get on empty start row so do a Scan of first element if any instead.
671        LOG.debug("Reading from {} {} {} {}",
672          serverName, region.getTable(), region.getRegionNameAsString(),
673          Bytes.toStringBinary(startKey));
674        if (startKey.length > 0) {
675          get = new Get(startKey);
676          get.setCacheBlocks(false);
677          get.setFilter(new FirstKeyOnlyFilter());
678          stopWatch.start();
679          table.get(get);
680          stopWatch.stop();
681        } else {
682          scan = new Scan();
683          scan.setCacheBlocks(false);
684          scan.setFilter(new FirstKeyOnlyFilter());
685          scan.setCaching(1);
686          scan.setMaxResultSize(1L);
687          scan.setOneRowLimit();
688          stopWatch.start();
689          ResultScanner s = table.getScanner(scan);
690          s.next();
691          s.close();
692          stopWatch.stop();
693        }
694        successes.incrementAndGet();
695        sink.publishReadTiming(tableName.getNameAsString(), serverName, stopWatch.getTime());
696      } catch (TableNotFoundException tnfe) {
697        LOG.error("Table may be deleted", tnfe);
698        // This is ignored because it doesn't imply that the regionserver is dead
699      } catch (TableNotEnabledException tnee) {
700        // This is considered a success since we got a response.
701        successes.incrementAndGet();
702        LOG.debug("The targeted table was disabled.  Assuming success.");
703      } catch (DoNotRetryIOException dnrioe) {
704        sink.publishReadFailure(tableName.getNameAsString(), serverName);
705        LOG.error(dnrioe.toString(), dnrioe);
706      } catch (IOException e) {
707        sink.publishReadFailure(tableName.getNameAsString(), serverName);
708        LOG.error(e.toString(), e);
709      } finally {
710        if (table != null) {
711          try {
712            table.close();
          } catch (IOException e) {
714            LOG.error("Close table failed", e);
715          }
716        }
717        scan = null;
718        get = null;
719        startKey = null;
720      }
721      return null;
722    }
723  }
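  // In regionserver mode, command-line targets name region servers rather than tables.
  // Illustrative invocations:
  //   hbase canary -regionserver                  - one random region per region server
  //   hbase canary -regionserver -allRegions      - every region on every region server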
724
725  private static final int USAGE_EXIT_CODE = 1;
726  private static final int INIT_ERROR_EXIT_CODE = 2;
727  private static final int TIMEOUT_ERROR_EXIT_CODE = 3;
728  private static final int ERROR_EXIT_CODE = 4;
729  private static final int FAILURE_EXIT_CODE = 5;
730
731  private static final long DEFAULT_INTERVAL = 60000;
732
733  private static final long DEFAULT_TIMEOUT = 600000; // 10 mins
734  private static final int MAX_THREADS_NUM = 16; // #threads to contact regions
735
736  private static final Logger LOG = LoggerFactory.getLogger(Canary.class);
737
738  public static final TableName DEFAULT_WRITE_TABLE_NAME = TableName.valueOf(
739    NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR, "canary");
740
741  private static final String CANARY_TABLE_FAMILY_NAME = "Test";
742
743  private Configuration conf = null;
744  private long interval = 0;
745  private Sink sink = null;
746
747  /**
748   * True if we are to run in 'regionServer' mode.
749   */
750  private boolean regionServerMode = false;
751
  /**
   * True if we are to run in 'zookeeper' mode.
   */
755  private boolean zookeeperMode = false;
756
  /**
   * This is a Map of table to timeout. The timeout is for reading all regions in the table; i.e.
   * the aggregate time to fetch every region in the table must be less than this value, else we
   * log an ERROR.
   */
762  private HashMap<String, Long> configuredReadTableTimeouts = new HashMap<>();
763
764  public static final String HBASE_CANARY_REGIONSERVER_ALL_REGIONS
765          = "hbase.canary.regionserver_all_regions";
766
767  public static final String HBASE_CANARY_REGION_WRITE_SNIFFING
768          = "hbase.canary.region.write.sniffing";
769  public static final String HBASE_CANARY_REGION_WRITE_TABLE_TIMEOUT
770          = "hbase.canary.region.write.table.timeout";
771  public static final String HBASE_CANARY_REGION_WRITE_TABLE_NAME
772          = "hbase.canary.region.write.table.name";
773  public static final String HBASE_CANARY_REGION_READ_TABLE_TIMEOUT
774          = "hbase.canary.region.read.table.timeout";
775
776  public static final String HBASE_CANARY_ZOOKEEPER_PERMITTED_FAILURES
777          = "hbase.canary.zookeeper.permitted.failures";
778
779  public static final String HBASE_CANARY_USE_REGEX = "hbase.canary.use.regex";
780  public static final String HBASE_CANARY_TIMEOUT = "hbase.canary.timeout";
781  public static final String HBASE_CANARY_FAIL_ON_ERROR = "hbase.canary.fail.on.error";
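  // The keys above can also be supplied on the command line with the generic -D option,
  // for example (illustrative values):
  //   hbase canary -Dhbase.canary.timeout=180000 -Dhbase.canary.fail.on.error=false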
782
783
784  private ExecutorService executor; // threads to retrieve data from regionservers
785
786  public CanaryTool() {
787    this(new ScheduledThreadPoolExecutor(1));
788  }
789
790  public CanaryTool(ExecutorService executor) {
791    this(executor, null);
792  }
793
794  @InterfaceAudience.Private
795  CanaryTool(ExecutorService executor, Sink sink) {
796    this.executor = executor;
797    this.sink = sink;
798  }
799
800  CanaryTool(Configuration conf, ExecutorService executor) {
801    this(conf, executor, null);
802  }
803
804  CanaryTool(Configuration conf, ExecutorService executor, Sink sink) {
805    this(executor, sink);
806    setConf(conf);
807  }
808
809  @Override
810  public Configuration getConf() {
811    return conf;
812  }
813
814  @Override
815  public void setConf(Configuration conf) {
816    if (conf == null) {
817      conf = HBaseConfiguration.create();
818    }
819    this.conf = conf;
820  }
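  // Besides the command line, the tool can be driven programmatically through the Canary
  // interface. A minimal sketch (assumes 'conf' is an existing Configuration):
  //   CanaryTool canary = new CanaryTool();
  //   canary.setConf(conf);
  //   int exitCode = canary.checkRegions(new String[] { "myTable" });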
821
822  private int parseArgs(String[] args) {
823    int index = -1;
824    long permittedFailures = 0;
825    boolean regionServerAllRegions = false, writeSniffing = false;
826    String readTableTimeoutsStr = null;
827    // Process command line args
828    for (int i = 0; i < args.length; i++) {
829      String cmd = args[i];
830      if (cmd.startsWith("-")) {
831        if (index >= 0) {
832          // command line args must be in the form: [opts] [table 1 [table 2 ...]]
833          System.err.println("Invalid command line options");
834          printUsageAndExit();
835        }
836        if (cmd.equals("-help") || cmd.equals("-h")) {
837          // user asked for help, print the help and quit.
838          printUsageAndExit();
839        } else if (cmd.equals("-daemon") && interval == 0) {
840          // user asked for daemon mode, set a default interval between checks
841          interval = DEFAULT_INTERVAL;
842        } else if (cmd.equals("-interval")) {
843          // user has specified an interval for canary breaths (-interval N)
844          i++;
845
846          if (i == args.length) {
847            System.err.println("-interval takes a numeric seconds value argument.");
848            printUsageAndExit();
849          }
850          try {
851            interval = Long.parseLong(args[i]) * 1000;
852          } catch (NumberFormatException e) {
853            System.err.println("-interval needs a numeric value argument.");
854            printUsageAndExit();
855          }
856        } else if (cmd.equals("-zookeeper")) {
857          this.zookeeperMode = true;
858        } else if(cmd.equals("-regionserver")) {
859          this.regionServerMode = true;
860        } else if(cmd.equals("-allRegions")) {
861          conf.setBoolean(HBASE_CANARY_REGIONSERVER_ALL_REGIONS, true);
862          regionServerAllRegions = true;
863        } else if(cmd.equals("-writeSniffing")) {
864          writeSniffing = true;
865          conf.setBoolean(HBASE_CANARY_REGION_WRITE_SNIFFING, true);
866        } else if(cmd.equals("-treatFailureAsError") || cmd.equals("-failureAsError")) {
867          conf.setBoolean(HBASE_CANARY_FAIL_ON_ERROR, true);
868        } else if (cmd.equals("-e")) {
869          conf.setBoolean(HBASE_CANARY_USE_REGEX, true);
870        } else if (cmd.equals("-t")) {
871          i++;
872
873          if (i == args.length) {
874            System.err.println("-t takes a numeric milliseconds value argument.");
875            printUsageAndExit();
876          }
877          long timeout = 0;
878          try {
879            timeout = Long.parseLong(args[i]);
880          } catch (NumberFormatException e) {
881            System.err.println("-t takes a numeric milliseconds value argument.");
882            printUsageAndExit();
883          }
884          conf.setLong(HBASE_CANARY_TIMEOUT, timeout);
885        } else if(cmd.equals("-writeTableTimeout")) {
886          i++;
887
888          if (i == args.length) {
889            System.err.println("-writeTableTimeout takes a numeric milliseconds value argument.");
890            printUsageAndExit();
891          }
892          long configuredWriteTableTimeout = 0;
893          try {
894            configuredWriteTableTimeout = Long.parseLong(args[i]);
895          } catch (NumberFormatException e) {
896            System.err.println("-writeTableTimeout takes a numeric milliseconds value argument.");
897            printUsageAndExit();
898          }
899          conf.setLong(HBASE_CANARY_REGION_WRITE_TABLE_TIMEOUT, configuredWriteTableTimeout);
900        } else if (cmd.equals("-writeTable")) {
901          i++;
902
903          if (i == args.length) {
904            System.err.println("-writeTable takes a string tablename value argument.");
905            printUsageAndExit();
906          }
907          conf.set(HBASE_CANARY_REGION_WRITE_TABLE_NAME, args[i]);
908        } else if (cmd.equals("-f")) {
909          i++;
910          if (i == args.length) {
911            System.err
912                .println("-f needs a boolean value argument (true|false).");
913            printUsageAndExit();
914          }
915
916          conf.setBoolean(HBASE_CANARY_FAIL_ON_ERROR, Boolean.parseBoolean(args[i]));
917        } else if (cmd.equals("-readTableTimeouts")) {
918          i++;
919          if (i == args.length) {
920            System.err.println("-readTableTimeouts needs a comma-separated list of read " +
921                "millisecond timeouts per table (without spaces).");
922            printUsageAndExit();
923          }
924          readTableTimeoutsStr = args[i];
925          conf.set(HBASE_CANARY_REGION_READ_TABLE_TIMEOUT, readTableTimeoutsStr);
926        } else if (cmd.equals("-permittedZookeeperFailures")) {
927          i++;
928
929          if (i == args.length) {
930            System.err.println("-permittedZookeeperFailures needs a numeric value argument.");
931            printUsageAndExit();
932          }
933          try {
934            permittedFailures = Long.parseLong(args[i]);
935          } catch (NumberFormatException e) {
936            System.err.println("-permittedZookeeperFailures needs a numeric value argument.");
937            printUsageAndExit();
938          }
939          conf.setLong(HBASE_CANARY_ZOOKEEPER_PERMITTED_FAILURES, permittedFailures);
940        } else {
941          // no options match
          System.err.println(cmd + " is not a valid option.");
943          printUsageAndExit();
944        }
945      } else if (index < 0) {
946        // keep track of first table name specified by the user
947        index = i;
948      }
949    }
950    if (regionServerAllRegions && !this.regionServerMode) {
951      System.err.println("-allRegions can only be specified in regionserver mode.");
952      printUsageAndExit();
953    }
954    if (this.zookeeperMode) {
955      if (this.regionServerMode || regionServerAllRegions || writeSniffing) {
956        System.err.println("-zookeeper is exclusive and cannot be combined with "
957            + "other modes.");
958        printUsageAndExit();
959      }
960    }
961    if (permittedFailures != 0 && !this.zookeeperMode) {
962      System.err.println("-permittedZookeeperFailures requires -zookeeper mode.");
963      printUsageAndExit();
964    }
965    if (readTableTimeoutsStr != null && (this.regionServerMode || this.zookeeperMode)) {
966      System.err.println("-readTableTimeouts can only be configured in region mode.");
967      printUsageAndExit();
968    }
969    return index;
970  }
971
972  @Override
973  public int run(String[] args) throws Exception {
974    int index = parseArgs(args);
975    String[] monitorTargets = null;
976
977    if (index >= 0) {
978      int length = args.length - index;
979      monitorTargets = new String[length];
980      System.arraycopy(args, index, monitorTargets, 0, length);
981    }
982    if (interval > 0) {
983      //Only show the web page in daemon mode
984      putUpWebUI();
985    }
986    if (zookeeperMode) {
987      return checkZooKeeper();
988    } else if (regionServerMode) {
989      return checkRegionServers(monitorTargets);
990    } else {
991      return checkRegions(monitorTargets);
992    }
993  }
994
995  private int runMonitor(String[] monitorTargets) throws Exception {
996    ChoreService choreService = null;
997
998    // Launches chore for refreshing kerberos credentials if security is enabled.
999    // Please see http://hbase.apache.org/book.html#_running_canary_in_a_kerberos_enabled_cluster
1000    // for more details.
1001    final ScheduledChore authChore = AuthUtil.getAuthChore(conf);
1002    if (authChore != null) {
1003      choreService = new ChoreService("CANARY_TOOL");
1004      choreService.scheduleChore(authChore);
1005    }
1006
1007    // Start to prepare the stuffs
1008    Monitor monitor = null;
1009    Thread monitorThread;
1010    long startTime = 0;
1011    long currentTimeLength = 0;
1012    boolean failOnError = conf.getBoolean(HBASE_CANARY_FAIL_ON_ERROR, true);
1013    long timeout = conf.getLong(HBASE_CANARY_TIMEOUT, DEFAULT_TIMEOUT);
1014    // Get a connection to use in below.
1015    try (Connection connection = ConnectionFactory.createConnection(this.conf)) {
1016      do {
1017        // Do monitor !!
1018        try {
1019          monitor = this.newMonitor(connection, monitorTargets);
1020          monitorThread = new Thread(monitor, "CanaryMonitor-" + System.currentTimeMillis());
1021          startTime = System.currentTimeMillis();
1022          monitorThread.start();
1023          while (!monitor.isDone()) {
1024            // wait for 1 sec
1025            Thread.sleep(1000);
1026            // exit if any error occurs
1027            if (failOnError && monitor.hasError()) {
1028              monitorThread.interrupt();
1029              if (monitor.initialized) {
1030                return monitor.errorCode;
1031              } else {
1032                return INIT_ERROR_EXIT_CODE;
1033              }
1034            }
1035            currentTimeLength = System.currentTimeMillis() - startTime;
1036            if (currentTimeLength > timeout) {
              LOG.error("The monitor has been running for " + currentTimeLength
                  + "ms, exceeding the timeout limit of " + timeout
                  + "ms; the canary will exit.");
1040              if (monitor.initialized) {
1041                return TIMEOUT_ERROR_EXIT_CODE;
1042              } else {
1043                return INIT_ERROR_EXIT_CODE;
1044              }
1045            }
1046          }
1047
1048          if (failOnError && monitor.finalCheckForErrors()) {
1049            monitorThread.interrupt();
1050            return monitor.errorCode;
1051          }
1052        } finally {
1053          if (monitor != null) {
1054            monitor.close();
1055          }
1056        }
1057
1058        Thread.sleep(interval);
1059      } while (interval > 0);
1060    } // try-with-resources close
1061
1062    if (choreService != null) {
1063      choreService.shutdown();
1064    }
1065    return monitor.errorCode;
1066  }
1067
1068  @Override
1069  public Map<String, String> getReadFailures()  {
1070    return sink.getReadFailures();
1071  }
1072
1073  @Override
1074  public Map<String, String> getWriteFailures()  {
1075    return sink.getWriteFailures();
1076  }
1077
1078  private void printUsageAndExit() {
    System.err.println(
      "Usage: canary [OPTIONS] [<TABLE1> [<TABLE2>...]] | [<REGIONSERVER1> [<REGIONSERVER2>...]]");
1081    System.err.println("Where [OPTIONS] are:");
1082    System.err.println(" -h,-help        show this help and exit.");
1083    System.err.println(" -regionserver   set 'regionserver mode'; gets row from random region on " +
1084        "server");
1085    System.err.println(" -allRegions     get from ALL regions when 'regionserver mode', not just " +
1086        "random one.");
1087    System.err.println(" -zookeeper      set 'zookeeper mode'; grab zookeeper.znode.parent on " +
1088        "each ensemble member");
1089    System.err.println(" -daemon         continuous check at defined intervals.");
1090    System.err.println(" -interval <N>   interval between checks in seconds");
1091    System.err.println(" -e              consider table/regionserver argument as regular " +
1092        "expression");
1093    System.err.println(" -f <B>          exit on first error; default=true");
1094    System.err.println(" -failureAsError treat read/write failure as error");
1095    System.err.println(" -t <N>          timeout for canary-test run; default=600000ms");
1096    System.err.println(" -writeSniffing  enable write sniffing");
1097    System.err.println(" -writeTable     the table used for write sniffing; default=hbase:canary");
1098    System.err.println(" -writeTableTimeout <N>  timeout for writeTable; default=600000ms");
1099    System.err.println(" -readTableTimeouts <tableName>=<read timeout>," +
1100        "<tableName>=<read timeout>,...");
1101    System.err.println("                comma-separated list of table read timeouts " +
1102        "(no spaces);");
1103    System.err.println("                logs 'ERROR' if takes longer. default=600000ms");
1104    System.err.println(" -permittedZookeeperFailures <N>  Ignore first N failures attempting to ");
1105    System.err.println("                connect to individual zookeeper nodes in ensemble");
1106    System.err.println("");
1107    System.err.println(" -D<configProperty>=<value> to assign or override configuration params");
1108    System.err.println(" -Dhbase.canary.read.raw.enabled=<true/false> Set to enable/disable " +
1109        "raw scan; default=false");
1110    System.err.println(" -Dhbase.canary.info.port=PORT_NUMBER  Set for a Canary UI; " +
1111      "default=-1 (None)");
1112    System.err.println("");
1113    System.err.println("Canary runs in one of three modes: region (default), regionserver, or " +
1114        "zookeeper.");
1115    System.err.println("To sniff/probe all regions, pass no arguments.");
1116    System.err.println("To sniff/probe all regions of a table, pass tablename.");
1117    System.err.println("To sniff/probe regionservers, pass -regionserver, etc.");
1118    System.err.println("See http://hbase.apache.org/book.html#_canary for Canary documentation.");
1119    System.exit(USAGE_EXIT_CODE);
1120  }
1121
1122  Sink getSink(Configuration configuration, Class clazz) {
    // In a test context this.sink may already have been set; if so, use it.
1124    return this.sink != null? this.sink:
1125        (Sink)ReflectionUtils.newInstance(configuration.getClass("hbase.canary.sink.class",
1126            clazz, Sink.class));
1127  }
1128
1129  /**
1130   * Canary region mode-specific data structure which stores information about each region
1131   * to be scanned
1132   */
1133  public static class RegionTaskResult {
1134    private RegionInfo region;
1135    private TableName tableName;
1136    private ServerName serverName;
1137    private ColumnFamilyDescriptor column;
1138    private AtomicLong readLatency = null;
1139    private AtomicLong writeLatency = null;
1140    private boolean readSuccess = false;
1141    private boolean writeSuccess = false;
1142
1143    public RegionTaskResult(RegionInfo region, TableName tableName, ServerName serverName,
1144        ColumnFamilyDescriptor column) {
1145      this.region = region;
1146      this.tableName = tableName;
1147      this.serverName = serverName;
1148      this.column = column;
1149    }
1150
1151    public RegionInfo getRegionInfo() {
1152      return this.region;
1153    }
1154
1155    public String getRegionNameAsString() {
1156      return this.region.getRegionNameAsString();
1157    }
1158
1159    public TableName getTableName() {
1160      return this.tableName;
1161    }
1162
1163    public String getTableNameAsString() {
1164      return this.tableName.getNameAsString();
1165    }
1166
1167    public ServerName getServerName() {
1168      return this.serverName;
1169    }
1170
1171    public String getServerNameAsString() {
1172      return this.serverName.getServerName();
1173    }
1174
1175    public ColumnFamilyDescriptor getColumnFamily() {
1176      return this.column;
1177    }
1178
1179    public String getColumnFamilyNameAsString() {
1180      return this.column.getNameAsString();
1181    }
1182
1183    public long getReadLatency() {
1184      if (this.readLatency == null) {
1185        return -1;
1186      }
1187      return this.readLatency.get();
1188    }
1189
1190    public void setReadLatency(long readLatency) {
1191      if (this.readLatency != null) {
1192        this.readLatency.set(readLatency);
1193      } else {
1194        this.readLatency = new AtomicLong(readLatency);
1195      }
1196    }
1197
1198    public long getWriteLatency() {
1199      if (this.writeLatency == null) {
1200        return -1;
1201      }
1202      return this.writeLatency.get();
1203    }
1204
1205    public void setWriteLatency(long writeLatency) {
1206      if (this.writeLatency != null) {
1207        this.writeLatency.set(writeLatency);
1208      } else {
1209        this.writeLatency = new AtomicLong(writeLatency);
1210      }
1211    }
1212
1213    public boolean isReadSuccess() {
1214      return this.readSuccess;
1215    }
1216
1217    public void setReadSuccess() {
1218      this.readSuccess = true;
1219    }
1220
1221    public boolean isWriteSuccess() {
1222      return this.writeSuccess;
1223    }
1224
1225    public void setWriteSuccess() {
1226      this.writeSuccess = true;
1227    }
1228  }
1229
1230  /**
1231   * A Factory method for {@link Monitor}.
1232   * Makes a RegionServerMonitor, or a ZooKeeperMonitor, or a RegionMonitor.
1233   * @return a Monitor instance
1234   */
1235  private Monitor newMonitor(final Connection connection, String[] monitorTargets) {
1236    Monitor monitor;
1237    boolean useRegExp = conf.getBoolean(HBASE_CANARY_USE_REGEX, false);
1238    boolean regionServerAllRegions
1239            = conf.getBoolean(HBASE_CANARY_REGIONSERVER_ALL_REGIONS, false);
1240    boolean failOnError
1241            = conf.getBoolean(HBASE_CANARY_FAIL_ON_ERROR, true);
1242    int permittedFailures
1243            = conf.getInt(HBASE_CANARY_ZOOKEEPER_PERMITTED_FAILURES, 0);
1244    boolean writeSniffing
1245            = conf.getBoolean(HBASE_CANARY_REGION_WRITE_SNIFFING, false);
1246    String writeTableName = conf.get(HBASE_CANARY_REGION_WRITE_TABLE_NAME,
1247            DEFAULT_WRITE_TABLE_NAME.getNameAsString());
1248    long configuredWriteTableTimeout
1249            = conf.getLong(HBASE_CANARY_REGION_WRITE_TABLE_TIMEOUT, DEFAULT_TIMEOUT);
1250
1251    if (this.regionServerMode) {
1252      monitor =
1253          new RegionServerMonitor(connection, monitorTargets, useRegExp,
1254              getSink(connection.getConfiguration(), RegionServerStdOutSink.class),
1255              this.executor, regionServerAllRegions,
1256              failOnError, permittedFailures);
1257
1258    } else if (this.zookeeperMode) {
1259      monitor =
1260          new ZookeeperMonitor(connection, monitorTargets, useRegExp,
1261              getSink(connection.getConfiguration(), ZookeeperStdOutSink.class),
1262              this.executor, failOnError, permittedFailures);
1263    } else {
1264      monitor =
1265          new RegionMonitor(connection, monitorTargets, useRegExp,
1266              getSink(connection.getConfiguration(), RegionStdOutSink.class),
1267              this.executor, writeSniffing,
1268              TableName.valueOf(writeTableName), failOnError, configuredReadTableTimeouts,
1269              configuredWriteTableTimeout, permittedFailures);
1270    }
1271    return monitor;
1272  }
1273
1274  private void populateReadTableTimeoutsMap(String configuredReadTableTimeoutsStr) {
1275    String[] tableTimeouts = configuredReadTableTimeoutsStr.split(",");
1276    for (String tT : tableTimeouts) {
1277      String[] nameTimeout = tT.split("=");
1278      if (nameTimeout.length < 2) {
1279        throw new IllegalArgumentException("Each -readTableTimeouts argument must be of the form " +
1280            "<tableName>=<read timeout> (without spaces).");
1281      }
1282      long timeoutVal;
1283      try {
1284        timeoutVal = Long.parseLong(nameTimeout[1]);
1285      } catch (NumberFormatException e) {
        throw new IllegalArgumentException("-readTableTimeouts requires a numeric read timeout " +
            "value for each table.");
1288      }
1289      configuredReadTableTimeouts.put(nameTimeout[0], timeoutVal);
1290    }
1291  }
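  // For example, "-readTableTimeouts tableA=60000,tableB=30000" populates
  // configuredReadTableTimeouts with {"tableA" -> 60000, "tableB" -> 30000}; each table named
  // here must also be passed as a monitor target (see the check in RegionMonitor.run()).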
  /**
   * A Monitor superclass that users can extend to implement their own monitors; an illustrative
   * sketch follows this class.
   */
1295  public static abstract class Monitor implements Runnable, Closeable {
1296    protected Connection connection;
1297    protected Admin admin;
1298    /**
1299     * 'Target' dependent on 'mode'. Could be Tables or RegionServers or ZNodes.
1300     * Passed on the command-line as arguments.
1301     */
1302    protected String[] targets;
1303    protected boolean useRegExp;
1304    protected boolean treatFailureAsError;
1305    protected boolean initialized = false;
1306
1307    protected boolean done = false;
1308    protected int errorCode = 0;
1309    protected long allowedFailures = 0;
1310    protected Sink sink;
1311    protected ExecutorService executor;
1312
1313    public boolean isDone() {
1314      return done;
1315    }
1316
1317    public boolean hasError() {
1318      return errorCode != 0;
1319    }
1320
1321    public boolean finalCheckForErrors() {
1322      if (errorCode != 0) {
1323        return true;
1324      }
1325      if (treatFailureAsError && (sink.getReadFailureCount() > allowedFailures
1326          || sink.getWriteFailureCount() > allowedFailures)) {
1327        LOG.error("Too many failures detected, treating failure as error, failing the Canary.");
1328        errorCode = FAILURE_EXIT_CODE;
1329        return true;
1330      }
1331      return false;
1332    }
1333
1334    @Override
1335    public void close() throws IOException {
1336      if (this.admin != null) {
1337        this.admin.close();
1338      }
1339    }
1340
1341    protected Monitor(Connection connection, String[] monitorTargets, boolean useRegExp, Sink sink,
1342        ExecutorService executor, boolean treatFailureAsError, long allowedFailures) {
1343      if (null == connection) {
1344        throw new IllegalArgumentException("connection shall not be null");
1345      }
1346
1347      this.connection = connection;
1348      this.targets = monitorTargets;
1349      this.useRegExp = useRegExp;
1350      this.treatFailureAsError = treatFailureAsError;
1351      this.sink = sink;
1352      this.executor = executor;
1353      this.allowedFailures = allowedFailures;
1354    }
1355
1356    @Override
1357    public abstract void run();
1358
1359    protected boolean initAdmin() {
1360      if (null == this.admin) {
1361        try {
1362          this.admin = this.connection.getAdmin();
1363        } catch (Exception e) {
          LOG.error("Initializing HBaseAdmin failed...", e);
1365          this.errorCode = INIT_ERROR_EXIT_CODE;
1366        }
1367      } else if (admin.isAborted()) {
1368        LOG.error("HBaseAdmin aborted");
1369        this.errorCode = INIT_ERROR_EXIT_CODE;
1370      }
1371      return !this.hasError();
1372    }
1373  }
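  // Illustrative sketch of a user-supplied Monitor subclass (hypothetical, not part of the tool):
  //   public class NoOpMonitor extends CanaryTool.Monitor {
  //     public NoOpMonitor(Connection connection, ExecutorService executor) {
  //       super(connection, null, false, new CanaryTool.StdOutSink(), executor, false, 0);
  //     }
  //     @Override
  //     public void run() {
  //       if (initAdmin()) {
  //         // a real monitor would submit Callables to 'executor' and record results in 'sink'
  //         done = true;
  //       }
  //     }
  //   }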
1374
1375  /**
1376   * A monitor for region mode.
1377   */
1378  private static class RegionMonitor extends Monitor {
1379    // 10 minutes
1380    private static final int DEFAULT_WRITE_TABLE_CHECK_PERIOD = 10 * 60 * 1000;
    // 1 day
1382    private static final int DEFAULT_WRITE_DATA_TTL = 24 * 60 * 60;
1383
1384    private long lastCheckTime = -1;
1385    private boolean writeSniffing;
1386    private TableName writeTableName;
1387    private int writeDataTTL;
1388    private float regionsLowerLimit;
1389    private float regionsUpperLimit;
1390    private int checkPeriod;
1391    private boolean rawScanEnabled;
1392    private boolean readAllCF;
1393
    /**
     * This is a timeout per table. If the aggregated reads of all regions in the table take
     * longer than what is configured here, we log an ERROR rather than just an INFO.
     */
1398    private HashMap<String, Long> configuredReadTableTimeouts;
1399
1400    private long configuredWriteTableTimeout;
1401
1402    public RegionMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
1403        Sink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName,
1404        boolean treatFailureAsError, HashMap<String, Long> configuredReadTableTimeouts,
1405        long configuredWriteTableTimeout,
1406        long allowedFailures) {
1407      super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError,
1408          allowedFailures);
1409      Configuration conf = connection.getConfiguration();
1410      this.writeSniffing = writeSniffing;
1411      this.writeTableName = writeTableName;
1412      this.writeDataTTL =
1413          conf.getInt(HConstants.HBASE_CANARY_WRITE_DATA_TTL_KEY, DEFAULT_WRITE_DATA_TTL);
1414      this.regionsLowerLimit =
1415          conf.getFloat(HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY, 1.0f);
1416      this.regionsUpperLimit =
1417          conf.getFloat(HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_UPPERLIMIT_KEY, 1.5f);
1418      this.checkPeriod =
1419          conf.getInt(HConstants.HBASE_CANARY_WRITE_TABLE_CHECK_PERIOD_KEY,
1420            DEFAULT_WRITE_TABLE_CHECK_PERIOD);
1421      this.rawScanEnabled = conf.getBoolean(HConstants.HBASE_CANARY_READ_RAW_SCAN_KEY, false);
1422      this.configuredReadTableTimeouts = new HashMap<>(configuredReadTableTimeouts);
1423      this.configuredWriteTableTimeout = configuredWriteTableTimeout;
1424      this.readAllCF = conf.getBoolean(HConstants.HBASE_CANARY_READ_ALL_CF, true);
1425    }
1426
1427    private RegionStdOutSink getSink() {
1428      if (!(sink instanceof RegionStdOutSink)) {
1429        throw new RuntimeException("Can only write to Region sink");
1430      }
1431      return ((RegionStdOutSink) sink);
1432    }
1433
1434    @Override
1435    public void run() {
1436      if (this.initAdmin()) {
1437        try {
1438          List<Future<Void>> taskFutures = new LinkedList<>();
1439          RegionStdOutSink regionSink = this.getSink();
1440          regionSink.resetFailuresCountDetails();
1441          if (this.targets != null && this.targets.length > 0) {
1442            String[] tables = generateMonitorTables(this.targets);
1443            // Check to see that each table name passed in the -readTableTimeouts argument is also
1444            // passed as a monitor target.
1445            if (!new HashSet<>(Arrays.asList(tables)).
1446                containsAll(this.configuredReadTableTimeouts.keySet())) {
1447              LOG.error("-readTableTimeouts can only specify read timeouts for monitor targets " +
1448                  "passed via command line.");
1449              this.errorCode = USAGE_EXIT_CODE;
1450              return;
1451            }
1452            this.initialized = true;
1453            for (String table : tables) {
1454              LongAdder readLatency = regionSink.initializeAndGetReadLatencyForTable(table);
1455              taskFutures.addAll(CanaryTool.sniff(admin, regionSink, table, executor, TaskType.READ,
1456                this.rawScanEnabled, readLatency, readAllCF));
1457            }
1458          } else {
1459            taskFutures.addAll(sniff(TaskType.READ, regionSink));
1460          }
1461
1462          if (writeSniffing) {
1463            if (EnvironmentEdgeManager.currentTime() - lastCheckTime > checkPeriod) {
1464              try {
1465                checkWriteTableDistribution();
1466              } catch (IOException e) {
                LOG.error("Checking canary table distribution failed!", e);
1468              }
1469              lastCheckTime = EnvironmentEdgeManager.currentTime();
1470            }
1471            // sniff canary table with write operation
1472            regionSink.initializeWriteLatency();
1473            LongAdder writeTableLatency = regionSink.getWriteLatency();
1474            taskFutures
1475                .addAll(CanaryTool.sniff(admin, regionSink, admin.getDescriptor(writeTableName),
1476                  executor, TaskType.WRITE, this.rawScanEnabled, writeTableLatency, readAllCF));
1477          }
1478
1479          for (Future<Void> future : taskFutures) {
1480            try {
1481              future.get();
1482            } catch (ExecutionException e) {
1483              LOG.error("Sniff region failed!", e);
1484            }
1485          }
1486          Map<String, LongAdder> actualReadTableLatency = regionSink.getReadLatencyMap();
1487          for (Map.Entry<String, Long> entry : configuredReadTableTimeouts.entrySet()) {
1488            String tableName = entry.getKey();
1489            if (actualReadTableLatency.containsKey(tableName)) {
              long actual = actualReadTableLatency.get(tableName).longValue();
              long configured = entry.getValue();
1492              if (actual > configured) {
                LOG.error("Read operation for {} took {}ms and exceeded the configured read " +
                    "timeout of {}ms.", tableName, actual, configured);
1495              } else {
                LOG.info("Read operation for {} took {}ms (configured read timeout {}ms).",
                    tableName, actual, configured);
1498              }
1499            } else {
1500              LOG.error("Read operation for {} failed!", tableName);
1501            }
1502          }
1503          if (this.writeSniffing) {
1504            String writeTableStringName = this.writeTableName.getNameAsString();
1505            long actualWriteLatency = regionSink.getWriteLatency().longValue();
1506            LOG.info("Write operation for {} took {}ms. Configured write timeout {}ms.",
1507                writeTableStringName, actualWriteLatency, this.configuredWriteTableTimeout);
1508            // Check that the writeTable write operation latency does not exceed the configured
1509            // timeout.
1510            if (actualWriteLatency > this.configuredWriteTableTimeout) {
1511              LOG.error("Write operation for {} exceeded the configured write timeout.",
1512                  writeTableStringName);
1513            }
1514          }
1515        } catch (Exception e) {
1516          LOG.error("Run regionMonitor failed", e);
1517          this.errorCode = ERROR_EXIT_CODE;
1518        } finally {
1519          this.done = true;
1520        }
1521      }
1522      this.done = true;
1523    }
1524
    /**
     * @return The list of table names to check: the literal monitor targets, or the names of the
     *   tables matching the target patterns when regular expression matching is enabled.
     */
1528    private String[] generateMonitorTables(String[] monitorTargets) throws IOException {
1529      String[] returnTables = null;
1530
1531      if (this.useRegExp) {
        Pattern pattern = null;
        List<TableDescriptor> tds = null;
        Set<String> tmpTables = new TreeSet<>();
        try {
          LOG.debug("Reading list of tables");
          // Fetch all table descriptors once; the target patterns are applied client-side below.
          tds = this.admin.listTableDescriptors();
1538          if (tds == null) {
1539            tds = Collections.emptyList();
1540          }
1541          for (String monitorTarget : monitorTargets) {
1542            pattern = Pattern.compile(monitorTarget);
1543            for (TableDescriptor td : tds) {
1544              if (pattern.matcher(td.getTableName().getNameAsString()).matches()) {
1545                tmpTables.add(td.getTableName().getNameAsString());
1546              }
1547            }
1548          }
1549        } catch (IOException e) {
          LOG.error("Communication with admin failed", e);
1551          throw e;
1552        }
1553
        if (!tmpTables.isEmpty()) {
          returnTables = tmpTables.toArray(new String[tmpTables.size()]);
        } else {
          String msg = "No table found, table patterns: " + Arrays.toString(monitorTargets);
1558          LOG.error(msg);
1559          this.errorCode = INIT_ERROR_EXIT_CODE;
1560          throw new TableNotFoundException(msg);
1561        }
1562      } else {
1563        returnTables = monitorTargets;
1564      }
1565
1566      return returnTables;
1567    }
1568
    /*
     * Canary entry point to monitor all enabled tables (excluding the canary write table).
     */
1572    private List<Future<Void>> sniff(TaskType taskType, RegionStdOutSink regionSink)
1573        throws Exception {
1574      LOG.debug("Reading list of tables");
1575      List<Future<Void>> taskFutures = new LinkedList<>();
1576      for (TableDescriptor td: admin.listTableDescriptors()) {
1577        if (admin.tableExists(td.getTableName()) && admin.isTableEnabled(td.getTableName()) &&
1578            (!td.getTableName().equals(writeTableName))) {
1579          LongAdder readLatency =
1580              regionSink.initializeAndGetReadLatencyForTable(td.getTableName().getNameAsString());
1581          taskFutures.addAll(CanaryTool.sniff(admin, sink, td, executor, taskType,
1582            this.rawScanEnabled, readLatency, readAllCF));
1583        }
1584      }
1585      return taskFutures;
1586    }
1587
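    /**
     * Ensures the canary write table exists, is enabled, and has between regionsLowerLimit and
     * regionsUpperLimit regions per live region server; recreates the table or triggers a balance
     * when the distribution is off.
     */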
1588    private void checkWriteTableDistribution() throws IOException {
1589      if (!admin.tableExists(writeTableName)) {
1590        int numberOfServers = admin.getRegionServers().size();
1591        if (numberOfServers == 0) {
1592          throw new IllegalStateException("No live regionservers");
1593        }
1594        createWriteTable(numberOfServers);
1595      }
1596
1597      if (!admin.isTableEnabled(writeTableName)) {
1598        admin.enableTable(writeTableName);
1599      }
1600
1601      ClusterMetrics status =
1602          admin.getClusterMetrics(EnumSet.of(Option.SERVERS_NAME, Option.MASTER));
1603      int numberOfServers = status.getServersName().size();
1604      if (status.getServersName().contains(status.getMasterName())) {
1605        numberOfServers -= 1;
1606      }
1607
1608      List<Pair<RegionInfo, ServerName>> pairs =
1609          MetaTableAccessor.getTableRegionsAndLocations(connection, writeTableName);
1610      int numberOfRegions = pairs.size();
1611      if (numberOfRegions < numberOfServers * regionsLowerLimit
1612          || numberOfRegions > numberOfServers * regionsUpperLimit) {
1613        admin.disableTable(writeTableName);
1614        admin.deleteTable(writeTableName);
1615        createWriteTable(numberOfServers);
1616      }
1617      HashSet<ServerName> serverSet = new HashSet<>();
1618      for (Pair<RegionInfo, ServerName> pair : pairs) {
1619        serverSet.add(pair.getSecond());
1620      }
1621      int numberOfCoveredServers = serverSet.size();
1622      if (numberOfCoveredServers < numberOfServers) {
1623        admin.balance();
1624      }
1625    }
1626
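    /**
     * Creates the canary write table pre-split into numberOfServers * regionsLowerLimit regions,
     * with a single column family that keeps one version and expires cells after writeDataTTL
     * seconds.
     */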
1627    private void createWriteTable(int numberOfServers) throws IOException {
1628      int numberOfRegions = (int)(numberOfServers * regionsLowerLimit);
1629      LOG.info("Number of live regionservers {}, pre-splitting the canary table into {} regions " +
1630        "(current lower limit of regions per server is {} and you can change it with config {}).",
1631          numberOfServers, numberOfRegions, regionsLowerLimit,
1632          HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY);
1633      HTableDescriptor desc = new HTableDescriptor(writeTableName);
1634      HColumnDescriptor family = new HColumnDescriptor(CANARY_TABLE_FAMILY_NAME);
1635      family.setMaxVersions(1);
1636      family.setTimeToLive(writeDataTTL);
1637
1638      desc.addFamily(family);
1639      byte[][] splits = new RegionSplitter.HexStringSplit().split(numberOfRegions);
1640      admin.createTable(desc, splits);
1641    }
1642  }
1643
  /**
   * Canary entry point for the specified table.
   * @throws Exception if the table descriptor cannot be fetched or the sniff tasks cannot be
   *   submitted
   */
1648  private static List<Future<Void>> sniff(final Admin admin, final Sink sink, String tableName,
1649      ExecutorService executor, TaskType taskType, boolean rawScanEnabled, LongAdder readLatency,
1650      boolean readAllCF) throws Exception {
1651    LOG.debug("Checking table is enabled and getting table descriptor for table {}", tableName);
1652    if (admin.isTableEnabled(TableName.valueOf(tableName))) {
1653      return CanaryTool.sniff(admin, sink, admin.getDescriptor(TableName.valueOf(tableName)),
1654        executor, taskType, rawScanEnabled, readLatency, readAllCF);
1655    } else {
1656      LOG.warn("Table {} is not enabled", tableName);
1657    }
1658    return new LinkedList<>();
1659  }
1660
  /*
   * Loops over the regions of the table, submitting one read/write task per region and recording
   * the results in the region sink.
   */
1664  private static List<Future<Void>> sniff(final Admin admin, final Sink sink,
1665      TableDescriptor tableDesc, ExecutorService executor, TaskType taskType,
1666      boolean rawScanEnabled, LongAdder rwLatency, boolean readAllCF) throws Exception {
1667    LOG.debug("Reading list of regions for table {}", tableDesc.getTableName());
1668    try (Table table = admin.getConnection().getTable(tableDesc.getTableName())) {
1669      List<RegionTask> tasks = new ArrayList<>();
1670      try (RegionLocator regionLocator =
1671               admin.getConnection().getRegionLocator(tableDesc.getTableName())) {
1672        for (HRegionLocation location: regionLocator.getAllRegionLocations()) {
1673          if (location == null) {
1674            LOG.warn("Null location");
1675            continue;
1676          }
1677          ServerName rs = location.getServerName();
1678          RegionInfo region = location.getRegion();
1679          tasks.add(new RegionTask(admin.getConnection(), region, rs, (RegionStdOutSink)sink,
1680              taskType, rawScanEnabled, rwLatency, readAllCF));
1681          Map<String, List<RegionTaskResult>> regionMap = ((RegionStdOutSink) sink).getRegionMap();
1682          regionMap.put(region.getRegionNameAsString(), new ArrayList<RegionTaskResult>());
1683        }
1684        return executor.invokeAll(tasks);
1685      }
1686    } catch (TableNotFoundException e) {
      return Collections.emptyList();
1688    }
1689  }
1690
  /**
   * A monitor for ZooKeeper mode.
   */
1692  private static class ZookeeperMonitor extends Monitor {
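    /** Socket address strings of the ZooKeeper quorum members to probe. */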
1693    private List<String> hosts;
1694    private final String znode;
1695    private final int timeout;
1696
1697    protected ZookeeperMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
1698        Sink sink, ExecutorService executor, boolean treatFailureAsError, long allowedFailures)  {
1699      super(connection, monitorTargets, useRegExp,
1700          sink, executor, treatFailureAsError, allowedFailures);
1701      Configuration configuration = connection.getConfiguration();
1702      znode =
1703          configuration.get(ZOOKEEPER_ZNODE_PARENT,
1704              DEFAULT_ZOOKEEPER_ZNODE_PARENT);
1705      timeout = configuration
1706          .getInt(HConstants.ZK_SESSION_TIMEOUT, HConstants.DEFAULT_ZK_SESSION_TIMEOUT);
1707      ConnectStringParser parser =
1708          new ConnectStringParser(ZKConfig.getZKQuorumServersString(configuration));
1709      hosts = Lists.newArrayList();
1710      for (InetSocketAddress server : parser.getServerAddresses()) {
1711        hosts.add(server.toString());
1712      }
1713      if (allowedFailures > (hosts.size() - 1) / 2) {
        LOG.warn(
          "Check the allowed number of failed ZooKeeper nodes: with {} allowed failures in an "
              + "ensemble of {}, quorum would already be lost.",
          allowedFailures, hosts.size());
1718      }
1719    }
1720
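    /**
     * Probes every ZooKeeper quorum host by reading the configured base znode and records any
     * failures in the ZooKeeper sink.
     */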
    @Override
    public void run() {
1722      List<ZookeeperTask> tasks = Lists.newArrayList();
1723      ZookeeperStdOutSink zkSink = null;
1724      try {
1725        zkSink = this.getSink();
      } catch (RuntimeException e) {
        LOG.error("Run ZooKeeperMonitor failed!", e);
        this.errorCode = ERROR_EXIT_CODE;
        // Without a usable ZooKeeper sink there is nothing to report to, so stop here rather
        // than submitting tasks against a null sink.
        this.done = true;
        return;
      }
1730      this.initialized = true;
1731      for (final String host : hosts) {
1732        tasks.add(new ZookeeperTask(connection, host, znode, timeout, zkSink));
1733      }
1734      try {
1735        for (Future<Void> future : this.executor.invokeAll(tasks)) {
1736          try {
1737            future.get();
1738          } catch (ExecutionException e) {
1739            LOG.error("Sniff zookeeper failed!", e);
1740            this.errorCode = ERROR_EXIT_CODE;
1741          }
1742        }
1743      } catch (InterruptedException e) {
1744        this.errorCode = ERROR_EXIT_CODE;
1745        Thread.currentThread().interrupt();
1746        LOG.error("Sniff zookeeper interrupted!", e);
1747      }
1748      this.done = true;
1749    }
1750
1751    private ZookeeperStdOutSink getSink() {
1752      if (!(sink instanceof ZookeeperStdOutSink)) {
1753        throw new RuntimeException("Can only write to zookeeper sink");
1754      }
1755      return ((ZookeeperStdOutSink) sink);
1756    }
1757  }
1758
1759
  /**
   * A monitor for regionserver mode.
   */
1763  private static class RegionServerMonitor extends Monitor {
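    /** Whether to probe every region on each targeted server instead of one random region. */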
1764    private boolean allRegions;
1765
1766    public RegionServerMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
1767        Sink sink, ExecutorService executor, boolean allRegions,
1768        boolean treatFailureAsError, long allowedFailures) {
1769      super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError,
1770          allowedFailures);
1771      this.allRegions = allRegions;
1772    }
1773
1774    private RegionServerStdOutSink getSink() {
1775      if (!(sink instanceof RegionServerStdOutSink)) {
1776        throw new RuntimeException("Can only write to regionserver sink");
1777      }
1778      return ((RegionServerStdOutSink) sink);
1779    }
1780
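    /**
     * Runs one regionserver-mode pass: groups regions by region server, filters the servers by
     * the monitor targets, and reads from one region (or from every region when the allRegions
     * flag is set) on each selected server.
     */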
1781    @Override
1782    public void run() {
1783      if (this.initAdmin() && this.checkNoTableNames()) {
1784        RegionServerStdOutSink regionServerSink = null;
1785        try {
1786          regionServerSink = this.getSink();
        } catch (RuntimeException e) {
          LOG.error("Run RegionServerMonitor failed!", e);
          this.errorCode = ERROR_EXIT_CODE;
          // A null sink cannot record results, so stop here rather than monitoring with it.
          this.done = true;
          return;
        }
1791        Map<String, List<RegionInfo>> rsAndRMap = this.filterRegionServerByName();
1792        this.initialized = true;
1793        this.monitorRegionServers(rsAndRMap, regionServerSink);
1794      }
1795      this.done = true;
1796    }
1797
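    /**
     * Verifies that none of the monitor targets is an existing table name: in regionserver mode
     * the targets must be region server hostnames or patterns, not tables.
     */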
1798    private boolean checkNoTableNames() {
1799      List<String> foundTableNames = new ArrayList<>();
1800      TableName[] tableNames = null;
1801      LOG.debug("Reading list of tables");
1802      try {
1803        tableNames = this.admin.listTableNames();
1804      } catch (IOException e) {
        LOG.error("Listing table names failed", e);
1806        this.errorCode = INIT_ERROR_EXIT_CODE;
1807        return false;
1808      }
1809
1810      if (this.targets == null || this.targets.length == 0) {
1811        return true;
1812      }
1813
1814      for (String target : this.targets) {
1815        for (TableName tableName : tableNames) {
1816          if (target.equals(tableName.getNameAsString())) {
1817            foundTableNames.add(target);
1818          }
1819        }
1820      }
1821
      if (!foundTableNames.isEmpty()) {
        System.err.println("Cannot pass a table name when using the -regionserver " +
            "option; table names given: " + foundTableNames);
1825        this.errorCode = USAGE_EXIT_CODE;
1826      }
1827      return foundTableNames.isEmpty();
1828    }
1829
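    /**
     * Submits a RegionServerTask for each selected region, waits for all of them, and logs the
     * per-server success counts when every region is being probed.
     */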
1830    private void monitorRegionServers(Map<String, List<RegionInfo>> rsAndRMap,
1831        RegionServerStdOutSink regionServerSink) {
1832      List<RegionServerTask> tasks = new ArrayList<>();
1833      Map<String, AtomicLong> successMap = new HashMap<>();
1834      Random rand = new Random();
1835      for (Map.Entry<String, List<RegionInfo>> entry : rsAndRMap.entrySet()) {
1836        String serverName = entry.getKey();
1837        AtomicLong successes = new AtomicLong(0);
1838        successMap.put(serverName, successes);
1839        if (entry.getValue().isEmpty()) {
1840          LOG.error("Regionserver not serving any regions - {}", serverName);
1841        } else if (this.allRegions) {
1842          for (RegionInfo region : entry.getValue()) {
1843            tasks.add(new RegionServerTask(this.connection,
1844                serverName,
1845                region,
1846                regionServerSink,
1847                successes));
1848          }
1849        } else {
          // randomly select one region per server when the allRegions flag is not set
1851          RegionInfo region = entry.getValue().get(rand.nextInt(entry.getValue().size()));
1852          tasks.add(new RegionServerTask(this.connection,
1853              serverName,
1854              region,
1855              regionServerSink,
1856              successes));
1857        }
1858      }
1859      try {
1860        for (Future<Void> future : this.executor.invokeAll(tasks)) {
1861          try {
1862            future.get();
1863          } catch (ExecutionException e) {
1864            LOG.error("Sniff regionserver failed!", e);
1865            this.errorCode = ERROR_EXIT_CODE;
1866          }
1867        }
1868        if (this.allRegions) {
1869          for (Map.Entry<String, List<RegionInfo>> entry : rsAndRMap.entrySet()) {
1870            String serverName = entry.getKey();
1871            LOG.info("Successfully read {} regions out of {} on regionserver {}",
1872                successMap.get(serverName), entry.getValue().size(), serverName);
1873          }
1874        }
      } catch (InterruptedException e) {
        this.errorCode = ERROR_EXIT_CODE;
        Thread.currentThread().interrupt();
        LOG.error("Sniff regionserver interrupted!", e);
      }
1879    }
1880
1881    private Map<String, List<RegionInfo>> filterRegionServerByName() {
1882      Map<String, List<RegionInfo>> regionServerAndRegionsMap = this.getAllRegionServerByName();
1883      regionServerAndRegionsMap = this.doFilterRegionServerByName(regionServerAndRegionsMap);
1884      return regionServerAndRegionsMap;
1885    }
1886
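    /**
     * Builds a map from region server hostname to the regions it hosts, including live servers
     * that currently host no regions.
     */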
1887    private Map<String, List<RegionInfo>> getAllRegionServerByName() {
1888      Map<String, List<RegionInfo>> rsAndRMap = new HashMap<>();
1889      try {
1890        LOG.debug("Reading list of tables and locations");
1891        List<TableDescriptor> tableDescs = this.admin.listTableDescriptors();
1892        List<RegionInfo> regions = null;
1893        for (TableDescriptor tableDesc: tableDescs) {
1894          try (RegionLocator regionLocator =
1895                   this.admin.getConnection().getRegionLocator(tableDesc.getTableName())) {
1896            for (HRegionLocation location : regionLocator.getAllRegionLocations()) {
1897              if (location == null) {
1898                LOG.warn("Null location");
1899                continue;
1900              }
1901              ServerName rs = location.getServerName();
1902              String rsName = rs.getHostname();
1903              RegionInfo r = location.getRegion();
1904              if (rsAndRMap.containsKey(rsName)) {
1905                regions = rsAndRMap.get(rsName);
1906              } else {
1907                regions = new ArrayList<>();
1908                rsAndRMap.put(rsName, regions);
1909              }
1910              regions.add(r);
1911            }
1912          }
1913        }
1914
1915        // get any live regionservers not serving any regions
1916        for (ServerName rs: this.admin.getRegionServers()) {
1917          String rsName = rs.getHostname();
1918          if (!rsAndRMap.containsKey(rsName)) {
1919            rsAndRMap.put(rsName, Collections.<RegionInfo> emptyList());
1920          }
1921        }
1922      } catch (IOException e) {
        LOG.error("Listing tables and region locations failed", e);
1924        this.errorCode = INIT_ERROR_EXIT_CODE;
1925      }
1926      return rsAndRMap;
1927    }
1928
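    /**
     * Restricts the server-to-regions map to the monitor targets, treating each target as a
     * regular expression when useRegExp is set; with no targets, the full map is returned.
     */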
1929    private Map<String, List<RegionInfo>> doFilterRegionServerByName(
1930        Map<String, List<RegionInfo>> fullRsAndRMap) {
1931
1932      Map<String, List<RegionInfo>> filteredRsAndRMap = null;
1933
1934      if (this.targets != null && this.targets.length > 0) {
1935        filteredRsAndRMap = new HashMap<>();
1936        Pattern pattern = null;
1937        Matcher matcher = null;
1938        boolean regExpFound = false;
1939        for (String rsName : this.targets) {
1940          if (this.useRegExp) {
1941            regExpFound = false;
1942            pattern = Pattern.compile(rsName);
1943            for (Map.Entry<String, List<RegionInfo>> entry : fullRsAndRMap.entrySet()) {
1944              matcher = pattern.matcher(entry.getKey());
1945              if (matcher.matches()) {
1946                filteredRsAndRMap.put(entry.getKey(), entry.getValue());
1947                regExpFound = true;
1948              }
1949            }
1950            if (!regExpFound) {
1951              LOG.info("No RegionServerInfo found, regionServerPattern {}", rsName);
1952            }
1953          } else {
1954            if (fullRsAndRMap.containsKey(rsName)) {
1955              filteredRsAndRMap.put(rsName, fullRsAndRMap.get(rsName));
1956            } else {
1957              LOG.info("No RegionServerInfo found, regionServerName {}", rsName);
1958            }
1959          }
1960        }
1961      } else {
1962        filteredRsAndRMap = fullRsAndRMap;
1963      }
1964      return filteredRsAndRMap;
1965    }
1966  }
1967
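  /**
   * Command-line entry point: builds the configuration, sizes the task executor from
   * hbase.canary.threads.num, and exits with the code returned by ToolRunner.
   */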
1968  public static void main(String[] args) throws Exception {
1969    final Configuration conf = HBaseConfiguration.create();
1970
1971    int numThreads = conf.getInt("hbase.canary.threads.num", MAX_THREADS_NUM);
1972    LOG.info("Execution thread count={}", numThreads);
1973
1974    int exitCode;
1975    ExecutorService executor = new ScheduledThreadPoolExecutor(numThreads);
1976    try {
1977      exitCode = ToolRunner.run(conf, new CanaryTool(executor), args);
1978    } finally {
1979      executor.shutdown();
1980    }
1981    System.exit(exitCode);
1982  }
1983}