001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase;
019
020import java.io.Closeable;
021import java.io.IOException;
022import org.apache.hadoop.conf.Configurable;
023import org.apache.hadoop.conf.Configuration;
024import org.apache.hadoop.hbase.client.RegionInfoBuilder;
025import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
026import org.apache.hadoop.hbase.util.Threads;
027import org.apache.yetus.audience.InterfaceAudience;
028import org.apache.yetus.audience.InterfaceStability;
029
030/**
031 * This class defines methods that can help with managing HBase clusters from unit tests and system
032 * tests. There are 3 types of cluster deployments:
033 * <ul>
034 * <li><b>SingleProcessHBaseCluster:</b> each server is run in the same JVM in separate threads,
035 * used by unit tests</li>
036 * <li><b>DistributedHBaseCluster:</b> the cluster is pre-deployed, system and integration tests can
037 * interact with the cluster.</li>
038 * <li><b>ProcessBasedLocalHBaseCluster:</b> each server is deployed locally but in separate JVMs.
039 * </li>
040 * </ul>
041 * <p>
042 * HBaseCluster unifies the way tests interact with the cluster, so that the same test can be run
043 * against a mini-cluster during unit test execution, or a distributed cluster having tens/hundreds
044 * of nodes during execution of integration tests.
045 * <p>
046 * HBaseCluster exposes client-side public interfaces to tests, so that tests does not assume
047 * running in a particular mode. Not all the tests are suitable to be run on an actual cluster, and
048 * some tests will still need to mock stuff and introspect internal state. For those use cases from
049 * unit tests, or if more control is needed, you can use the subclasses directly. In that sense,
050 * this class does not abstract away <strong>every</strong> interface that SingleProcessHBaseCluster
051 * or DistributedHBaseCluster provide.
052 */
053@InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.PHOENIX)
054@InterfaceStability.Evolving
055public abstract class HBaseClusterInterface implements Closeable, Configurable {
056
057  protected Configuration conf;
058
059  /** the status of the cluster before we begin */
060  protected ClusterMetrics initialClusterStatus;
061
062  /**
063   * Construct an HBaseCluster
064   * @param conf Configuration to be used for cluster
065   */
066  public HBaseClusterInterface(Configuration conf) {
067    setConf(conf);
068  }
069
070  @Override
071  public void setConf(Configuration conf) {
072    this.conf = conf;
073  }
074
075  @Override
076  public Configuration getConf() {
077    return conf;
078  }
079
080  /**
081   * Returns a ClusterMetrics for this HBase cluster.
082   * @see #getInitialClusterMetrics()
083   */
084  public abstract ClusterMetrics getClusterMetrics() throws IOException;
085
086  /**
087   * Returns a ClusterStatus for this HBase cluster as observed at the starting of the HBaseCluster
088   */
089  public ClusterMetrics getInitialClusterMetrics() throws IOException {
090    return initialClusterStatus;
091  }
092
093  /**
094   * Starts a new region server on the given hostname or if this is a mini/local cluster, starts a
095   * region server locally.
096   * @param hostname the hostname to start the regionserver on
097   * @throws IOException if something goes wrong
098   */
099  public abstract void startRegionServer(String hostname, int port) throws IOException;
100
101  /**
102   * Kills the region server process if this is a distributed cluster, otherwise this causes the
103   * region server to exit doing basic clean up only.
104   * @throws IOException if something goes wrong
105   */
106  public abstract void killRegionServer(ServerName serverName) throws IOException;
107
108  /**
109   * Keeping track of killed servers and being able to check if a particular server was killed makes
110   * it possible to do fault tolerance testing for dead servers in a deterministic way. A concrete
111   * example of such case is - killing servers and waiting for all regions of a particular table to
112   * be assigned. We can check for server column in META table and that its value is not one of the
113   * killed servers.
114   */
115  public abstract boolean isKilledRS(ServerName serverName);
116
117  /**
118   * Stops the given region server, by attempting a gradual stop.
119   * @throws IOException if something goes wrong
120   */
121  public abstract void stopRegionServer(ServerName serverName) throws IOException;
122
123  /**
124   * Wait for the specified region server to join the cluster
125   * @throws IOException if something goes wrong or timeout occurs
126   */
127  public void waitForRegionServerToStart(String hostname, int port, long timeout)
128    throws IOException {
129    long start = EnvironmentEdgeManager.currentTime();
130    while ((EnvironmentEdgeManager.currentTime() - start) < timeout) {
131      for (ServerName server : getClusterMetrics().getLiveServerMetrics().keySet()) {
132        if (server.getHostname().equals(hostname) && server.getPort() == port) {
133          return;
134        }
135      }
136      Threads.sleep(100);
137    }
138    throw new IOException(
139      "did timeout " + timeout + "ms waiting for region server to start: " + hostname);
140  }
141
142  /**
143   * Wait for the specified region server to stop the thread / process.
144   * @throws IOException if something goes wrong or timeout occurs
145   */
146  public abstract void waitForRegionServerToStop(ServerName serverName, long timeout)
147    throws IOException;
148
149  /**
150   * Suspend the region server
151   * @param serverName the hostname to suspend the regionserver on
152   * @throws IOException if something goes wrong
153   */
154  public abstract void suspendRegionServer(ServerName serverName) throws IOException;
155
156  /**
157   * Resume the region server
158   * @param serverName the hostname to resume the regionserver on
159   * @throws IOException if something goes wrong
160   */
161  public abstract void resumeRegionServer(ServerName serverName) throws IOException;
162
163  /**
164   * Wait for the specified region server to suspend the thread / process.
165   * @throws IOException if something goes wrong or timeout occurs
166   */
167  public abstract void waitForRegionServerToSuspend(ServerName serverName, long timeout)
168    throws IOException;
169
170  /**
171   * Wait for the specified region server to resume the thread / process.
172   * @throws IOException if something goes wrong or timeout occurs
173   */
174  public abstract void waitForRegionServerToResume(ServerName serverName, long timeout)
175    throws IOException;
176
177  /**
178   * Starts a new zookeeper node on the given hostname or if this is a mini/local cluster, silently
179   * logs warning message.
180   * @param hostname the hostname to start the regionserver on
181   * @throws IOException if something goes wrong
182   */
183  public abstract void startZkNode(String hostname, int port) throws IOException;
184
185  /**
186   * Kills the zookeeper node process if this is a distributed cluster, otherwise, this causes
187   * master to exit doing basic clean up only.
188   * @throws IOException if something goes wrong
189   */
190  public abstract void killZkNode(ServerName serverName) throws IOException;
191
192  /**
193   * Stops the region zookeeper if this is a distributed cluster, otherwise silently logs warning
194   * message.
195   * @throws IOException if something goes wrong
196   */
197  public abstract void stopZkNode(ServerName serverName) throws IOException;
198
199  /**
200   * Wait for the specified zookeeper node to join the cluster
201   * @throws IOException if something goes wrong or timeout occurs
202   */
203  public abstract void waitForZkNodeToStart(ServerName serverName, long timeout) throws IOException;
204
205  /**
206   * Wait for the specified zookeeper node to stop the thread / process.
207   * @throws IOException if something goes wrong or timeout occurs
208   */
209  public abstract void waitForZkNodeToStop(ServerName serverName, long timeout) throws IOException;
210
211  /**
212   * Starts a new datanode on the given hostname or if this is a mini/local cluster, silently logs
213   * warning message.
214   * @throws IOException if something goes wrong
215   */
216  public abstract void startDataNode(ServerName serverName) throws IOException;
217
218  /**
219   * Kills the datanode process if this is a distributed cluster, otherwise, this causes master to
220   * exit doing basic clean up only.
221   * @throws IOException if something goes wrong
222   */
223  public abstract void killDataNode(ServerName serverName) throws IOException;
224
225  /**
226   * Stops the datanode if this is a distributed cluster, otherwise silently logs warning message.
227   * @throws IOException if something goes wrong
228   */
229  public abstract void stopDataNode(ServerName serverName) throws IOException;
230
231  /**
232   * Wait for the specified datanode to join the cluster
233   * @throws IOException if something goes wrong or timeout occurs
234   */
235  public abstract void waitForDataNodeToStart(ServerName serverName, long timeout)
236    throws IOException;
237
238  /**
239   * Wait for the specified datanode to stop the thread / process.
240   * @throws IOException if something goes wrong or timeout occurs
241   */
242  public abstract void waitForDataNodeToStop(ServerName serverName, long timeout)
243    throws IOException;
244
245  /**
246   * Starts a new namenode on the given hostname or if this is a mini/local cluster, silently logs
247   * warning message.
248   * @throws IOException if something goes wrong
249   */
250  public abstract void startNameNode(ServerName serverName) throws IOException;
251
252  /**
253   * Kills the namenode process if this is a distributed cluster, otherwise, this causes master to
254   * exit doing basic clean up only.
255   * @throws IOException if something goes wrong
256   */
257  public abstract void killNameNode(ServerName serverName) throws IOException;
258
259  /**
260   * Stops the namenode if this is a distributed cluster, otherwise silently logs warning message.
261   * @throws IOException if something goes wrong
262   */
263  public abstract void stopNameNode(ServerName serverName) throws IOException;
264
265  /**
266   * Wait for the specified namenode to join the cluster
267   * @throws IOException if something goes wrong or timeout occurs
268   */
269  public abstract void waitForNameNodeToStart(ServerName serverName, long timeout)
270    throws IOException;
271
272  /**
273   * Wait for the specified namenode to stop
274   * @throws IOException if something goes wrong or timeout occurs
275   */
276  public abstract void waitForNameNodeToStop(ServerName serverName, long timeout)
277    throws IOException;
278
279  /**
280   * Starts a new journalnode on the given hostname or if this is a mini/local cluster, silently
281   * logs warning message.
282   * @throws IOException if something goes wrong
283   */
284  public abstract void startJournalNode(ServerName serverName) throws IOException;
285
286  /**
287   * Kills the journalnode process if this is a distributed cluster, otherwise, this causes master
288   * to exit doing basic clean up only.
289   * @throws IOException if something goes wrong
290   */
291  public abstract void killJournalNode(ServerName serverName) throws IOException;
292
293  /**
294   * Stops the journalnode if this is a distributed cluster, otherwise silently logs warning
295   * message.
296   * @throws IOException if something goes wrong
297   */
298  public abstract void stopJournalNode(ServerName serverName) throws IOException;
299
300  /**
301   * Wait for the specified journalnode to join the cluster
302   * @throws IOException if something goes wrong or timeout occurs
303   */
304  public abstract void waitForJournalNodeToStart(ServerName serverName, long timeout)
305    throws IOException;
306
307  /**
308   * Wait for the specified journalnode to stop
309   * @throws IOException if something goes wrong or timeout occurs
310   */
311  public abstract void waitForJournalNodeToStop(ServerName serverName, long timeout)
312    throws IOException;
313
314  /**
315   * Starts a new master on the given hostname or if this is a mini/local cluster, starts a master
316   * locally.
317   * @param hostname the hostname to start the master on
318   * @throws IOException if something goes wrong
319   */
320  public abstract void startMaster(String hostname, int port) throws IOException;
321
322  /**
323   * Kills the master process if this is a distributed cluster, otherwise, this causes master to
324   * exit doing basic clean up only.
325   * @throws IOException if something goes wrong
326   */
327  public abstract void killMaster(ServerName serverName) throws IOException;
328
329  /**
330   * Stops the given master, by attempting a gradual stop.
331   * @throws IOException if something goes wrong
332   */
333  public abstract void stopMaster(ServerName serverName) throws IOException;
334
335  /**
336   * Wait for the specified master to stop the thread / process.
337   * @throws IOException if something goes wrong or timeout occurs
338   */
339  public abstract void waitForMasterToStop(ServerName serverName, long timeout) throws IOException;
340
341  /**
342   * Blocks until there is an active master and that master has completed initialization.
343   * @return true if an active master becomes available. false if there are no masters left.
344   * @throws IOException if something goes wrong or timeout occurs
345   */
346  public boolean waitForActiveAndReadyMaster() throws IOException {
347    return waitForActiveAndReadyMaster(Long.MAX_VALUE);
348  }
349
350  /**
351   * Blocks until there is an active master and that master has completed initialization.
352   * @param timeout the timeout limit in ms
353   * @return true if an active master becomes available. false if there are no masters left.
354   */
355  public abstract boolean waitForActiveAndReadyMaster(long timeout) throws IOException;
356
357  /**
358   * Wait for HBase Cluster to shut down.
359   */
360  public abstract void waitUntilShutDown() throws IOException;
361
362  /**
363   * Shut down the HBase cluster
364   */
365  public abstract void shutdown() throws IOException;
366
367  /**
368   * Restores the cluster to it's initial state if this is a real cluster, otherwise does nothing.
369   * This is a best effort restore. If the servers are not reachable, or insufficient permissions,
370   * etc. restoration might be partial.
371   * @return whether restoration is complete
372   */
373  public boolean restoreInitialStatus() throws IOException {
374    return restoreClusterMetrics(getInitialClusterMetrics());
375  }
376
377  /**
378   * Restores the cluster to given state if this is a real cluster, otherwise does nothing. This is
379   * a best effort restore. If the servers are not reachable, or insufficient permissions, etc.
380   * restoration might be partial.
381   * @return whether restoration is complete
382   */
383  public boolean restoreClusterMetrics(ClusterMetrics desiredStatus) throws IOException {
384    return true;
385  }
386
387  /**
388   * Get the ServerName of region server serving the first hbase:meta region
389   */
390  public ServerName getServerHoldingMeta() throws IOException {
391    return getServerHoldingRegion(TableName.META_TABLE_NAME,
392      RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName());
393  }
394
395  /**
396   * Get the ServerName of region server serving the specified region
397   * @param regionName Name of the region in bytes
398   * @param tn         Table name that has the region.
399   * @return ServerName that hosts the region or null
400   */
401  public abstract ServerName getServerHoldingRegion(final TableName tn, byte[] regionName)
402    throws IOException;
403
404  /**
405   * @return whether we are interacting with a distributed cluster as opposed to an in-process
406   *         mini/local cluster.
407   */
408  public boolean isDistributedCluster() {
409    return false;
410  }
411
412  /**
413   * Closes all the resources held open for this cluster. Note that this call does not shutdown the
414   * cluster.
415   * @see #shutdown()
416   */
417  @Override
418  public abstract void close() throws IOException;
419
420  /**
421   * Wait for the namenode.
422   */
423  public void waitForNamenodeAvailable() throws InterruptedException {
424  }
425
426  public void waitForDatanodesRegistered(int nbDN) throws Exception {
427  }
428}